Compare commits

..

35 Commits

Author SHA1 Message Date
Maximilian Gaul
b024adaf5b Re-measure for 2000 time steps 2022-02-05 14:13:36 +01:00
Maximilian Gaul
696e6da01d Implement Neighbour list AoS memory layout + performance measurement 2022-01-31 20:27:59 +01:00
Maximilian Gaul
b2a6574426 Remove unnecessary atom force backcopy in computeForce 2022-01-24 18:09:27 +01:00
Maximilian Gaul
c4080e866e Make integrate kernels aware of neighbour list update 2022-01-24 18:04:50 +01:00
Maximilian Gaul
7b592b5fc7 Moved presentation resources to second presentation 2022-01-05 12:48:37 +01:00
Maximilian Gaul
4690542db5 Added CPU metrics {Cache, FLOPS, L2, L3}, restructured resource folders 2022-01-05 12:31:47 +01:00
Maximilian Gaul
8c131a7699 Reminder for likwid perf measurements 2022-01-04 13:51:53 +01:00
Maximilian Gaul
dc4d5f1a9c Porting atom velocity memory layout to AoS, porting velocity integration to CUDA, adding measurements + logbook update 2022-01-01 18:18:12 +01:00
Maximilian Gaul
50007216ed Implemented atom force AoS memory layout, added performance measurements + logbook Update 2022-01-01 16:09:21 +01:00
Maximilian Gaul
72e4599acc Copy neighbour lists only when reneighbouring happens, added measurements + logbook update 2022-01-01 12:56:42 +01:00
Maximilian Gaul
8fa03733e9 Copy parameters & cutforces threshold only once at the start + measurements 2021-12-28 16:48:26 +01:00
Maximilian Gaul
bf1ae3d013 Removed debug prints, only zero atom forces and not copy them, added measurements 2021-12-28 16:32:54 +01:00
Maximilian Gaul
8009b54113 Trying to debug segfault if cudaMemcpy is limited to neighbour list update 2021-12-25 15:36:08 +01:00
Maximilian Gaul
0ea0587442 Only malloc once at the beginning plus measurement csv 2021-12-25 13:52:33 +01:00
Maximilian Gaul
134e3f4b78 Also pinnend neighbor-struct memory, added additional performance measurements, added nvprof result to logbook 2021-12-18 15:58:56 +01:00
Maximilian Gaul
c2bfa3ca3f Add scripts for perf measurement, made atom-memory allocation pinnend using 'cudaMallocHost', added measurements for atom pinnend memory 2021-12-18 13:02:04 +01:00
Maximilian Gaul
2a099da5b7 Started cuda profiling, added first result to logbook 2021-12-03 08:13:43 +01:00
Maximilian Gaul
7691b23d67 Measure memory transfer of CPU to GPU, add explanation how to distribute calculation among multiple GPUs 2021-12-01 17:16:32 +01:00
Maximilian Gaul
da90466f98 Added first performance measurements with threads per block from 1 to 32 2021-11-25 08:09:20 +01:00
Maximilian Gaul
8f723c1299 Added command line description of MD-Bench, added memory transfer rate from CPU to GPU to force.cu 2021-11-23 15:55:23 +01:00
Maximilian Gaul
0586ef150a Fix num of threads instead of num of blocks, add logbook template 2021-11-15 19:39:09 +01:00
Maximilian Gaul
2e5d973f7d Rough rewrite to execute outer loop of force calculation in parallel, not inner loop 2021-11-14 10:02:23 +01:00
Maximilian Gaul
e2fd1a0476 Fixed bug, results are now equal to master branch (but still slow) 2021-11-11 21:00:30 +01:00
Maximilian Gaul
4105c844c6 Runs fine (but slow), results seem to be slightly off from original 2021-11-11 20:47:06 +01:00
Maximilian Gaul
1f5c9c4b23 Fixed segfault error, added more cudaErrorChecks, added cudaFree to avoid memory leak 2021-11-11 20:29:14 +01:00
Maximilian Gaul
29e115464b Fixed cudaMemcpy for AOS data layout, added debug outputs, added cudaErrorChecks 2021-11-11 20:14:30 +01:00
Maximilian Gaul
1a54314c8b First run but segfault at the moment after a few seconds 2021-11-11 15:23:46 +01:00
Maximilian Gaul
280f595b7f Fixed linker error by putting includes and cuda function in extern 'C' 2021-11-11 14:49:29 +01:00
Maximilian Gaul
3428974730 getTimeStamp() couldn't get linked 2021-11-11 08:03:56 +01:00
Maximilian Gaul
b54842f764 Added Makefile instructions for .cu files 2021-11-11 07:27:12 +01:00
Maximilian Gaul
9730164e6f Rename force.c to force.cu because of cuda build errors 2021-11-10 16:20:04 +01:00
Maximilian Gaul
0f5fdd3708 Sum results after cuda function executed 2021-11-10 16:02:05 +01:00
Maximilian Gaul
3f7fb7f22a cudaMemcpy of Atom and other properties, first draft implementation of CUDA kernel 2021-11-09 16:40:25 +01:00
Maximilian Gaul
bfa6c581c3 Copy necessary values for force calculation into cuda memory 2021-11-09 08:37:37 +01:00
Maximilian Gaul
fd886e77eb Added make config for NVCC 2021-11-08 20:32:12 +01:00
133 changed files with 5301 additions and 109676 deletions

View File

@ -1,176 +0,0 @@
---
Language: Cpp
# BasedOnStyle: WebKit
AccessModifierOffset: -4
AlignAfterOpenBracket: DontAlign
AlignArrayOfStructures: None
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveBitFields: None
AlignConsecutiveDeclarations: None
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Right
AlignOperands: Align
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortEnumsOnASingleLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: OnlyFirstIf
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
AttributeMacros:
- __capability
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: true
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeComma
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 90
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DeriveLineEnding: true
DerivePointerAlignment: false
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
BasedOnStyle: ''
ConstructorInitializerAllOnOneLineOrOnePerLine: false
AllowAllConstructorInitializersOnNextLine: true
FixNamespaceComments: false
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IfMacros:
- KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
Priority: 3
SortPriority: 0
CaseSensitive: false
- Regex: '.*'
Priority: 1
SortPriority: 0
CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseLabels: false
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentExternBlock: AfterExternBlock
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertTrailingCommas: None
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: true
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 200
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PenaltyIndentedWhitespace: 0
PointerAlignment: Left
PPIndentWidth: -1
ReferenceAlignment: Pointer
ReflowComments: true
ShortNamespaceLines: 1
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceAroundPointerQualifiers: Default
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
BitFieldColonSpacing: Both
Standard: Latest
StatementAttributeLikeMacros:
- Q_EMIT
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
WhitespaceSensitiveMacros:
- STRINGIZE
- PP_STRINGIZE
- BOOST_PP_STRINGIZE
- NS_SWIFT_NAME
- CF_SWIFT_NAME
...

View File

@ -1,14 +0,0 @@
---
Checks: 'clang-diagnostic-*,clang-analyzer-*,clang-bugprone-*,readability-identifier-naming'
WarningsAsErrors: true
HeaderFilterRegex: '.*'
AnalyzeTemporaryDtors: false
CheckOptions:
- key: readability-identifier-naming.StructCase
value: 'CamelCase'
- key: readability-identifier-naming.FunctionCase
value: 'camelBack'
- key: readability-identifier-naming.VariableCase
value: 'camelBack'
- key: readability-identifier-naming.GlobalConstantCase
value: 'UPPER_CASE'

View File

@ -1,3 +0,0 @@
CompileFlags:
Add: [-I/Users/jan/prg/MD-Bench/src/verletlist/, -I/Users/jan/prg/MD-Bench/src/common/, -DALIGNMENT=64]
Compiler: clang

19
.gitignore vendored
View File

@ -27,6 +27,7 @@
*.so
*.so.*
*.dylib
.DS_Store
# Executables
*.exe
@ -51,17 +52,9 @@ Module.symvers
Mkfile.old
dkms.conf
# Logs
*.log
# TODO list
todo.txt
# Build directories and executables
#GCC-*/
#ICC-*/
#ICX-*/
#CLANG-*/
#NVCC-*/
build-*/
MDBench-*
.vscode/
GCC/
ICC/
MDBench-GCC*
MDBench-ICC*

3
.gitmodules vendored
View File

@ -1,3 +0,0 @@
[submodule "gather-bench"]
path = gather-bench
url = https://github.com/RRZE-HPC/gather-bench

View File

@ -1,32 +1,70 @@
#CONFIGURE BUILD SYSTEM
TAG = $(OPT_TAG)-$(TOOLCHAIN)-$(DATA_TYPE)
TARGET = MDBench-$(TAG)
BUILD_DIR = ./build/build-$(TAG)
SRC_ROOT = ./src
SRC_DIR = $(SRC_ROOT)/$(OPT_SCHEME)
COMMON_DIR = $(SRC_ROOT)/common
CUDA_DIR = $(SRC_DIR)/cuda
MAKE_DIR = ./make
BUILD_DIR = ./$(TAG)
SRC_DIR = ./src
ASM_DIR = ./asm
MAKE_DIR = ./
Q ?= @
#DO NOT EDIT BELOW
include config.mk
include $(MAKE_DIR)/include_$(TOOLCHAIN).mk
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
ifneq ($(strip $(ISA)),NONE)
include $(MAKE_DIR)/include_ISA.mk
endif
INCLUDES += -I./$(SRC_DIR) -I./$(COMMON_DIR)
INCLUDES += -I./src/includes
VPATH = $(SRC_DIR) $(COMMON_DIR) $(CUDA_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OBJ = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(COMMON_DIR)/*.c))
ifeq ($(strip $(TAG)),NVCC)
OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
DEFINES += -DPRECISION=2
endif
ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif
ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
ifeq ($(strip $(MEM_TRACER)),true)
DEFINES += -DMEM_TRACER
endif
ifeq ($(strip $(INDEX_TRACER)),true)
DEFINES += -DINDEX_TRACER
endif
ifeq ($(strip $(COMPUTE_STATS)),true)
DEFINES += -DCOMPUTE_STATS
endif
ifneq ($(VECTOR_WIDTH),)
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
endif
VPATH = $(SRC_DIR) $(ASM_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
# $(warning $(OBJ))
ifneq ($(VARIANT),)
.DEFAULT_GOAL := ${TARGET}-$(VARIANT)
DEFINES += -DVARIANT=$(VARIANT)
@ -45,11 +83,6 @@ $(BUILD_DIR)/%.o: %.c
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%-cuda.o: %.cu
$(info ===> COMPILE $@)
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.s: %.c
$(info ===> GENERATE ASM $@)
$(Q)$(CC) -S $(ASFLAGS) $(CPPFLAGS) $(CFLAGS) $< -o $@
@ -58,21 +91,21 @@ $(BUILD_DIR)/%.o: %.s
$(info ===> ASSEMBLE $@)
$(Q)$(AS) $< -o $@
$(BUILD_DIR)/%.o: %.cu
$(info ===> COMPILE $@)
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
.PHONY: clean distclean tags info asm
clean:
$(info ===> CLEAN)
@rm -rf $(BUILD_DIR)
cleanall:
$(info ===> CLEAN)
@rm -rf build
@rm -rf MDBench-*
@rm -f tags
distclean: clean
$(info ===> DIST CLEAN)
@rm -f $(TARGET)
@rm -f $(TARGET)*
@rm -f tags
info:
@ -86,6 +119,6 @@ tags:
$(Q)ctags -R
$(BUILD_DIR):
@mkdir -p $(BUILD_DIR)
@mkdir $(BUILD_DIR)
-include $(OBJ:.o=.d)

View File

@ -1,95 +1,27 @@
# MD-Bench
MD-Bench is a toolbox for the performance engineering of short-range force
calculation kernels on molecular-dynamics applications. It aims at covering all
available state-of-the-art algorithms from different community codes such as
LAMMPS and GROMACS.
A simple, sequential C implementation of the [Mantevo miniMD](https://github.com/Mantevo/miniMD) benchmark in less than 1000 LOC.
## Build instructions
## Build
Configure your build by editing the `config.mk` file. The following
options are available:
1. Open `config.mk` and edit the `TAG` value according to the tool chain used. Currently supported are GCC, CLANG (LLVM), and ICC (Intel).
2. Change `DATA_LAYOUT` and `DATA_TYPE` if desired in config.mk.
3. Open and adapt the compiler flags in `include_<TOOLCHAIN>.mk`, e.g. in `include_ICC.mk` for the Intel tool chain.
4. Build the binary by calling `make`.
- **TAG:** Compiler tag (available options: GCC, CLANG, ICC, ONEAPI, NVCC).
- **ISA:** Instruction set (available options: SSE, AVX, AVX\_FMA, AVX2, AVX512).
- **MASK\_REGISTERS:** Use AVX512 mask registers (always true when ISA is set to AVX512).
- **OPT\_SCHEME:** Optimization algorithm (available options: lammps, gromacs).
- **ENABLE\_LIKWID:** Enable likwid to make use of HPM counters.
- **DATA\_TYPE:** Floating-point precision (available options: SP, DP).
- **DATA\_LAYOUT:** Data layout for atom vector properties (available options: AOS, SOA).
- **ASM\_SYNTAX:** Assembly syntax to use when generating assembly files (available options: ATT, INTEL).
- **DEBUG:** Toggle debug mode.
- **EXPLICIT\_TYPES:** Explicitly store and load atom types.
- **MEM\_TRACER:** Trace memory addresses for cache simulator.
- **INDEX\_TRACER:** Trace indexes and distances for gather-md.
- **COMPUTE\_STATS:** Compute statistics.
Configurations for LAMMPS Verlet Lists optimization scheme:
- **ENABLE\_OMP\_SIMD:** Use omp simd pragma on half neighbor-lists kernels.
- **USE\_SIMD\_KERNEL:** Compile kernel with explicit SIMD intrinsics.
Configurations for GROMACS MxN optimization scheme:
- **USE\_REFERENCE\_VERSION:** Use reference version (only for correctness-checking purposes).
- **XTC\_OUTPUT:** Enable XTC output.
- **HALF\_NEIGHBOR\_LISTS\_CHECK\_CJ:** Check if j-clusters are local when decreasing the reaction force.
Configurations for CUDA:
- **USE\_CUDA\_HOST\_MEMORY:** Use CUDA host memory to optimize host-device transfers.
When done, just use `make` to compile the code.
You can clean intermediate build results with `make clean`, and all build results with `make distclean`.
You have to call `make clean` before `make` if you changed the build settings.
## Usage
## Configuration
Use the following command to run a simulation:
Currently all settings apart from the options described below are hard-coded in `main.c`.
```bash
./MDBench-<TAG>-<OPT_SCHEME> [OPTION]...
## Run the benchmark
Without any options, 200 time steps with a system size of 32x32x32 are used.
The default can be changed using the following options:
```
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same
name. Without any options, a Copper FCC lattice system with size 32x32x32
(131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0,
epsilon=1.0) is simulated.
The default behavior and other options can be changed using the following parameters:
```sh
-p <string>: file to read parameters from (can be specified more than once)
-f <string>: force field (lj or eam), default lj
-i <string>: input file with atom positions (dump)
-e <string>: input file for EAM
-n / --nsteps <int>: set number of timesteps for simulation
-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction
-r / --radius <real>: set cutoff radius
-s / --skin <real>: set skin (verlet buffer)
--freq <real>: processor frequency (GHz)
--vtk <string>: VTK file for visualization
--xtc <string>: XTC file for visualization
```
## Examples
TBD
## Citations
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard
Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular
dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th
International Conference on Parallel Processing and Applied Mathematics, Gdansk,
Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint:
[arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
## Credits
MD-Bench is developed by the Erlangen National High Performance Computing Center
([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
## License
[LGPL-3.0](https://github.com/RRZE-HPC/MD-Bench/blob/master/LICENSE)

0
asm/.gitkeep Normal file
View File

View File

@ -0,0 +1,626 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
# mark_description "ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# parameter 5: %r8d
# parameter 6: %r9d
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#121.112
pushq %rbp #121.112
.cfi_def_cfa_offset 16
movq %rsp, %rbp #121.112
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #121.112
pushq %r12 #121.112
pushq %r13 #121.112
pushq %r14 #121.112
pushq %r15 #121.112
pushq %rbx #121.112
subq $88, %rsp #121.112
xorl %eax, %eax #124.16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
movq %rdx, %r15 #121.112
movq %rsi, %r12 #121.112
movq %rdi, %rbx #121.112
..___tag_value_computeForce.11:
# getTimeStamp()
call getTimeStamp #124.16
..___tag_value_computeForce.12:
# LOE rbx r12 r15 xmm0
..B1.51: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 24(%rsp) #124.16[spill]
# LOE rbx r12 r15
..B1.2: # Preds ..B1.51
# Execution count [1.00e+00]
movl 4(%r12), %r13d #125.18
movq 64(%r12), %r9 #127.20
movq 72(%r12), %r14 #127.45
movq 80(%r12), %r8 #127.70
vmovsd 72(%rbx), %xmm2 #129.27
vmovsd 8(%rbx), %xmm1 #130.23
vmovsd (%rbx), %xmm0 #131.24
testl %r13d, %r13d #134.24
jle ..B1.43 # Prob 50% #134.24
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
xorl %ebx, %ebx #134.5
movl %r13d, %edx #134.5
xorl %ecx, %ecx #134.5
movl $1, %esi #134.5
xorl %eax, %eax #135.17
shrl $1, %edx #134.5
je ..B1.7 # Prob 9% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rcx,%r9) #135.9
incq %rbx #134.5
movq %rax, (%rcx,%r14) #136.9
movq %rax, (%rcx,%r8) #137.9
movq %rax, 8(%rcx,%r9) #135.9
movq %rax, 8(%rcx,%r14) #136.9
movq %rax, 8(%rcx,%r8) #137.9
addq $16, %rcx #134.5
cmpq %rdx, %rbx #134.5
jb ..B1.5 # Prob 63% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rbx,%rbx), %esi #135.9
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%rsi), %edx #134.5
cmpl %r13d, %edx #134.5
jae ..B1.9 # Prob 9% #134.5
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
movslq %esi, %rsi #134.5
movq %rax, -8(%r9,%rsi,8) #135.9
movq %rax, -8(%r14,%rsi,8) #136.9
movq %rax, -8(%r8,%rsi,8) #137.9
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [5.00e-01]
movl $.L_2__STRING.0, %edi #141.5
movq %r8, 32(%rsp) #141.5[spill]
movq %r9, 80(%rsp) #141.5[spill]
vmovsd %xmm2, (%rsp) #141.5[spill]
vmovsd %xmm1, 8(%rsp) #141.5[spill]
vmovsd %xmm0, 16(%rsp) #141.5[spill]
..___tag_value_computeForce.18:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.19:
# LOE r12 r14 r15 r13d
..B1.10: # Preds ..B1.9
# Execution count [9.00e-01]
vmovsd 16(%rsp), %xmm0 #[spill]
xorl %esi, %esi #143.15
vmovsd (%rsp), %xmm2 #[spill]
xorl %eax, %eax #143.5
vmulsd %xmm2, %xmm2, %xmm13 #129.45
xorl %edi, %edi #143.5
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
vmovsd 8(%rsp), %xmm1 #[spill]
vbroadcastsd %xmm13, %zmm14 #129.25
vbroadcastsd %xmm1, %zmm13 #130.21
vbroadcastsd %xmm0, %zmm9 #197.45
movslq %r13d, %r13 #143.5
movq 24(%r15), %r10 #145.25
movslq 16(%r15), %rdx #144.43
movq 8(%r15), %rcx #144.19
movq 32(%rsp), %r8 #[spill]
movq 16(%r12), %rbx #146.25
shlq $2, %rdx #126.5
movq %r13, 64(%rsp) #143.5[spill]
movq %r10, 72(%rsp) #143.5[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.41 ..B1.10
# Execution count [5.00e+00]
movq 72(%rsp), %r9 #145.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #149.22
vmovapd %xmm24, %xmm18 #150.22
movl (%r9,%rax,4), %r10d #145.25
vmovapd %xmm18, %xmm4 #151.22
vmovsd (%rdi,%rbx), %xmm10 #146.25
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
testl %r10d, %r10d #173.32
jle ..B1.41 # Prob 50% #173.32
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
vpxord %zmm8, %zmm8, %zmm8 #149.22
vmovaps %zmm8, %zmm7 #150.22
vmovaps %zmm7, %zmm11 #151.22
cmpl $8, %r10d #173.13
jl ..B1.48 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
cmpl $1200, %r10d #173.13
jl ..B1.47 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [4.50e+00]
movq %rdx, %r15 #144.43
imulq %rsi, %r15 #144.43
addq %rcx, %r15 #126.5
movq %r15, %r11 #173.13
andq $63, %r11 #173.13
testl $3, %r11d #173.13
je ..B1.16 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.14
# Execution count [2.25e+00]
xorl %r11d, %r11d #173.13
jmp ..B1.18 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.14
# Execution count [2.25e+00]
testl %r11d, %r11d #173.13
je ..B1.18 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.16
# Execution count [2.50e+01]
negl %r11d #173.13
addl $64, %r11d #173.13
shrl $2, %r11d #173.13
cmpl %r11d, %r10d #173.13
cmovl %r10d, %r11d #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
# Execution count [5.00e+00]
movl %r10d, %r13d #173.13
subl %r11d, %r13d #173.13
andl $7, %r13d #173.13
negl %r13d #173.13
addl %r10d, %r13d #173.13
cmpl $1, %r11d #173.13
jb ..B1.26 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.18
# Execution count [4.50e+00]
vmovdqa %ymm15, %ymm4 #173.13
xorl %r12d, %r12d #173.13
vpbroadcastd %r11d, %ymm3 #173.13
vbroadcastsd %xmm10, %zmm2 #146.23
vbroadcastsd %xmm6, %zmm1 #147.23
vbroadcastsd %xmm12, %zmm0 #148.23
movslq %r11d, %r9 #173.13
movq %r8, 32(%rsp) #173.13[spill]
movq %r14, (%rsp) #173.13[spill]
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.20: # Preds ..B1.24 ..B1.19
# Execution count [2.50e+01]
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
kmovw %k3, %r14d #173.13
vpaddd %ymm17, %ymm17, %ymm18 #175.40
vpaddd %ymm18, %ymm17, %ymm17 #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.23: # Preds ..B1.20
# Execution count [1.25e+01]
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vpxord %zmm19, %zmm19, %zmm19 #175.40
vpxord %zmm20, %zmm20, %zmm20 #175.40
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.24: # Preds ..B1.23
# Execution count [2.50e+01]
addq $8, %r12 #173.13
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
#vrcp14pd %zmm25, %zmm24 #195.42
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
#vfpclasspd $30, %zmm24, %k0 #195.42
#kmovw %k2, %r8d #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm25, %zmm17 #195.42
#andl %r8d, %r14d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
#kmovw %r14d, %k3 #198.21
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
cmpq %r9, %r12 #173.13
jb ..B1.20 # Prob 82% #173.13
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24
# Execution count [4.50e+00]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
cmpl %r11d, %r10d #173.13
je ..B1.40 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
# Execution count [2.50e+01]
lea 8(%r11), %r9d #173.13
cmpl %r9d, %r13d #173.13
jl ..B1.34 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.26
# Execution count [4.50e+00]
movq %rdx, %r12 #144.43
imulq %rsi, %r12 #144.43
vbroadcastsd %xmm10, %zmm1 #146.23
vbroadcastsd %xmm6, %zmm0 #147.23
vbroadcastsd %xmm12, %zmm2 #148.23
movslq %r11d, %r9 #173.13
addq %rcx, %r12 #126.5
movq %rdi, 8(%rsp) #126.5[spill]
movq %rdx, 16(%rsp) #126.5[spill]
movq %rcx, 40(%rsp) #126.5[spill]
movq %rax, 48(%rsp) #126.5[spill]
movq %rsi, 56(%rsp) #126.5[spill]
movq %r8, 32(%rsp) #126.5[spill]
movq %r14, (%rsp) #126.5[spill]
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.28: # Preds ..B1.32 ..B1.27
# Execution count [2.50e+01]
vmovdqu (%r12,%r9,4), %ymm3 #174.25
vpaddd %ymm3, %ymm3, %ymm4 #175.40
vpaddd %ymm4, %ymm3, %ymm3 #175.40
movl (%r12,%r9,4), %r14d #174.25
movl 4(%r12,%r9,4), %r8d #174.25
movl 8(%r12,%r9,4), %edi #174.25
movl 12(%r12,%r9,4), %esi #174.25
lea (%r14,%r14,2), %r14d #175.40
movl 16(%r12,%r9,4), %ecx #174.25
lea (%r8,%r8,2), %r8d #175.40
movl 20(%r12,%r9,4), %edx #174.25
lea (%rdi,%rdi,2), %edi #175.40
movl 24(%r12,%r9,4), %eax #174.25
lea (%rsi,%rsi,2), %esi #175.40
movl 28(%r12,%r9,4), %r15d #174.25
lea (%rcx,%rcx,2), %ecx #175.40
lea (%rdx,%rdx,2), %edx #175.40
lea (%rax,%rax,2), %eax #175.40
lea (%r15,%r15,2), %r15d #175.40
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.31: # Preds ..B1.28
# Execution count [1.25e+01]
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
vpxord %zmm4, %zmm4, %zmm4 #175.40
vpxord %zmm17, %zmm17, %zmm17 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.32: # Preds ..B1.31
# Execution count [2.50e+01]
addl $8, %r11d #173.13
addq $8, %r9 #173.13
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
#vrcp14pd %zmm3, %zmm22 #195.42
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
#vfpclasspd $30, %zmm22, %k0 #195.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
#knotw %k0, %k1 #195.42
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
cmpl %r13d, %r11d #173.13
jb ..B1.28 # Prob 82% #173.13
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32
# Execution count [4.50e+00]
movq 8(%rsp), %rdi #[spill]
movq 16(%rsp), %rdx #[spill]
movq 40(%rsp), %rcx #[spill]
movq 48(%rsp), %rax #[spill]
movq 56(%rsp), %rsi #[spill]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
# Execution count [5.00e+00]
lea 1(%r13), %r9d #173.13
cmpl %r10d, %r9d #173.13
ja ..B1.40 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.35: # Preds ..B1.34
# Execution count [2.50e+01]
imulq %rdx, %rsi #144.43
vbroadcastsd %xmm10, %zmm4 #146.23
subl %r13d, %r10d #173.13
addq %rcx, %rsi #126.5
vpbroadcastd %r10d, %ymm0 #173.13
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
movslq %r13d, %r13 #173.13
kmovw %k3, %r9d #173.13
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
vpaddd %ymm1, %ymm1, %ymm2 #175.40
vpaddd %ymm2, %ymm1, %ymm0 #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.38: # Preds ..B1.35
# Execution count [1.25e+01]
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm1, %zmm1, %zmm1 #175.40
vpxord %zmm2, %zmm2, %zmm2 #175.40
vpxord %zmm3, %zmm3, %zmm3 #175.40
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.38
# Execution count [2.50e+01]
#vbroadcastsd %xmm6, %zmm6 #147.23
#vbroadcastsd %xmm12, %zmm12 #148.23
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
#vrcp14pd %zmm19, %zmm18 #195.42
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
#vfpclasspd $30, %zmm18, %k0 #195.42
#kmovw %k2, %esi #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm19, %zmm0 #195.42
#andl %esi, %r9d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
#kmovw %r9d, %k3 #198.21
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
# Execution count [4.50e+00]
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
vpermd %zmm11, %zmm19, %zmm0 #151.22
vpermd %zmm7, %zmm19, %zmm6 #150.22
vpermd %zmm8, %zmm19, %zmm20 #149.22
vaddpd %zmm11, %zmm0, %zmm11 #151.22
vaddpd %zmm7, %zmm6, %zmm7 #150.22
vaddpd %zmm8, %zmm20, %zmm8 #149.22
vpermpd $78, %zmm11, %zmm1 #151.22
vpermpd $78, %zmm7, %zmm10 #150.22
vpermpd $78, %zmm8, %zmm21 #149.22
vaddpd %zmm1, %zmm11, %zmm2 #151.22
vaddpd %zmm10, %zmm7, %zmm12 #150.22
vaddpd %zmm21, %zmm8, %zmm22 #149.22
vpermpd $177, %zmm2, %zmm3 #151.22
vpermpd $177, %zmm12, %zmm17 #150.22
vpermpd $177, %zmm22, %zmm23 #149.22
vaddpd %zmm3, %zmm2, %zmm4 #151.22
vaddpd %zmm17, %zmm12, %zmm18 #150.22
vaddpd %zmm23, %zmm22, %zmm24 #149.22
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40 ..B1.11
# Execution count [5.00e+00]
movq 80(%rsp), %rsi #208.9[spill]
addq $24, %rdi #143.5
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
vmovsd %xmm0, (%rsi,%rax,8) #208.9
movslq %eax, %rsi #143.32
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
vmovsd %xmm1, (%r14,%rax,8) #209.9
incq %rsi #143.32
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
vmovsd %xmm2, (%r8,%rax,8) #210.9
incq %rax #143.5
cmpq 64(%rsp), %rax #143.5[spill]
jb ..B1.11 # Prob 82% #143.5
jmp ..B1.44 # Prob 100% #143.5
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.43: # Preds ..B1.2
# Execution count [5.00e-01]
movl $.L_2__STRING.0, %edi #141.5
..___tag_value_computeForce.48:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.49:
# LOE
..B1.44: # Preds ..B1.41 ..B1.43
# Execution count [1.00e+00]
movl $.L_2__STRING.0, %edi #219.5
vzeroupper #219.5
..___tag_value_computeForce.50:
# likwid_markerStopRegion(const char *)
call likwid_markerStopRegion #219.5
..___tag_value_computeForce.51:
# LOE
..B1.45: # Preds ..B1.44
# Execution count [1.00e+00]
xorl %eax, %eax #221.16
..___tag_value_computeForce.52:
# getTimeStamp()
call getTimeStamp #221.16
..___tag_value_computeForce.53:
# LOE xmm0
..B1.46: # Preds ..B1.45
# Execution count [1.00e+00]
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
addq $88, %rsp #224.14
.cfi_restore 3
popq %rbx #224.14
.cfi_restore 15
popq %r15 #224.14
.cfi_restore 14
popq %r14 #224.14
.cfi_restore 13
popq %r13 #224.14
.cfi_restore 12
popq %r12 #224.14
movq %rbp, %rsp #224.14
popq %rbp #224.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #224.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# LOE
..B1.47: # Preds ..B1.13
# Execution count [4.50e-01]: Infreq
movl %r10d, %r13d #173.13
xorl %r11d, %r11d #173.13
andl $-8, %r13d #173.13
jmp ..B1.26 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.48: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
xorl %r13d, %r13d #173.13
jmp ..B1.34 # Prob 100% #173.13
.align 16,0x90
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constant pool emitted by the compiler for this translation unit.
# Doubles are written as two 32-bit words, low word first.
.section .rodata, "a"
.align 64
.align 64
# Eight doubles, each 0x3ff0000000000000 = 1.0.
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# Eight doubles, each 0x3fe0000000000000 = 0.5.
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 64-byte table of byte patterns; its consumer is not in the visible part of the listing.
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
# Eight 64-bit integers: 0, 4, 8, 12, 1, 5, 9, 13.
# NOTE(review): presumably a permutation index vector - confirm against its user.
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
# Eight 64-bit integers: 1, 5, 9, 13, 0, 4, 8, 12.
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
# Eight 64-bit integers: 2, 6, 10, 14, 2, 6, 10, 14.
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
# Sixteen 32-bit indices 8..15, 8..15. Loaded as the vpermd index vector in the
# horizontal reduction (block ..B1.40), where it copies the upper 256 bits of an
# accumulator into both halves.
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
# Eight 32-bit integers, each 8.
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Eight 32-bit integers 0..7.
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# One double: 0x4048000000000000 = 48.0.
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# One double: 0x3ff0000000000000 = 1.0. Only referenced by commented-out
# vfnmadd213pd lines in the visible part of the listing.
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
# NUL-terminated string "force": 1668444006 = 0x63726f66 is "forc" in
# little-endian byte order, and the 16-bit word 101 is 'e' followed by 0.
# Its address is passed to likwid_markerStartRegion / likwid_markerStopRegion.
.L_2__STRING.0:
.long 1668444006
.word 101
.type .L_2__STRING.0,@object
.size .L_2__STRING.0,6
.data
.section .note.GNU-stack, ""
# End

585
asm/unused/force-mem-only.s Normal file
View File

@ -0,0 +1,585 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
#
# Memory-only variant of the force kernel: the neighbor-index loads and the
# position gathers are kept, while the floating-point force arithmetic and the
# final force accumulation stores are commented out further down.
# Returns in %xmm0 the difference between two getTimeStamp() results taken at
# entry and exit.
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# No enabled instruction in this prologue reads %ecx (parameter 4).
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#103.87
# Standard frame setup, then round %rsp down to a 64-byte boundary.
pushq %rbp #103.87
.cfi_def_cfa_offset 16
movq %rsp, %rbp #103.87
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #103.87
# Save the callee-saved registers that will hold the three pointer arguments.
pushq %r12 #103.87
pushq %r13 #103.87
pushq %r14 #103.87
# Reserve 104 bytes of spill space.
subq $104, %rsp #103.87
# %eax = 0 before the call.
# NOTE(review): presumably the SysV vector-register count for an unprototyped call - confirm.
xorl %eax, %eax #106.16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
# Keep the pointer arguments across the call: %r14 = parameter 3,
# %r13 = parameter 2, %r12 = parameter 1.
movq %rdx, %r14 #103.87
movq %rsi, %r13 #103.87
movq %rdi, %r12 #103.87
..___tag_value_computeForce.9:
# getTimeStamp()
call getTimeStamp #106.16
..___tag_value_computeForce.10:
# LOE rbx r12 r13 r14 r15 xmm0
..B1.48: # Preds ..B1.1
# Execution count [1.00e+00]
# Spill the start timestamp; it is subtracted from the end timestamp before returning.
vmovsd %xmm0, 16(%rsp) #106.16[spill]
# LOE rbx r12 r13 r14 r15
# Load the loop inputs and zero three per-atom double arrays.
# NOTE(review): the hand-annotated asm/unused/force.s names these fields
# atom->Nlocal (offset 4), atom->fx/fy/fz (64/72/80), param->cutforce (72),
# param->sigma6 (8) and param->epsilon (0) - confirm the struct layout is the
# same for this build.
..B1.2: # Preds ..B1.48
# Execution count [1.00e+00]
# %ecx = 32-bit element count read from parameter 2 + 4.
movl 4(%r13), %ecx #107.18
# %r11, %r10, %r9 = base pointers of the three arrays to be zeroed.
movq 64(%r13), %r11 #109.20
movq 72(%r13), %r10 #109.45
movq 80(%r13), %r9 #109.70
# Three scalar doubles from parameter 1.
vmovsd 72(%r12), %xmm2 #111.27
vmovsd 8(%r12), %xmm1 #112.23
vmovsd (%r12), %xmm0 #113.24
# Nothing to do when the count is <= 0: jump straight to the end timestamp.
testl %ecx, %ecx #116.24
jle ..B1.42 # Prob 50% #116.24
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
# Set up the zeroing loop, unrolled by two:
#   %rdi = pair counter, %rdx = count >> 1 (number of pairs),
#   %rsi = byte offset, %rax = 0 (the value stored),
#   %r8d = 1 (1-based index of the first element not yet zeroed).
xorl %edi, %edi #116.5
movl %ecx, %edx #116.5
xorl %esi, %esi #116.5
movl $1, %r8d #116.5
xorl %eax, %eax #117.17
shrl $1, %edx #116.5
# With fewer than two elements there are no pairs; go to the tail handling.
je ..B1.7 # Prob 9% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
# Zero two consecutive elements of each of the three arrays per pass.
movq %rax, (%rsi,%r11) #117.9
incq %rdi #116.5
movq %rax, (%rsi,%r10) #118.9
movq %rax, (%rsi,%r9) #119.9
movq %rax, 8(%rsi,%r11) #117.9
movq %rax, 8(%rsi,%r10) #118.9
movq %rax, 8(%rsi,%r9) #119.9
# Advance the byte offset by two doubles.
addq $16, %rsi #116.5
cmpq %rdx, %rdi #116.5
jb ..B1.5 # Prob 63% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
# %r8d = 2 * pairs + 1: 1-based index of the possible leftover element.
lea 1(%rdi,%rdi), %r8d #117.9
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
# %edx = 0-based index of the leftover element; skip it when it is >= count
# (unsigned compare), i.e. when the count was even.
lea -1(%r8), %edx #116.5
cmpl %ecx, %edx #116.5
jae ..B1.9 # Prob 9% #116.5
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
# Zero the single leftover element (index %r8 - 1) in each array.
movslq %r8d, %r8 #116.5
movq %rax, -8(%r11,%r8,8) #117.9
movq %rax, -8(%r10,%r8,8) #118.9
movq %rax, -8(%r9,%r8,8) #119.9
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
# Build the loop-invariant vector constants and initialise the outer loop.
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [9.00e-01]
# %xmm13 = square of the scalar loaded from parameter 1 + 72.
vmulsd %xmm2, %xmm2, %xmm13 #111.45
# %rdi = 0: row index into the neighbor-index table for the first atom.
xorl %edi, %edi #124.15
# %ymm16 = eight dwords of 8 (lane-index step), %ymm15 = lane indices 0..7.
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
# %xmm0 = 48.0 * scalar loaded from parameter 1 + 0.
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
# %zmm5 = eight doubles of 0.5.
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
# Broadcast the three scalars to all eight lanes. %zmm14, %zmm13 and %zmm9 are
# only consumed by instructions that are commented out in this variant.
vbroadcastsd %xmm13, %zmm14 #111.25
vbroadcastsd %xmm1, %zmm13 #112.21
vbroadcastsd %xmm0, %zmm9 #177.45
# %rdx = pointer at parameter 2 + 16: base of the position data, gathered later
# with element index * 3 and byte offsets 0 / 8 / 16.
movq 16(%r13), %rdx #127.25
# %r8 = outer-loop atom index, starting at 0.
xorl %r8d, %r8d #124.5
# %r12 = sign-extended element count, used as the outer-loop bound.
movslq %ecx, %r12 #124.5
# %rax = byte offset of the current atom inside the position data (+24 per atom).
xorl %eax, %eax #124.5
# %r13 = pointer at parameter 3 + 24: array of per-atom 32-bit neighbor counts.
movq 24(%r14), %r13 #126.25
# %rcx = 32-bit value at parameter 3 + 16, sign-extended; scaled by 4 below it
# becomes the byte stride between rows of the neighbor-index table.
movslq 16(%r14), %rcx #125.43
# %rsi = pointer at parameter 3 + 8: base of the neighbor-index table.
movq 8(%r14), %rsi #125.19
shlq $2, %rcx #108.5
# Spill the loop bound, the neighbor-count pointer and the first output-array
# pointer, and save callee-saved %r15 / %rbx, which the loops below reuse.
movq %r12, 80(%rsp) #124.5[spill]
movq %r13, 88(%rsp) #124.5[spill]
movq %r11, 96(%rsp) #124.5[spill]
movq %r15, 8(%rsp) #124.5[spill]
movq %rbx, (%rsp) #124.5[spill]
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Outer loop over atoms: load the neighbor count and position of atom %r8,
# then decide how the neighbor loop is split into peel / main / remainder parts.
..B1.10: # Preds ..B1.40 ..B1.9
# Execution count [5.00e+00]
movq 88(%rsp), %rbx #126.25[spill]
# Scalar results %xmm24 / %xmm18 / %xmm4 start at 0.0.
vxorpd %xmm24, %xmm24, %xmm24 #130.22
vmovapd %xmm24, %xmm18 #131.22
# %r11d = neighbor count of the current atom.
movl (%rbx,%r8,4), %r11d #126.25
vmovapd %xmm18, %xmm4 #132.22
# %xmm10 / %xmm6 / %xmm12 = the three consecutive doubles of the current
# atom's position (byte offsets 0, 8, 16 from %rax + %rdx).
vmovsd (%rax,%rdx), %xmm10 #127.25
vmovsd 8(%rax,%rdx), %xmm6 #128.25
vmovsd 16(%rax,%rdx), %xmm12 #129.25
# No neighbors: skip directly to the outer-loop tail.
testl %r11d, %r11d #153.32
jle ..B1.40 # Prob 50% #153.32
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
# Clear the three 8-lane accumulators %zmm8, %zmm7, %zmm11.
vpxord %zmm8, %zmm8, %zmm8 #130.22
vmovaps %zmm8, %zmm7 #131.22
vmovaps %zmm7, %zmm11 #132.22
# Fewer than 8 neighbors: only the masked remainder block runs.
cmpl $8, %r11d #153.13
jl ..B1.45 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
# Fewer than 1200 neighbors: skip the alignment peel and start the main loop at 0.
cmpl $1200, %r11d #153.13
jl ..B1.44 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
# %r15 = address of this atom's row in the neighbor-index table
# (row index %rdi * row stride %rcx + table base %rsi).
movq %rcx, %r15 #125.43
imulq %rdi, %r15 #125.43
addq %rsi, %r15 #108.5
# %r12 = row address modulo 64 (byte misalignment).
movq %r15, %r12 #153.13
andq $63, %r12 #153.13
# If the row is not even 4-byte aligned, peeling cannot reach 64-byte alignment.
testl $3, %r12d #153.13
je ..B1.15 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [2.25e+00]
# Not 4-byte aligned: peel count = 0.
xorl %r12d, %r12d #153.13
jmp ..B1.17 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.13
# Execution count [2.25e+00]
# Already 64-byte aligned: peel count stays 0.
testl %r12d, %r12d #153.13
je ..B1.17 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.15
# Execution count [2.50e+01]
# Peel count = (64 - misalignment) / 4 index elements, capped at the neighbor count.
negl %r12d #153.13
addl $64, %r12d #153.13
shrl $2, %r12d #153.13
cmpl %r12d, %r11d #153.13
cmovl %r11d, %r12d #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
# Execution count [5.00e+00]
# %r14d = end index of the main loop:
# neighbor count - ((neighbor count - peel count) & 7).
movl %r11d, %r14d #153.13
subl %r12d, %r14d #153.13
andl $7, %r14d #153.13
negl %r14d #153.13
addl %r11d, %r14d #153.13
# Peel count of 0: go straight to the main loop.
cmpl $1, %r12d #153.13
jb ..B1.25 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.17
# Execution count [4.50e+00]
# Peel-loop setup: %ymm4 = lane indices 0..7, %r13 = element offset 0,
# %ymm3 = peel count in every lane, %rbx = peel count as loop bound.
vmovdqa %ymm15, %ymm4 #153.13
xorl %r13d, %r13d #153.13
vpbroadcastd %r12d, %ymm3 #153.13
# Broadcast the current atom's three position doubles; only commented-out
# instructions consume %zmm2 / %zmm1 / %zmm0 in the peel loop.
vbroadcastsd %xmm10, %zmm2 #127.23
vbroadcastsd %xmm6, %zmm1 #128.23
vbroadcastsd %xmm12, %zmm0 #129.23
movslq %r12d, %rbx #153.13
# %r9 / %r10 are spilled because the peel loop reuses %r10d for the mask.
movq %r9, 24(%rsp) #153.13[spill]
movq %r10, 32(%rsp) #153.13[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Peel loop: masked processing of the first (peel count) neighbors so that the
# main loop starts on a 64-byte aligned index address.
..B1.19: # Preds ..B1.23 ..B1.18
# Execution count [2.50e+01]
# %k3 = lanes whose lane index (%ymm4) is below the peel count (%ymm3).
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
# Masked, zeroing load of up to eight 32-bit neighbor indices.
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
kmovw %k3, %r10d #153.13
# %ymm17 = 3 * neighbor index (element offset of the neighbor's first coordinate).
vpaddd %ymm17, %ymm17, %ymm18 #155.40
vpaddd %ymm18, %ymm17, %ymm17 #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.22: # Preds ..B1.19
# Execution count [1.25e+01]
# Each gather gets its own copy of the mask, and its destination is zeroed so
# that masked-off lanes read as 0.
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vpxord %zmm19, %zmm19, %zmm19 #155.40
vpxord %zmm20, %zmm20, %zmm20 #155.40
# Gather the three coordinates of each neighbor (byte offsets 16, 8, 0).
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.23: # Preds ..B1.22
# Execution count [2.50e+01]
addq $8, %r13 #153.13
# Everything below up to the compare is disabled in this memory-only variant:
# the gathered values are never used.
# NOTE(review): the first disabled line is the lane-index increment, not force
# arithmetic. With it disabled %ymm4 stays at 0..7, so %k3 is identical on every
# pass; when the peel count exceeds 8 the second pass loads a full 8 lanes
# instead of (peel count - 8). Confirm this is intended.
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
#vrcp14pd %zmm25, %zmm24 #175.42
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
#vfpclasspd $30, %zmm24, %k0 #175.42
#kmovw %k2, %r9d #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm25, %zmm17 #175.42
#andl %r9d, %r10d #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
#kmovw %r10d, %k3 #178.21
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
# Loop while the element offset is below the peel count.
cmpq %rbx, %r13 #153.13
jb ..B1.19 # Prob 82% #153.13
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.24: # Preds ..B1.23
# Execution count [4.50e+00]
# Restore the two output-array pointers spilled before the peel loop.
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# If the peel covered every neighbor, go straight to the reduction.
cmpl %r12d, %r11d #153.13
je ..B1.39 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Main loop: unmasked processing of eight neighbors per pass, from index %r12d
# up to (but not including) %r14d.
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
# Execution count [2.50e+01]
# Skip the main loop when there is no room for a full 8-wide pass.
lea 8(%r12), %ebx #153.13
cmpl %ebx, %r14d #153.13
jl ..B1.33 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25
# Execution count [4.50e+00]
# %r13 = address of this atom's row in the neighbor-index table.
movq %rcx, %r13 #125.43
imulq %rdi, %r13 #125.43
# Broadcast the current atom's position; only commented-out instructions
# consume %zmm1 / %zmm0 / %zmm2 in this loop.
vbroadcastsd %xmm10, %zmm1 #127.23
vbroadcastsd %xmm6, %zmm0 #128.23
vbroadcastsd %xmm12, %zmm2 #129.23
# %rbx = starting neighbor index.
movslq %r12d, %rbx #153.13
addq %rsi, %r13 #108.5
# Spill every general-purpose register that the scalar index loads below overwrite.
movq %rax, 40(%rsp) #108.5[spill]
movq %rcx, 48(%rsp) #108.5[spill]
movq %rsi, 56(%rsp) #108.5[spill]
movq %r8, 64(%rsp) #108.5[spill]
movq %rdi, 72(%rsp) #108.5[spill]
movq %r9, 24(%rsp) #108.5[spill]
movq %r10, 32(%rsp) #108.5[spill]
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.31 ..B1.26
# Execution count [2.50e+01]
# Vector path: load eight neighbor indices and form 3 * index in %ymm3.
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
vpaddd %ymm3, %ymm3, %ymm4 #155.40
vpaddd %ymm4, %ymm3, %ymm3 #155.40
# Scalar path: the same eight indices are also loaded one by one and multiplied
# by 3 (lea r, [r + 2r]). No enabled instruction in this loop reads these
# scalar results.
movl (%r13,%rbx,4), %r10d #154.25
movl 4(%r13,%rbx,4), %r9d #154.25
movl 8(%r13,%rbx,4), %r8d #154.25
movl 12(%r13,%rbx,4), %edi #154.25
lea (%r10,%r10,2), %r10d #155.40
movl 16(%r13,%rbx,4), %esi #154.25
lea (%r9,%r9,2), %r9d #155.40
movl 20(%r13,%rbx,4), %ecx #154.25
lea (%r8,%r8,2), %r8d #155.40
movl 24(%r13,%rbx,4), %eax #154.25
lea (%rdi,%rdi,2), %edi #155.40
movl 28(%r13,%rbx,4), %r15d #154.25
lea (%rsi,%rsi,2), %esi #155.40
lea (%rcx,%rcx,2), %ecx #155.40
lea (%rax,%rax,2), %eax #155.40
lea (%r15,%r15,2), %r15d #155.40
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.30: # Preds ..B1.27
# Execution count [1.25e+01]
# Comparing a register with itself sets every mask bit: all lanes enabled.
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
vpxord %zmm4, %zmm4, %zmm4 #155.40
vpxord %zmm17, %zmm17, %zmm17 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
# Gather the three coordinates of the eight neighbors (byte offsets 16, 8, 0).
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.31: # Preds ..B1.30
# Execution count [2.50e+01]
# Advance both copies of the neighbor index by eight.
addl $8, %r12d #153.13
addq $8, %rbx #153.13
# Force arithmetic disabled in this memory-only variant: the gathered values
# are never used.
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
#vrcp14pd %zmm3, %zmm22 #175.42
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
#vfpclasspd $30, %zmm22, %k0 #175.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
#knotw %k0, %k1 #175.42
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
# Loop while the neighbor index is below the main-loop end (unsigned compare).
cmpl %r14d, %r12d #153.13
jb ..B1.27 # Prob 82% #153.13
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.32: # Preds ..B1.31
# Execution count [4.50e+00]
# Restore the registers spilled before the main loop.
movq 40(%rsp), %rax #[spill]
movq 48(%rsp), %rcx #[spill]
movq 56(%rsp), %rsi #[spill]
movq 64(%rsp), %r8 #[spill]
movq 72(%rsp), %rdi #[spill]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Remainder: one masked pass over the neighbors left after the main loop
# (indices %r14d .. neighbor count - 1).
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
# Execution count [5.00e+00]
# Nothing left when %r14d + 1 > neighbor count (unsigned compare).
lea 1(%r14), %ebx #153.13
cmpl %r11d, %ebx #153.13
ja ..B1.39 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33
# Execution count [2.50e+01]
# %rdi = address of this atom's row in the neighbor-index table.
imulq %rcx, %rdi #125.43
vbroadcastsd %xmm10, %zmm4 #127.23
# %r11d = number of remaining neighbors.
subl %r14d, %r11d #153.13
addq %rsi, %rdi #108.5
# %k3 = lanes whose lane index (0..7 in %ymm15) is below the remaining count.
vpbroadcastd %r11d, %ymm0 #153.13
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
movslq %r14d, %r14 #153.13
# Masked, zeroing load of the remaining neighbor indices.
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
kmovw %k3, %edi #153.13
# %ymm0 = 3 * neighbor index.
vpaddd %ymm1, %ymm1, %ymm2 #155.40
vpaddd %ymm2, %ymm1, %ymm0 #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.37: # Preds ..B1.34
# Execution count [1.25e+01]
# Each gather gets its own copy of the mask, and its destination is zeroed first.
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm1, %zmm1, %zmm1 #155.40
vpxord %zmm2, %zmm2, %zmm2 #155.40
vpxord %zmm3, %zmm3, %zmm3 #155.40
# Gather the three coordinates of the remaining neighbors (byte offsets 16, 8, 0).
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.38: # Preds ..B1.37
# Execution count [2.50e+01]
# The whole block body is disabled in this memory-only variant: the gathered
# values are never used.
#vbroadcastsd %xmm6, %zmm6 #128.23
#vbroadcastsd %xmm12, %zmm12 #129.23
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
#vrcp14pd %zmm19, %zmm18 #175.42
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
#vfpclasspd $30, %zmm18, %k0 #175.42
#kmovw %k2, %ebx #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm19, %zmm0 #175.42
#andl %ebx, %edi #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
#kmovw %edi, %k3 #178.21
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Horizontal reduction of the three 8-lane accumulators (%zmm11, %zmm7, %zmm8)
# in three halving steps. Because all accumulating instructions are commented
# out in this variant, the accumulators still hold the zeros written in ..B1.11.
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
# Execution count [4.50e+00]
# Step 1: vpermd with dword indices 8..15 brings the upper 256 bits down, then add.
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
vpermd %zmm11, %zmm19, %zmm0 #132.22
vpermd %zmm7, %zmm19, %zmm6 #131.22
vpermd %zmm8, %zmm19, %zmm20 #130.22
vaddpd %zmm11, %zmm0, %zmm11 #132.22
vaddpd %zmm7, %zmm6, %zmm7 #131.22
vaddpd %zmm8, %zmm20, %zmm8 #130.22
# Step 2: imm8 78 (0b01001110) swaps the two 128-bit halves of each 256-bit lane, then add.
vpermpd $78, %zmm11, %zmm1 #132.22
vpermpd $78, %zmm7, %zmm10 #131.22
vpermpd $78, %zmm8, %zmm21 #130.22
vaddpd %zmm1, %zmm11, %zmm2 #132.22
vaddpd %zmm10, %zmm7, %zmm12 #131.22
vaddpd %zmm21, %zmm8, %zmm22 #130.22
# Step 3: imm8 177 (0b10110001) swaps adjacent doubles, then add.
# The totals end up in %xmm4, %xmm18 and %xmm24.
vpermpd $177, %zmm2, %zmm3 #132.22
vpermpd $177, %zmm12, %zmm17 #131.22
vpermpd $177, %zmm22, %zmm23 #130.22
vaddpd %zmm3, %zmm2, %zmm4 #132.22
vaddpd %zmm17, %zmm12, %zmm18 #131.22
vaddpd %zmm23, %zmm22, %zmm24 #130.22
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Outer-loop tail: advance to the next atom.
..B1.40: # Preds ..B1.39 ..B1.10
# Execution count [5.00e+00]
# %rbx = pointer to the first output array (spilled in ..B1.9); only the
# commented-out stores below would use it.
movq 96(%rsp), %rbx #188.9[spill]
# Next atom's position is 24 bytes (three doubles) further on.
addq $24, %rax #124.5
# %rdi = atom index + 1: row index into the neighbor-index table for the next pass.
movslq %r8d, %rdi #124.32
incq %rdi #124.32
# Accumulation of the reduced totals into the three output arrays is disabled
# in this memory-only variant, so the arrays keep the zeros written earlier.
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
#vmovsd %xmm1, (%r10,%r8,8) #189.9
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
#vmovsd %xmm2, (%r9,%r8,8) #190.9
# Loop while the atom index is below the spilled loop bound.
incq %r8 #124.5
cmpq 80(%rsp), %r8 #124.5[spill]
jb ..B1.10 # Prob 82% #124.5
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# Epilogue: restore callee-saved registers, take the end timestamp and return
# the elapsed value.
..B1.41: # Preds ..B1.40
# Execution count [9.00e-01]
# Restore %r15 and %rbx from the slots they were saved to in ..B1.9.
movq 8(%rsp), %r15 #[spill]
.cfi_restore 15
movq (%rsp), %rbx #[spill]
.cfi_restore 3
# LOE rbx r15
..B1.42: # Preds ..B1.2 ..B1.41
# Execution count [1.00e+00]
# %eax = 0 before the call.
# NOTE(review): presumably the SysV vector-register count for an unprototyped call - confirm.
xorl %eax, %eax #201.16
# Clear the upper vector state before calling code outside this function.
vzeroupper #201.16
..___tag_value_computeForce.43:
# getTimeStamp()
call getTimeStamp #201.16
..___tag_value_computeForce.44:
# LOE rbx r15 xmm0
..B1.43: # Preds ..B1.42
# Execution count [1.00e+00]
# Return value in %xmm0 = end timestamp - start timestamp (spilled at 16(%rsp)).
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
# Release the spill area, pop the saved registers and undo the stack alignment.
addq $104, %rsp #204.14
.cfi_restore 14
popq %r14 #204.14
.cfi_restore 13
popq %r13 #204.14
.cfi_restore 12
popq %r12 #204.14
movq %rbp, %rsp #204.14
popq %rbp #204.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #204.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE
# Infrequent entry for 8 <= neighbor count < 1200: no alignment peel.
..B1.44: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
# Main loop runs from index 0 to the neighbor count rounded down to a multiple of 8.
movl %r11d, %r14d #153.13
xorl %r12d, %r12d #153.13
andl $-8, %r14d #153.13
jmp ..B1.25 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# Infrequent entry for neighbor count < 8: the masked remainder handles all of them.
..B1.45: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
xorl %r14d, %r14d #153.13
jmp ..B1.33 # Prob 100% #153.13
.align 16,0x90
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constant pool emitted by the compiler for this translation unit.
# Doubles are written as two 32-bit words, low word first.
# Enabled instructions in computeForce reference only packets .0, .1, .3, .4
# and .10; packet .9 appears only in commented-out instructions.
.section .rodata, "a"
.align 64
.align 64
# Eight doubles, each 0x3ff0000000000000 = 1.0.
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# Eight doubles, each 0x3fe0000000000000 = 0.5. Loaded into %zmm5.
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 64-byte table of byte patterns; not referenced by computeForce in this file.
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
# Eight 64-bit integers: 0, 4, 8, 12, 1, 5, 9, 13. Not referenced by computeForce in this file.
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
# Eight 64-bit integers: 1, 5, 9, 13, 0, 4, 8, 12. Not referenced by computeForce in this file.
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
# Eight 64-bit integers: 2, 6, 10, 14, 2, 6, 10, 14. Not referenced by computeForce in this file.
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
# Sixteen 32-bit indices 8..15, 8..15: the vpermd index vector of the
# horizontal reduction, copying the upper 256 bits into both halves.
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
# Eight 32-bit integers, each 8. Loaded into %ymm16, whose only consumer
# (the lane-index increment in the peel loop) is commented out.
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Eight 32-bit integers 0..7: lane indices used to build the loop masks (%ymm15).
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# One double: 0x4048000000000000 = 48.0.
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# One double: 0x3ff0000000000000 = 1.0.
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.data
.section .note.GNU-stack, ""
# End

312
asm/unused/force.s Normal file
View File

@ -0,0 +1,312 @@
# computeForce(Parameter *param, Atom *atom, Neighbor *neighbor)
#
# Purpose: zero the per-atom force arrays fx/fy/fz, then for every local atom
# gather neighbour positions eight at a time with AVX-512, evaluate
#     force = 48 * epsilon * sr2 * sr6 * (sr6 - 0.5),  sr2 ~ 1/rsq,  sr6 = sr2^3 * sigma6
# for the pairs with rsq < cutforce^2, reduce the eight partial sums and add
# the result to fx[i], fy[i], fz[i].
#
# Struct field names used below follow the original annotations; the struct
# layouts themselves are not visible in this file.
#
# Stack: besides the three pushes, values are stored at negative offsets from
# rsp without moving rsp, i.e. the routine relies on the red zone below rsp.
#
# NOTE(review): this is a hand-edited compiler listing and several things look
# wrong or unfinished -- confirm before using it:
#   * "jle ..exit_func" for Nlocal <= 0 lands in the per-atom loop tail, which
#     reads spill slots ([-8+rsp], [-16+rsp], [-32+rsp]) that are only written
#     later, and uses r10 before it is initialised.
#   * rbx and r15 are stored to the stack but never reloaded; rbx is then
#     overwritten in ..exit_func and not restored before ret.
#   * r11 (neighbor->neighbors) is zeroed in ..B1.10 before it is added to the
#     row offset in ..B1.21, so the base pointer is lost.
#   * "jl ..B1.33" targets a label that is not defined in this file.
#   * The vector loop compares against ebx, but the code that computed ebx is
#     commented out.
#   * The vector loop index starts at 8, and no remainder loop is present.
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForce
computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push r12
push r13
push r14
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [72+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [8+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz (param pointer no longer needed)
test r9d, r9d # atom->Nlocal <= 0 ?
# NOTE(review): ..exit_func is the loop tail, not a clean function exit; the
# spill slots it reads have not been written at this point.
jle ..exit_func
..B1.2:
xor r10d, r10d # r10 <- 0 (counts pairs of zeroed elements)
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8 <- 0 (byte offset into fx/fy/fz)
mov r11d, 1 # r11d <- 1 (index of remainder element, plus one)
xor eax, eax # rax <- 0 (the value stored into the force arrays)
shr ecx, 1 # ecx <- Nlocal / 2 (number of pairs)
je ..B1.6 # no full pair: go straight to the remainder check
# Zero the force arrays, two elements per iteration
..B1.4:
mov QWORD PTR [r8+r13], rax # fx[2k] <- 0
mov QWORD PTR [r8+r14], rax # fy[2k] <- 0
mov QWORD PTR [r8+rdi], rax # fz[2k] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[2k+1] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[2k+1] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[2k+1] <- 0
add r8, 16 # advance byte offset by two doubles
inc r10 # k++
cmp r10, rcx # k < Nlocal / 2 ?
jb ..B1.4
..B1.5:
lea r11d, DWORD PTR [1+r10+r10] # r11d <- 2 * k + 1
..B1.6:
lea ecx, DWORD PTR [-1+r11] # ecx <- r11d - 1 (number of elements already zeroed)
cmp ecx, r9d # all Nlocal elements zeroed?
jae ..B1.8
..B1.7:
# Odd Nlocal: zero the one remaining element at index r11 - 1
movsxd r11, r11d # r11 <- sign-extended r11d
mov QWORD PTR [-8+r13+r11*8], rax # fx[r11-1] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[r11-1] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[r11-1] <- 0
..B1.8:
# Loop-invariant setup: broadcast constants and spill pointers
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforce * cutforce (cutforcesq)
xor r8d, r8d # r8 <- 0 (row index into the neighbour array)
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] (not read again below)
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7] (not read again below)
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x (neighbor pointer no longer needed)
### AOS
xor eax, eax # rax <- 0 (byte offset of atom i in the x array, 24 bytes per atom)
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4 (row size in bytes)
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
# NOTE(review): the next two slots are never read back in this function.
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
# Loop over all atoms
..B1.9:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (value added to fx[i] if the atom is skipped)
vmovapd xmm20, xmm25 # xmm20 <- 0 (value added to fy[i] if the atom is skipped)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (value added to fz[i] if the atom is skipped)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
test r13d, r13d # numneighs <= 0 ?
jle ..exit_func # no neighbours: go to the loop tail with zero sums
..B1.10:
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov r14d, r13d # r14d <- numneighs
# NOTE(review): this clears all of r11, discarding neighbor->neighbors.
xor r11d, r11d # r11 <- 0
and r14d, -8 # r14d <- numneighs rounded down to a multiple of 8
lea r9d, DWORD PTR [8+r11] # r9d <- r11 + 8 = 8
cmp r14d, r9d # fewer than 8 neighbours?
# NOTE(review): ..B1.33 is not defined anywhere in this file.
jl ..B1.33
# The block below is disabled compiler output (alignment peeling and loop
# bound computation, including the only definition of ebx).
# cmp r13d, 8 # numneighs < 8
# jl ..B1.32
#..B1.11:
# cmp r13d, 1200 # numneighs < 1200
# jl ..B1.31
#..B1.12:
# mov rcx, r12
# imul rcx, r8
# add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i (r8)]
# mov r9, rcx # r9 <- neighs
# and r9, 63 # r9 <- neighs & 63
# test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 8
# je ..B1.14
#..B1.13:
# xor r9d, r9d # r9d <- 0
# jmp ..B1.16
#..B1.14:
# test r9d, r9d # r9d == 0
# je ..B1.16
#..B1.15:
# neg r9d
# add r9d, 64
# shr r9d, 2 # r9d <- (64 - r9d) / 4
# cmp r13d, r9d # numneighs < r9d
# cmovl r9d, r13d # r9d <- MIN(numneighs, r9d)
#..B1.16:
# mov ebx, r13d
# sub ebx, r9d
# and ebx, 7
# neg ebx
# add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs
# cmp r9d, 1 # r9d < 1
# jb ..B1.20
#..B1.20:
# lea ecx, DWORD PTR [8+r9] # ecx <- r9d[1]
# cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs
# jl ..B1.24
..B1.21:
# Vector loop preamble: row offset and broadcast of atom i's position
mov rcx, r12 # rcx <- maxneighs * 4
imul rcx, r8 # rcx <- maxneighs * 4 * r8 (byte offset of this atom's row)
vbroadcastsd zmm0, xmm8 # zmm0 <- [xtmp, ...]
vbroadcastsd zmm1, xmm9 # zmm1 <- [ytmp, ...]
vbroadcastsd zmm2, xmm10 # zmm2 <- [ztmp, ...]
# NOTE(review): r9d is 8 here, so the first load below reads entries 8..15
# of the row -- confirm that entries 0..7 are meant to be skipped.
movsxd r14, r9d # r14 <- r9d (index of the next neighbour entry)
add rcx, r11 # rcx <- row offset + r11 (r11 is 0 here, see ..B1.10)
..B1.22:
# Gather masks are rebuilt every iteration (a gather clears its mask register)
vpcmpeqb k2, xmm0, xmm0 # k2 <- all ones (register compared with itself)
add r9d, 8 # r9d <- r9d + 8 (loop counter)
vpcmpeqb k1, xmm0, xmm0 # k1 <- all ones
vpcmpeqb k3, xmm0, xmm0 # k3 <- all ones
vmovdqu ymm3, YMMWORD PTR [rcx+r14*4] # ymm3 <- eight 32-bit neighbour indices j
add r14, 8 # advance to the next eight entries
vpxord zmm5, zmm5, zmm5 # zmm5 <- 0 (gather destination, y)
vpxord zmm6, zmm6, zmm6 # zmm6 <- 0 (gather destination, z)
### AOS
vpaddd ymm4, ymm3, ymm3 # ymm4 <- 2 * j
vpaddd ymm3, ymm3, ymm4 # ymm3 <- 3 * j
vpxord zmm4, zmm4, zmm4 # zmm4 <- 0 (gather destination, x)
vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] # zmm4 <- atom->x[3 * j]
vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8] # zmm5 <- atom->x[3 * j + 1]
vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8] # zmm6 <- atom->x[3 * j + 2]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8]
#vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- dely = ytmp - y[j]
vsubpd zmm28, zmm0, zmm4 # zmm28 <- delx = xtmp - x[j]
vsubpd zmm31, zmm2, zmm6 # zmm31 <- delz = ztmp - z[j]
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- zmm20 + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz (rsq)
# Cutoff test is applied as write mask k5 on the accumulations below
vrcp14pd zmm27, zmm20 # zmm27 <- approximate 1 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq (predicate 1 = less-than)
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 (sr6)
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- zmm25 * zmm27 - 0.5 = sr6 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr6
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force = 48.0 * epsilon * sr2 * sr6 * (sr6 - 0.5)
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += delx * force (lanes inside cutoff only)
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += dely * force (lanes inside cutoff only)
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += delz * force (lanes inside cutoff only)
# NOTE(review): ebx is not set by any live instruction before the first pass;
# after the first pass through ..exit_func it holds the low half of atom->fy.
cmp r9d, ebx
jb ..B1.22
#end neighbor loop
..B1.26:
# Horizontal sum of the eight lanes of fix/fiy/fiz; the total ends up in lane 0
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip] # zmm10 <- dword indices 8..15, twice
vpermd zmm0, zmm10, zmm11 # zmm0 <- upper 256 bits of fiz copied into both halves
vpermd zmm5, zmm10, zmm12 # zmm5 <- upper 256 bits of fiy copied into both halves
vpermd zmm21, zmm10, zmm13 # zmm21 <- upper 256 bits of fix copied into both halves
vaddpd zmm11, zmm0, zmm11 # fiz: lanes 0..3 hold lane[k] + lane[k+4]
vaddpd zmm12, zmm5, zmm12 # fiy: same
vaddpd zmm13, zmm21, zmm13 # fix: same
vpermpd zmm1, zmm11, 78 # swap the two 128-bit pairs inside each 256-bit half
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1 # fiz: sums of four lanes
vaddpd zmm8, zmm12, zmm6 # fiy: sums of four lanes
vaddpd zmm23, zmm13, zmm22 # fix: sums of four lanes
vpermpd zmm3, zmm2, 177 # swap adjacent doubles
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3 # xmm4 lane 0 <- total fiz
vaddpd zmm20, zmm8, zmm9 # xmm20 lane 0 <- total fiy
vaddpd zmm25, zmm23, zmm24 # xmm25 lane 0 <- total fix
# Loop tail: accumulate into fx/fy/fz and advance to the next atom.
# Despite its name this label is also the target of the early-out jumps.
..exit_func:
mov rcx, QWORD PTR [-8+rsp] # rcx <- atom->fx (reload spill)
# NOTE(review): overwrites rbx; the copy saved at [-48+rsp] is never restored.
mov rbx, QWORD PTR [-16+rsp] # rbx <- atom->fy (reload spill)
### AOS
add rax, 24 # advance the AOS byte offset by three doubles
###
movsxd r8, r10d # r8 <- i
inc r8 # r8 <- i + 1 (row index for the next atom)
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] # xmm0 <- fx[i] + fix
vmovsd QWORD PTR [rcx+r10*8], xmm0 # fx[i] <- xmm0
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] # xmm1 <- fy[i] + fiy
vmovsd QWORD PTR [rbx+r10*8], xmm1 # fy[i] <- xmm1
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] # xmm2 <- fz[i] + fiz
vmovsd QWORD PTR [rdi+r10*8], xmm2 # fz[i] <- xmm2
inc r10 # i++
cmp r10, QWORD PTR [-32+rsp] # i < atom->Nlocal (reload spill)
jb ..B1.9
vzeroupper # clear upper vector state before returning
vxorpd xmm0, xmm0, xmm0 # xmm0 <- 0.0 (presumably the return value; confirm against the C prototype)
pop r14 # restore registers pushed on entry
pop r13
pop r12
ret
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constants for computeForce. Doubles are stored as two 32-bit words,
# low word first.
.section .rodata, "a"
.align 64
.align 64
# Eight doubles, each 0x3ff0000000000000 = 1.0 (not referenced by computeForce).
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# Eight doubles, each 0x3fe0000000000000 = 0.5; loaded into zmm7.
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# Sixteen 32-bit indices, 8..15 twice; vpermd control used in the horizontal
# reduction to copy the upper 256 bits of a zmm register into both halves.
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
# Eight 32-bit values, all 8; loaded into ymm18, which is not read afterwards.
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Eight 32-bit values 0..7; loaded into ymm17, which is not read afterwards.
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# One double, 0x4048000000000000 = 48.0; multiplied with epsilon.
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# One double, 0x3ff0000000000000 = 1.0 (not referenced by computeForce).
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

131
config.mk
View File

@ -1,10 +1,5 @@
# Compiler tool chain (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TOOLCHAIN ?= CLANG
# Instruction set for intrinsic kernels (NONE/SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= ARM
SIMD ?= NONE
# Optimization scheme (verletlist/clusterpair/clusters_per_bin)
OPT_SCHEME ?= verletlist
# Compiler tag (GCC/CLANG/ICC/NVCC)
TAG ?= NVCC
# Enable likwid (true or false)
ENABLE_LIKWID ?= false
# SP or DP
@ -12,129 +7,23 @@ DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
ASM_SYNTAX ?= INTEL
# Debug
DEBUG ?= false
ASM_SYNTAX ?= ATT
# Sort atoms when reneighboring (true or false)
SORT_ATOMS ?= true
# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
NEIGHBORS_LOOP_RUNS ?= 1
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
INDEX_TRACER ?= false
# Vector width (elements) for index and distance tracer
VECTOR_WIDTH ?= 8
# Compute statistics
COMPUTE_STATS ?= true
# Configurations for lammps optimization scheme
# Use omp simd pragma when running with half neighbor-lists
ENABLE_OMP_SIMD ?= false
# Use kernel with explicit SIMD intrinsics
USE_SIMD_KERNEL ?= false
# Configurations for gromacs optimization scheme
# Use reference version
USE_REFERENCE_VERSION ?= false
# Enable XTC output
XTC_OUTPUT ?= false
# Check if cj is local when decreasing reaction force
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true
# Configurations for CUDA
# Use CUDA host memory to optimize transfers
USE_CUDA_HOST_MEMORY ?= false
COMPUTE_STATS ?= false
#Feature options
OPTIONS = -DALIGNMENT=64
#OPTIONS += More options
#DO NOT EDIT BELOW
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
DEFINES += -DPRECISION=2
endif
ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
ifeq ($(strip $(MEM_TRACER)),true)
DEFINES += -DMEM_TRACER
endif
ifeq ($(strip $(INDEX_TRACER)),true)
DEFINES += -DINDEX_TRACER
endif
ifeq ($(strip $(COMPUTE_STATS)),true)
DEFINES += -DCOMPUTE_STATS
endif
ifeq ($(strip $(XTC_OUTPUT)),true)
DEFINES += -DXTC_OUTPUT
endif
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
DEFINES += -DUSE_REFERENCE_VERSION
endif
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
endif
ifeq ($(strip $(DEBUG)),true)
DEFINES += -DDEBUG
endif
ifneq ($(VECTOR_WIDTH),)
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
endif
ifeq ($(strip $(__SIMD_KERNEL__)),true)
DEFINES += -D__SIMD_KERNEL__
endif
ifeq ($(strip $(__SSE__)),true)
DEFINES += -D__ISA_SSE__
endif
ifeq ($(strip $(__ISA_AVX__)),true)
DEFINES += -D__ISA_AVX__
endif
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
DEFINES += -D__ISA_AVX_FMA__
endif
ifeq ($(strip $(__ISA_AVX2__)),true)
DEFINES += -D__ISA_AVX2__
endif
ifeq ($(strip $(__ISA_AVX512__)),true)
DEFINES += -D__ISA_AVX512__
endif
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
DEFINES += -DENABLE_OMP_SIMD
endif
ifeq ($(strip $(OPT_SCHEME)),verletlist)
OPT_TAG = VL
endif
ifneq ($(strip $(SIMD)),NONE)
TOOLCHAIN = $(TOOLCHAIN)-$(ISA)-$(SIMD)
endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,244 +0,0 @@
;
; Generated by:
; Vitaly V. Chaban
; School of Chemistry
; University of Kharkiv
; Ukraine, Kharkiv-61077, Svoboda sq., 4
; email: chaban@univer.kharkov.ua, vvchaban@gmail.com
; skype: vvchaban
; System: Liquid argon (1000 atoms) at 80 K. Equilibrated for 500ps.
; VARIOUS PREPROCESSING OPTIONS
title = Yo
cpp = /usr/bin/cpp
include =
define =
; RUN CONTROL PARAMETERS
integrator = md
; Start time and timestep in ps
tinit = 0
dt = 0.001
nsteps = 250000
; For exact run continuation or redoing part of a run
init_step = 0
; mode for center of mass motion removal
comm-mode = Linear
; number of steps for center of mass motion removal
nstcomm = 1
; group(s) for center of mass motion removal
comm-grps =
; LANGEVIN DYNAMICS OPTIONS
; Temperature, friction coefficient (amu/ps) and random seed
bd-temp = 300
bd-fric = 0
ld-seed = 1993
; ENERGY MINIMIZATION OPTIONS
; Force tolerance and initial step-size
emtol = 100
emstep = 0.01
; Max number of iterations in relax_shells
niter = 20
; Step size (1/ps^2) for minimization of flexible constraints
fcstep = 0
; Frequency of steepest descents steps when doing CG
nstcgsteep = 1000
nbfgscorr = 10
; OUTPUT CONTROL OPTIONS
; Output frequency for coords (x), velocities (v) and forces (f)
nstxout = 500
nstvout = 5
nstfout = 0
; Checkpointing helps you continue after crashes
nstcheckpoint = 1000
; Output frequency for energies to log file and energy file
nstlog = 50
nstenergy = 50
; Output frequency and precision for xtc file
nstxtcout = 5
xtc-precision = 1000
; This selects the subset of atoms for the xtc file. You can
; select multiple groups. By default all atoms will be written.
xtc-grps =
; Selection of energy groups
energygrps =
; NEIGHBORSEARCHING PARAMETERS
; nblist update frequency
nstlist = 5
; ns algorithm (simple or grid)
ns_type = grid
; Periodic boundary conditions: xyz (default), no (vacuum)
; or full (infinite systems only)
pbc = xyz
; nblist cut-off
rlist = 0.9
domain-decomposition = no
; OPTIONS FOR ELECTROSTATICS AND VDW
; Method for doing electrostatics
coulombtype = Cut-off
rcoulomb-switch = 0
rcoulomb = 0.9
; Dielectric constant (DC) for cut-off or DC of reaction field
epsilon-r = 1
; Method for doing Van der Waals
vdw-type = Cut-off
; cut-off lengths
rvdw-switch = 0
rvdw = 0.9
; Apply long range dispersion corrections for Energy and Pressure
DispCorr = EnerPres
; Extension of the potential lookup tables beyond the cut-off
table-extension = 1
; Spacing for the PME/PPPM FFT grid
fourierspacing = 0.12
; FFT grid size, when a value is 0 fourierspacing will be used
fourier_nx = 0
fourier_ny = 0
fourier_nz = 0
; EWALD/PME/PPPM parameters
pme_order = 4
ewald_rtol = 1e-05
ewald_geometry = 3d
epsilon_surface = 0
optimize_fft = no
; GENERALIZED BORN ELECTROSTATICS
; Algorithm for calculating Born radii
gb_algorithm = Still
; Frequency of calculating the Born radii inside rlist
nstgbradii = 1
; Cutoff for Born radii calculation; the contribution from atoms
; between rlist and rgbradii is updated every nstlist steps
rgbradii = 2
; Salt concentration in M for Generalized Born models
gb_saltconc = 0
; IMPLICIT SOLVENT (for use with Generalized Born electrostatics)
implicit_solvent = No
; OPTIONS FOR WEAK COUPLING ALGORITHMS
; Temperature coupling
Tcoupl = berendsen
; Groups to couple separately
tc-grps = System
; Time constant (ps) and reference temperature (K)
tau_t = 0.1
ref_t = 80
; Pressure coupling
Pcoupl = no
Pcoupltype = isotropic
; Time constant (ps), compressibility (1/bar) and reference P (bar)
tau_p = 1.0
compressibility = 4.5e-5
ref_p = 1.0
; Random seed for Andersen thermostat
andersen_seed = 815131
; SIMULATED ANNEALING
; Type of annealing for each temperature group (no/single/periodic)
annealing = no
; Number of time points to use for specifying annealing in each group
annealing_npoints =
; List of times at the annealing points for each group
annealing_time =
; Temp. at each annealing point, for each group.
annealing_temp =
; GENERATE VELOCITIES FOR STARTUP RUN
gen_vel = yes
gen_temp = 80
gen_seed = 1993
; OPTIONS FOR BONDS
constraints = all-bonds
; Type of constraint algorithm
constraint-algorithm = Lincs
; Do not constrain the start configuration
unconstrained-start = no
; Use successive overrelaxation to reduce the number of shake iterations
Shake-SOR = no
; Relative tolerance of shake
shake-tol = 1e-04
; Highest order in the expansion of the constraint coupling matrix
lincs-order = 4
; Number of iterations in the final step of LINCS. 1 is fine for
; normal simulations, but use 2 to conserve energy in NVE runs.
; For energy minimization with constraints it should be 4 to 8.
lincs-iter = 1
; Lincs will write a warning to the stderr if in one step a bond
; rotates over more degrees than
lincs-warnangle = 30
; Convert harmonic bonds to morse potentials
morse = no
; ENERGY GROUP EXCLUSIONS
; Pairs of energy groups for which all non-bonded interactions are excluded
energygrp_excl =
; NMR refinement stuff
; Distance restraints type: No, Simple or Ensemble
disre = No
; Force weighting of pairs in one distance restraint: Conservative or Equal
disre-weighting = Conservative
; Use sqrt of the time averaged times the instantaneous violation
disre-mixed = no
disre-fc = 1000
disre-tau = 0
; Output frequency for pair distances to energy file
nstdisreout = 100
; Orientation restraints: No or Yes
orire = no
; Orientation restraints force constant and tau for time averaging
orire-fc = 0
orire-tau = 0
orire-fitgrp =
; Output frequency for trace(SD) to energy file
nstorireout = 100
; Dihedral angle restraints: No, Simple or Ensemble
dihre = No
dihre-fc = 1000
dihre-tau = 0
; Output frequency for dihedral values to energy file
nstdihreout = 100
; Free energy control stuff
free-energy = no
init-lambda = 0
delta-lambda = 0
sc-alpha = 0
sc-sigma = 0.3
; Non-equilibrium MD stuff
acc-grps =
accelerate =
freezegrps =
freezedim =
cos-acceleration = 0
; Electric fields
; Format is number of terms (int) and for all terms an amplitude (real)
; and a phase angle (real)
E-x =
E-xt =
E-y =
E-yt =
E-z =
E-zt =
; User defined thingies
user1-grps =
user2-grps =
userint1 = 0
userint2 = 0
userint3 = 0
userint4 = 0
userreal1 = 0
userreal2 = 0
userreal3 = 0
userreal4 = 0

View File

@ -1,12 +0,0 @@
mass 39.94
sigma 0.0062220
epsilon 0.0000096960
ntimes 250000
dt 0.001
temp 80
x_out_freq 500
v_out_freq 5
cutforce 1.8
skin 0.1
reneigh_every 100
nstat 125000

File diff suppressed because it is too large Load Diff

View File

@ -1,304 +0,0 @@
DATE: 2007-06-11 UNITS: metal CONTRIBUTOR: Stephen Foiles, foiles@sandia.gov CITATION: Foiles et al, Phys Rev B, 33, 7983 (1986) COMMENT: Cu functions (universal 3), SM Foiles et al, PRB, 33, 7983 (1986)
29 63.550 3.6150 FCC
500 5.0100200400801306e-04 500 1.0000000000000009e-02 4.9499999999999886e+00
0. -3.1561636903424350e-01 -5.2324876182494506e-01 -6.9740831416804383e-01 -8.5202525457518519e-01
-9.9329216586042435e-01 -1.1246331970890324e+00 -1.2481882647347859e+00 -1.3654054700363645e+00 -1.4773214276236644e+00
-1.5847099936904741e+00 -1.6865851873526410e+00 -1.7843534091637920e+00 -1.8790616476576076e+00 -1.9710188604521761e+00
-2.0604838665854572e+00 -2.1476762477372944e+00 -2.2327843595560068e+00 -2.3159713409697673e+00 -2.3973797031286352e+00
-2.4771348895887826e+00 -2.5553480773272810e+00 -2.6321184083774227e+00 -2.7075347880408458e+00 -2.7816773487592030e+00
-2.8546186529652005e+00 -2.9264246898861899e+00 -2.9971557080624507e+00 -3.0668669157065978e+00 -3.1356090736776849e+00
-3.2034290008357829e+00 -3.2703700069757247e+00 -3.3364722658277230e+00 -3.4017731379735778e+00 -3.4663074517059016e+00
-3.5301077484029122e+00 -3.5932044977085980e+00 -3.6556262870729199e+00 -3.7173999892229403e+00 -3.7785509106421671e+00
-3.8391029237823773e+00 -3.8990785849196925e+00 -3.9584992397079333e+00 -4.0173851179270912e+00 -4.0744518500210916e+00
-4.1306733564032641e+00 -4.1864034067843932e+00 -4.2416582335814326e+00 -4.2964533268445280e+00 -4.3508034838872618e+00
-4.4047228547107977e+00 -4.4582249835318351e+00 -4.5113228468570128e+00 -4.5640288884490872e+00 -4.6163550514904443e+00
-4.6683128082199232e+00 -4.7199131872767452e+00 -4.7711667990036801e+00 -4.8220838587683374e+00 -4.8726742087289665e+00
-4.9229473379113813e+00 -4.9729124009208192e+00 -5.0225782353423369e+00 -5.0719533779533492e+00 -5.1210460798461668e+00
-5.1698643205481289e+00 -5.2184158212228908e+00 -5.2667080570261362e+00 -5.3147482686812282e+00 -5.3625434733324937e+00
-5.4101004747367369e+00 -5.4574258728391953e+00 -5.5045260727784751e+00 -5.5514072933650311e+00 -5.5980755750691458e+00
-5.6445367875538750e+00 -5.6907966367860183e+00 -5.7368606717507191e+00 -5.7827342908000219e+00 -5.8284227476608805e+00
-5.8739311571204382e+00 -5.9192645004390272e+00 -5.9644276303605182e+00 -6.0094252761103064e+00 -6.0542620478988169e+00
-6.0989424413057520e+00 -6.1434708414539330e+00 -6.1878515269578429e+00 -6.2320886736884802e+00 -6.2761863583589275e+00
-6.3201485619430571e+00 -6.3639791729330000e+00 -6.4076819904493902e+00 -6.4512607272098990e+00 -6.4947190123648113e+00
-6.5380603942065250e+00 -6.5812883427622069e+00 -6.6243939095620874e+00 -6.6670830925929181e+00 -6.7096660473058591e+00
-6.7521459135001862e+00 -6.7945257643836499e+00 -6.8368086085521611e+00 -6.8789973918942735e+00 -6.9210949994162263e+00
-6.9631042569970703e+00 -7.0050279330721992e+00 -7.0468687402560874e+00 -7.0886293368973554e+00 -7.1303123285804020e+00
-7.1719202695651916e+00 -7.2134556641788095e+00 -7.2549209681507421e+00 -7.2963185899023415e+00 -7.3376508917899628e+00
-7.3789201913012903e+00 -7.4201287622117036e+00 -7.4612788356982946e+00 -7.5023726014152032e+00 -7.5434122085331978e+00
-7.5843997667427345e+00 -7.6253373472216595e+00 -7.6662269835740062e+00 -7.7070706727342895e+00 -7.7478703758424388e+00
-7.7886280190928119e+00 -7.8293454945503811e+00 -7.8700246609474789e+00 -7.9106673444489104e+00 -7.9512753393968865e+00
-7.9918504090315139e+00 -8.0323942861870705e+00 -8.0729086739704030e+00 -8.1133952464140293e+00 -8.1538556491162808e+00
-8.1942914998523975e+00 -8.2347043891773524e+00 -8.2750958810033808e+00 -8.3154675131659701e+00 -8.3558207979692725e+00
-8.3961572227176475e+00 -8.4364782502312892e+00 -8.4767853193496308e+00 -8.5170798454139458e+00 -8.5573632207473906e+00
-8.5976368151087286e+00 -8.6379019761436666e+00 -8.6781600298199919e+00 -8.7184122808490656e+00 -8.7586600130993020e+00
-8.7989044899963460e+00 -8.8391469549140993e+00 -8.8793886315543773e+00 -8.9196307243150841e+00 -8.9598744186541239e+00
-9.0001208814363167e+00 -9.0403712612778122e+00 -9.0806266888772029e+00 -9.1208882773446476e+00 -9.1611571225108719e+00
-9.2014343032440138e+00 -9.2417208817437881e+00 -9.2820179038447463e+00 -9.3223263992829857e+00 -9.3626473819958278e+00
-9.4029818503831279e+00 -9.4433307875392529e+00 -9.4836951616705960e+00 -9.5237840547885071e+00 -9.5637918926951784e+00
-9.6038142178817338e+00 -9.6438519061474608e+00 -9.6839058194810832e+00 -9.7239768064614509e+00 -9.7640657024289226e+00
-9.8041733297054634e+00 -9.8443004978059889e+00 -9.8844480036373170e+00 -9.9246166317080906e+00 -9.9648071543198853e+00
-1.0005020331762637e+01 -1.0045256912501884e+01 -1.0085517633366123e+01 -1.0125803219723423e+01 -1.0166114385662183e+01
-1.0206451834160134e+01 -1.0246816257258331e+01 -1.0287208336224353e+01 -1.0327628741713852e+01 -1.0368078133934148e+01
-1.0408557162795717e+01 -1.0449066468066974e+01 -1.0489606679525650e+01 -1.0530178417100558e+01 -1.0570782291022510e+01
-1.0611418901960292e+01 -1.0652088841158786e+01 -1.0692792690577562e+01 -1.0733531023022920e+01 -1.0774304402276016e+01
-1.0815113383222808e+01 -1.0855958511980305e+01 -1.0896840326017184e+01 -1.0937759354276295e+01 -1.0978716117290730e+01
-1.1019711127305925e+01 -1.1060744888386239e+01 -1.1101817896531486e+01 -1.1142930639787664e+01 -1.1184083598352004e+01
-1.1225277244679319e+01 -1.1266512043589387e+01 -1.1307788452364719e+01 -1.1349106920870327e+01 -1.1390467891550486e+01
-1.1431871799781504e+01 -1.1473319073642074e+01 -1.1514810134213008e+01 -1.1556345395619132e+01 -1.1597925265115521e+01
-1.1639550143177303e+01 -1.1681220423591583e+01 -1.1722936493536452e+01 -1.1764698733669888e+01 -1.1806507518187232e+01
-1.1848363215029394e+01 -1.1890266185706139e+01 -1.1932216785634637e+01 -1.1974215364086319e+01 -1.2016262264291129e+01
-1.2058357823507606e+01 -1.2100502373105996e+01 -1.2142696238631970e+01 -1.2184939739884385e+01 -1.2227233190982815e+01
-1.2269576900438324e+01 -1.2311971171220080e+01 -1.2354416300827552e+01 -1.2396912581348374e+01 -1.2439460299532641e+01
-1.2482059736851909e+01 -1.2524711169562636e+01 -1.2567414868772744e+01 -1.2610171100495961e+01 -1.2652980125719694e+01
-1.2695842200459083e+01 -1.2738757575819193e+01 -1.2781726498053729e+01 -1.2824749208615117e+01 -1.2867825944219817e+01
-1.2910956936899197e+01 -1.2954142414054047e+01 -1.2997382598508125e+01 -1.3040677708563408e+01 -1.3084027958052218e+01
-1.3127433556386677e+01 -1.3170894708610035e+01 -1.3214411615448739e+01 -1.3257984473359954e+01 -1.3301613474583519e+01
-1.3345298807190659e+01 -1.3389040655121903e+01 -1.3432839198243016e+01 -1.3476694612386723e+01 -1.3520607069407617e+01
-1.3564576737214225e+01 -1.3608603779754390e+01 -1.3652688357330362e+01 -1.3696830626228689e+01 -1.3741030739041094e+01
-1.3785288844633044e+01 -1.3829605088192579e+01 -1.3873979611263849e+01 -1.3918412551792358e+01 -1.3962904044165157e+01
-1.4007454219246995e+01 -1.4052063204422609e+01 -1.4096731123636516e+01 -1.4141458097424390e+01 -1.4186244242962175e+01
-1.4231089674089560e+01 -1.4275994501358696e+01 -1.4320958832063411e+01 -1.4365982770278379e+01 -1.4411066416893846e+01
-1.4456209869649911e+01 -1.4501413223171539e+01 -1.4546676569005058e+01 -1.4591999995647598e+01 -1.4637383588581656e+01
-1.4682827430315228e+01 -1.4728331600403862e+01 -1.4773896175488971e+01 -1.4819521229330235e+01 -1.4865206832833337e+01
-1.4910953054084985e+01 -1.4956759958383259e+01 -1.5002627608264334e+01 -1.5048556063539081e+01 -1.5094545381317744e+01
-1.5140595616041765e+01 -1.5186706819511983e+01 -1.5232879040916600e+01 -1.5279112326867676e+01 -1.5325406721414765e+01
-1.5371762266086876e+01 -1.5418178999911675e+01 -1.5464656959446415e+01 -1.5511196178805903e+01 -1.5557796689685119e+01
-1.5604458521389688e+01 -1.5651181700861002e+01 -1.5697966252703509e+01 -1.5744812199205967e+01 -1.5791719560374304e+01
-1.5838688353945599e+01 -1.5885718595428898e+01 -1.5932810298111235e+01 -1.5979963473102316e+01 -1.6027178129340314e+01
-1.6074454273625634e+01 -1.6121791910645470e+01 -1.6169191042992907e+01 -1.6216651671189425e+01 -1.6264173793714576e+01
-1.6311757407021901e+01 -1.6359402505566209e+01 -1.6407109081822910e+01 -1.6454877126310635e+01 -1.6502706627614998e+01
-1.6550597572407241e+01 -1.6598549945469813e+01 -1.6646563729715353e+01 -1.6694638906205682e+01 -1.6742775454176012e+01
-1.6790973351056778e+01 -1.6839232572488413e+01 -1.6887553092348412e+01 -1.6935934882766333e+01 -1.6984377914146876e+01
-1.7032882155186826e+01 -1.7081447572897673e+01 -1.7130074132623690e+01 -1.7178761798061373e+01 -1.7227510531275698e+01
-1.7276320292724563e+01 -1.7325191041271864e+01 -1.7374122734215121e+01 -1.7423115327299456e+01 -1.7472168774711918e+01
-1.7521283029136725e+01 -1.7570458041655343e+01 -1.7619693762170868e+01 -1.7668990138814479e+01 -1.7718347118374936e+01
-1.7767764646209685e+01 -1.7817242666259403e+01 -1.7866781121071881e+01 -1.7916379951810882e+01 -1.7966039098283659e+01
-1.8015758498943796e+01 -1.8065538090918608e+01 -1.8115377810021755e+01 -1.8165277590764617e+01 -1.8215237366381530e+01
-1.8265257068836149e+01 -1.8315336628844307e+01 -1.8365475975885602e+01 -1.8415675038220570e+01 -1.8465933742903644e+01
-1.8516252015799409e+01 -1.8566629781600568e+01 -1.8617066963838965e+01 -1.8667563484898778e+01 -1.8718119266039025e+01
-1.8768734227397317e+01 -1.8819408288014415e+01 -1.8870141365839345e+01 -1.8920933377750998e+01 -1.8971784239569388e+01
-1.9022693866067016e+01 -1.9073662170983084e+01 -1.9124689067045438e+01 -1.9175774465969539e+01 -1.9226918278483254e+01
-1.9278120414338218e+01 -1.9329380782317116e+01 -1.9380699290257098e+01 -1.9432075845048644e+01 -1.9483510352663075e+01
-1.9535002718153464e+01 -1.9586552845676124e+01 -1.9638160638497766e+01 -1.9689825999008235e+01 -1.9741548828738019e+01
-1.9793329028359494e+01 -1.9845166497711489e+01 -1.9897061135804051e+01 -1.9949012840833348e+01 -2.0001021510188707e+01
-2.0053087040468540e+01 -2.0105209327494322e+01 -2.0157388266314911e+01 -2.0209623751249865e+01 -2.0261915675825890e+01
-2.0314263932714312e+01 -2.0366668414255741e+01 -2.0419129011700647e+01 -2.0471645615726288e+01 -2.0524218116314501e+01
-2.0576846402769888e+01 -2.0629530363722893e+01 -2.0682269887147754e+01 -2.0735064860369221e+01 -2.0787915170073120e+01
-2.0840820702317274e+01 -2.0893781342541502e+01 -2.0946796975575580e+01 -2.0999867485656864e+01 -2.1052992756428125e+01
-2.1106172670961428e+01 -2.1159407111702421e+01 -2.1212695960751944e+01 -2.1266039099329419e+01 -2.1319436408360275e+01
-2.1372887768154328e+01 -2.1426393058473991e+01 -2.1479952158748461e+01 -2.1533564947619766e+01 -2.1587231303431395e+01
-2.1640951103995235e+01 -2.1694724226644553e+01 -2.1748550548245930e+01 -2.1802429945213817e+01 -2.1856362293508028e+01
-2.1910347468648524e+01 -2.1964385345728829e+01 -2.2018475799410339e+01 -2.2072618703948137e+01 -2.2126813933181779e+01
-2.2181061360561898e+01 -2.2235360859143157e+01 -2.2289712301596296e+01 -2.2344115560361388e+01 -2.2398570507087584e+01
-2.2453077013515781e+01 -2.2507634950890292e+01 -2.2562244190064348e+01 -2.2616904601590250e+01 -2.2671616055687764e+01
-2.2726378422261405e+01 -2.2781191570901910e+01 -2.2836055370890790e+01 -2.2890969691219198e+01 -2.2945934400583837e+01
-2.3000949367399926e+01 -2.3056014459808921e+01 -2.3111129545678523e+01 -2.3166294492618363e+01 -2.3221509167983868e+01
-2.3276773438880355e+01 -2.3332087172173260e+01 -2.3387450234495873e+01 -2.3442862492249787e+01 -2.3498323811618320e+01
-2.3553834058571510e+01 -2.3609393098863848e+01 -2.3665000798062465e+01 -2.3720657021526677e+01 -2.3776361634436626e+01
-2.3832114501780552e+01 -2.3887915488378439e+01 -2.3943764458878377e+01 -2.3999661277761106e+01 -2.4055605809352301e+01
-2.4111597917826657e+01 -2.4167637467209488e+01 -2.4223724321393092e+01 -2.4279858344124932e+01 -2.4336039399030597e+01
-2.4392267349614485e+01 -2.4448542059257761e+01 -2.4504863391234494e+01 -2.4561231208711206e+01 -2.4617645374753693e+01
-2.4674105752332935e+01 -2.4730612204329191e+01 -2.4787164593538137e+01 -2.4843762782677913e+01 -2.4900406634392539e+01
-2.4957096011252133e+01 -2.5013830775771112e+01 -2.5070610790396586e+01 -2.5127435917366029e+01 -2.5184306019355063e+01
-2.5241220958503845e+01 -2.5298180597080318e+01 -2.5355184797285347e+01 -2.5412233421340488e+01 -2.5469326331427965e+01
1.0000000000000000e+01 1.0801534951171448e+01 1.0617375158244670e+01 1.0436688151228793e+01 1.0259403283230313e+01
1.0085451405601304e+01 9.9147648356938589e+00 9.7472773253084029e+00 9.5829240298195373e+00 9.4216414779654656e+00
9.2633675422888473e+00 9.1080414102110012e+00 8.9556035557302494e+00 8.8059957117284853e+00 8.6591608428743143e+00
8.5150431191084976e+00 8.3735878897014118e+00 8.2347416578681987e+00 8.0984520559319435e+00 7.9646678210201571e+00
7.8333387712866624e+00 7.7044157826449009e+00 7.5778507660022569e+00 7.4535966449878401e+00 7.3316073341564731e+00
7.2118377176659578e+00 7.0942436284134374e+00 6.9787818276207929e+00 6.8654099848621115e+00 6.7540866585212882e+00
6.6447712766712357e+00 6.5374241183666584e+00 6.4320062953403578e+00 6.3284797340946000e+00 6.2268071583795574e+00
6.1269520720505000e+00 6.0288787422946655e+00 5.9325521832211621e+00 5.8379381398054591e+00 5.7450030721804524e+00
5.6537141402680220e+00 5.5640391887418730e+00 5.4759467323160322e+00 5.3894059413519244e+00 5.3043866277758980e+00
5.2208592313018016e+00 5.1387948059520454e+00 5.0581650068698707e+00 4.9789420774166615e+00 4.9010988365496075e+00
4.8246086664712777e+00 4.7494455005478358e+00 4.6755838114879396e+00 4.6029985997776066e+00 4.5316653823665547e+00
4.4615601815980312e+00 4.3926595143797726e+00 4.3249403815888456e+00 4.2583802577058805e+00 4.1929570806747449e+00
4.1286492419807814e+00 4.0654355769448500e+00 4.0032953552278059e+00 3.9422082715398403e+00 3.8821544365521561e+00
3.8231143680053350e+00 3.7650689820101348e+00 3.7079995845373759e+00 3.6518878630917868e+00 3.5967158785670392e+00
3.5424660572764992e+00 3.4891211831576925e+00 3.4366643901451397e+00 3.3850791547089756e+00 3.3343492885547761e+00
3.2844589314827459e+00 3.2353925444006251e+00 3.1871349024889781e+00 3.1396710885139782e+00 3.0929864862859660e+00
3.0470667742591075e+00 3.0018979192706325e+00 2.9574661704151453e+00 2.9137580530522627e+00 2.8707603629438552e+00
2.8284601605189152e+00 2.7868447652620318e+00 2.7459017502243626e+00 2.7056189366531243e+00 2.6659843887374848e+00
2.6269864084689516e+00 2.5886135306124487e+00 2.5508545177868598e+00 2.5136983556521244e+00 2.4771342482006986e+00
2.4411516131510069e+00 2.4057400774406830e+00 2.3708894728175807e+00 2.3365898315265383e+00 2.3028313820887689e+00
2.2696045451740474e+00 2.2368999295609058e+00 2.2047083281853901e+00 2.1730207142748128e+00 2.1418282375653348e+00
2.1111222206016862e+00 2.0808941551166384e+00 2.0511356984892615e+00 2.0218386702793651e+00 1.9929950488372441e+00
1.9645969679867363e+00 1.9366367137799969e+00 1.9091067213223525e+00 1.8819995716660998e+00 1.8553079887710169e+00
1.8290248365311754e+00 1.8031431158652609e+00 1.7776559618705363e+00 1.7525566410377422e+00 1.7278385485262007e+00
1.7034952054980579e+00 1.6795202565098251e+00 1.6559074669601728e+00 1.6326507205929630e+00 1.6097440170540054e+00
1.5871814695006066e+00 1.5649573022624637e+00 1.5430658485530984e+00 1.5215015482308161e+00 1.5002589456071576e+00
1.4793326873036463e+00 1.4587175201534635e+00 1.4384082891492156e+00 1.4183999354343300e+00 1.3986874943378140e+00
1.3792660934511431e+00 1.3601309507466510e+00 1.3412773727360872e+00 1.3227007526689576e+00 1.3043965687692420e+00
1.2863603825102174e+00 1.2685878369261090e+00 1.2510746549598935e+00 1.2338166378466084e+00 1.2168096635312082e+00
1.2000496851203266e+00 1.1835327293670588e+00 1.1672548951882362e+00 1.1512123522134416e+00 1.1354013393647548e+00
1.1198181634671940e+00 1.1044591978884952e+00 1.0893208812080033e+00 1.0743997159140335e+00 1.0596922671287743e+00
1.0451951613605601e+00 1.0309050852825337e+00 1.0168187845373140e+00 1.0029330625671378e+00 9.8924477946872713e-01
9.7575085087259694e-01 9.6244824684604424e-01 9.4933399081931213e-01 9.3640515853477169e-01 9.2365887701803118e-01
9.1109232357100112e-01 8.9870272478628266e-01 8.8648735558209424e-01 8.7444353825798160e-01 8.6256864157006774e-01
8.5086007982605949e-01 8.3931531199913678e-01 8.2793184086057892e-01 8.1670721213066955e-01 8.0563901364725510e-01
7.9472487455206675e-01 7.8396246449372953e-01 7.7334949284779597e-01 7.6288370795296245e-01 7.5256289636327622e-01
7.4238488211596021e-01 7.3234752601463171e-01 7.2244872492728618e-01 7.1268641109915265e-01 7.0305855147956464e-01
6.9356314706317335e-01 6.8419823224459719e-01 6.7496187418651843e-01 6.6585217220099224e-01 6.5686725714346750e-01
6.4800529081937697e-01 6.3926446540306614e-01 6.3064300286859520e-01 6.2213915443241774e-01 6.1375120000748140e-01
6.0547744766850542e-01 5.9731623312840654e-01 5.8926591922531912e-01 5.8132489542033028e-01 5.7349157730523359e-01
5.6576440612064971e-01 5.5814184828379609e-01 5.5062239492602316e-01 5.4320456143964790e-01 5.3588688703414888e-01
5.2866793430138515e-01 5.2154628878946241e-01 5.1452055858552015e-01 5.0758937390678227e-01 5.0075138669987496e-01
4.9400527024841523e-01 4.8734971878830358e-01 4.8078344713093557e-01 4.7430519029390972e-01 4.6791370313911962e-01
4.6160776001828552e-01 4.5538615442535857e-01 4.4924769865602876e-01 4.4319122347399365e-01 4.3721557778390086e-01
4.3131962831075654e-01 4.2550225928575891e-01 4.1976237213834899e-01 4.1409888519439697e-01 4.0851073338028954e-01
4.0299686793291478e-01 3.9755625611540779e-01 3.9218788093843493e-01 3.8689074088692443e-01 3.8166384965228239e-01
3.7650623586976018e-01 3.7141694286095728e-01 3.6639502838144544e-01 3.6143956437320846e-01 3.5654963672189943e-01
3.5172434501901328e-01 3.4696280232829579e-01 3.4226413495707497e-01 3.3762748223177219e-01 3.3305199627774762e-01
3.2853684180349596e-01 3.2408119588894380e-01 3.1968424777773841e-01 3.1534519867361155e-01 3.1106326154055530e-01
3.0683766090688813e-01 3.0266763267296426e-01 2.9855242392259740e-01 2.9449129273803010e-01 2.9048350801842027e-01
2.8652834930171167e-01 2.8262510658997009e-01 2.7877308017785829e-01 2.7497158048439907e-01 2.7121992788793392e-01
2.6751745256412462e-01 2.6386349432690004e-01 2.6025740247248841e-01 2.5669853562631850e-01 2.5318626159266877e-01
2.4971995720718354e-01 2.4629900819206618e-01 2.4292280901402563e-01 2.3959076274464408e-01 2.3630228092351846e-01
2.3305678342376535e-01 2.2985369832002167e-01 2.2669246175884616e-01 2.2357251783148069e-01 2.2049331844890929e-01
2.1745432321916880e-01 2.1445499932688783e-01 2.1149482141498144e-01 2.0857327146848004e-01 2.0568983870040114e-01
2.0284401943976604e-01 2.0003531702142130e-01 1.9726324167804599e-01 1.9452731043391402e-01 1.9182704700056608e-01
1.8916198167437770e-01 1.8653165123588344e-01 1.8393559885088084e-01 1.8137337397327791e-01 1.7884453224959973e-01
1.7634863542523593e-01 1.7388525125224241e-01 1.7145395339876757e-01 1.6905432136008169e-01 1.6668594037109052e-01
1.6434840132036665e-01 1.6204130066570688e-01 1.5976424035106618e-01 1.5751682772493769e-01 1.5529867546015819e-01
1.5310940147503249e-01 1.5094862885580707e-01 1.4881598578045718e-01 1.4671110544379484e-01 1.4463362598375351e-01
1.4258319040899092e-01 1.4055944652768915e-01 1.3856204687748974e-01 1.3659064865666881e-01 1.3464491365640630e-01
1.3272450819420012e-01 1.3082910304837103e-01 1.2895837339364213e-01 1.2711199873781265e-01 1.2528966285941134e-01
1.2349105374641756e-01 1.2171586353596986e-01 1.1996378845505173e-01 1.1823452876211782e-01 1.1652778868972380e-01
1.1484327638801961e-01 1.1318070386919254e-01 1.1153978695277944e-01 1.0992024521187505e-01 1.0832180192018548e-01
1.0674418399992769e-01 1.0518712197055757e-01 1.0365034989832456e-01 1.0213360534659532e-01 1.0063662932698936e-01
9.9159166251264974e-02 9.7700963883974534e-02 9.6261773295835962e-02 9.4841348817873428e-02 9.3439447996227276e-02
9.2055831547688260e-02 9.0690263315935660e-02 8.9342510228411331e-02 8.8012342253891429e-02 8.6699532360706044e-02
8.5403856475584128e-02 8.4125093443141896e-02 8.2863024985984080e-02 8.1617435665412685e-02 8.0388112842733062e-02
7.9174846641143493e-02 7.7977429908209661e-02 7.6795658178889781e-02 7.5629329639115728e-02 7.4478245089953710e-02
7.3342207912248103e-02 7.2221024031827064e-02 7.1114501885225945e-02 7.0022452385910761e-02 6.8944688890991479e-02
6.7881027168450458e-02 6.6831285364849169e-02 6.5795283973477225e-02 6.4772845803028556e-02 6.3763795946680801e-02
6.2767961751651669e-02 6.1785172789201148e-02 6.0815260825057393e-02 5.9858059790287577e-02 5.8913405752569759e-02
5.7981136887894191e-02 5.7061093452682510e-02 5.6153117756271964e-02 5.5257054133826422e-02 5.4372748919636837e-02
5.3500050420772105e-02 5.2638808891131372e-02 5.1788876505864945e-02 5.0950107336147354e-02 5.0122357324306366e-02
4.9305484259319243e-02 4.8499347752635869e-02 4.7703809214351578e-02 4.6918731829721727e-02 4.6143980535982010e-02
4.5379421999521163e-02 4.4624924593352100e-02 4.3880358374905226e-02 4.3145595064128850e-02 4.2420508021892900e-02
4.1704972228691739e-02 4.0998864263647405e-02 4.0302062283785300e-02 3.9614446003616965e-02 3.8935896674993531e-02
3.8266297067221844e-02 3.7605531447481688e-02 3.6953485561492139e-02 3.6310046614435487e-02 3.5675103252157392e-02
3.5048545542616605e-02 3.4430264957581835e-02 3.3820154354582632e-02 3.3218107959093635e-02 3.2624021346983278e-02
3.2037791427166340e-02 3.1459316424514716e-02 3.0888495862994469e-02 3.0325230549015147e-02 2.9769422555015357e-02
2.9220975203265720e-02 2.8679793049885216e-02 2.8145781869070463e-02 2.7618848637539717e-02 2.7098901519172047e-02
2.6585849849867671e-02 2.6079604122596356e-02 2.5580075972643668e-02 2.5087178163056167e-02 2.4600824570288671e-02
2.4120930170012267e-02 2.3647411023137499e-02 2.3180184262011627e-02 2.2719168076792418e-02 2.2264281702001121e-02
2.1815445403263078e-02 2.1372580464206647e-02 2.0935609173537761e-02 2.0504454812290795e-02 2.0079041641240414e-02
1.9659294888467183e-02 1.9245140737102040e-02 1.8836506313223755e-02 1.8433319673904158e-02 1.8035509795416238e-02
1.7643006561603891e-02 1.7255740752380899e-02 1.6873644032391555e-02 1.6496648939823388e-02 1.6124688875347792e-02
1.5757698091213634e-02 1.5395611680482646e-02 1.5038365566394485e-02 1.4685896491875350e-02 1.4338142009180710e-02
1.3995040469664266e-02 1.3656531013687800e-02 1.3322553560652262e-02 1.2993048799157525e-02 1.2667958177290606e-02
1.2347223893038994e-02 1.2030788884814458e-02 1.1718596822117511e-02 1.1410592096299910e-02 1.1106719811460941e-02
1.0806925775450060e-02 1.0511156490982998e-02 1.0219359146882878e-02 9.9314816094114855e-03 9.6474724137328716e-03
9.3672807554677773e-03 9.0908564823645177e-03 8.8181500860711193e-03 8.5491126940134832e-03 8.2836960613733579e-03
8.0218525631707838e-03 7.7635351864465685e-03 7.5086975225370223e-03 7.2572937594544973e-03 7.0092786743605195e-03
6.7646076261301813e-03 6.5232365480138998e-03 6.2851219403949887e-03 6.0502208636273869e-03 5.8184909309735300e-03
5.5898903016277091e-03 5.3643776738254711e-03 5.1419122780385074e-03 4.9224538702609122e-03 4.7059627253757674e-03
4.4923996305976099e-03 4.2817258790122659e-03 4.0739032631877392e-03 3.8688940688609841e-03 3.6666610687164924e-03
3.4671675162341598e-03 3.2703771396105918e-03 3.0762541357672313e-03 2.8847631644254856e-03 2.6958693422570179e-03
2.5095382371091990e-03 2.3257358623008373e-03 2.1444286709895732e-03 1.9655835506104946e-03 1.7891678173820869e-03
1.6151492108847365e-03 1.4434958887007410e-03 1.2741764211267048e-03 1.1071597859496629e-03 9.4241536328815156e-04
7.7991293049733956e-04 6.1962265713921827e-04 4.6151510001329887e-04 3.0556119825198014e-04 1.5173226847375876e-04
0. 0. 0. 0. 0.
0. 5.4383329664155645e-05 9.3944898415945083e-04 4.3251847212615047e-03 1.2334244035325348e-02
2.7137722173468548e-02 5.0697119791449641e-02 8.4607638668976470e-02 1.3001641279549414e-01 1.8759487452762702e-01
2.5754900895683441e-01 3.3965493779430744e-01 4.3331024634064264e-01 5.3759384878832961e-01 6.5132908316254046e-01
7.7314622535699939e-01 9.0154178511424377e-01 1.0349328562818201e+00 1.1717054897399350e+00 1.3102565818166738e+00
1.4490291582473986e+00 1.5865412121263560e+00 1.7214084470448441e+00 1.8523614026473965e+00 1.9782575145276269e+00
2.0980886961566938e+00 2.2109850373516764e+00 2.3162151996095730e+00 2.4131840597491703e+00 2.5014281146549706e+00
2.5806091153285706e+00 2.6505063508648590e+00 2.7110079545661563e+00 2.7621015568249447e+00 2.8038645637913220e+00
2.8364542979766156e+00 2.8600981973448825e+00 2.8750842333755031e+00 2.8817516761559574e+00 2.8804823057701157e+00
2.8716921439699092e+00 2.8558237581894161e+00 2.8333391711552594e+00 2.8047133934346959e+00 2.7704285829676252e+00
2.7309688247181469e+00 2.6868155147671331e+00 2.6384433262347358e+00 2.5863167291097398e+00 2.5308870321738226e+00
2.4725899125317596e+00 2.4118433966060167e+00 2.3490462556752334e+00 2.2845767789603002e+00 2.2187918877813502e+00
2.1520265552815943e+00 2.0845934975626363e+00 2.0167831036919637e+00 1.9488635738636404e+00 1.8810812369508270e+00
1.8136610207193371e+00 1.7468070500507196e+00 1.6807033505858371e+00 1.6155146372447149e+00 1.5513871690559142e+00
1.4884496536383409e+00 1.4268141864958608e+00 1.3665772120042590e+00 1.3078204945836447e+00 1.2506120900523854e+00
1.1950073085502879e+00 1.1410496616995687e+00 1.0887717878420631e+00 1.0381963502565981e+00 9.8933690422003551e-01
9.4219872964247031e-01 8.9677962677415124e-01 8.5307067316958651e-01 8.1105694069385592e-01 7.7071817188505065e-01
7.3202941544290212e-01 6.9496162100761794e-01 6.5948219372701189e-01 6.2555550939233484e-01 5.9314339115629977e-01
5.6220554903693554e-01 5.3269998356387660e-01 5.0458335504023211e-01 4.7781131998032222e-01 4.5233883634534777e-01
4.2812043923464138e-01 4.0511048870905242e-01 3.8326339142174781e-01 3.6253379771729577e-01 3.4287677583286325e-01
3.2424796479760154e-01 3.0660370758054967e-01 2.8990116598452254e-01 2.7409841872609064e-01 2.5915454407883409e-01
2.4502968839369110e-01 2.3168512174254197e-01 2.1908328186436687e-01 2.0718780752542632e-01 1.9596356233750800e-01
1.8537665001230508e-01 1.7539442196444632e-01 1.6598547811304609e-01 1.5711966166996927e-01 1.4876804864444715e-01
1.4090293273673637e-01 1.3349780623990259e-01 1.2652733751724909e-01 1.1996734557434463e-01 1.1379477219856060e-01
1.0798765209582406e-01 1.0252508141368288e-01 9.7387185001678311e-02 9.2555082724584015e-02 8.8010855111109620e-02
8.3737508589961873e-02 7.9718940536826377e-02 7.5939904329596963e-02 7.2385974585237101e-02 6.9043512729294765e-02
6.5899633029043336e-02 6.2942169202580001e-02 6.0159641699440547e-02 5.7541225732930634e-02 5.5076720130546430e-02
5.2756517056398833e-02 5.0571572648238083e-02 4.8513378601664936e-02 4.6573934725081756e-02 4.4745722480991068e-02
4.3021679522073253e-02 4.1395175224364866e-02 3.9859987214311721e-02 3.8410278881708670e-02 3.7040577866510604e-02
3.5745755503880039e-02 3.4521007208912380e-02 3.3361833779917971e-02 3.2264023597108116e-02 3.1223635691821294e-02
3.0236983660070216e-02 2.9300620393215571e-02 2.8411323597772320e-02 2.7566082075896281e-02 2.6762082737777249e-02
2.5996698317105604e-02 2.5267475760840985e-02 2.4572125264713973e-02 2.3908509926274246e-02 2.3274635987705516e-02
2.2668643641204911e-02 2.2088798370316409e-02 2.1533482801290083e-02 2.1001189039288493e-02 2.0490511464994254e-02
2.0000139967999431e-02 1.9528853594166895e-02 1.9075514584991349e-02 1.8639062787818239e-02 1.8218510416650235e-02
1.7812937144080498e-02 1.7421485505751177e-02 1.7043356599549031e-02 1.6677806062561751e-02 1.6324140309613155e-02
1.5981713017976018e-02 1.5649921843605585e-02 1.5328205354974755e-02 1.5016040171312250e-02 1.4712938292708366e-02
1.4418444610242331e-02 1.4132134584901757e-02 1.3853612084676337e-02 1.3582507369821917e-02 1.3318475216818060e-02
1.3061193172097418e-02 1.2810359927147186e-02 1.2565693807050415e-02 1.2326931365025051e-02 1.2093826075940506e-02
1.1866147122233661e-02 1.1643678266026136e-02 1.1426216801644407e-02 1.1213572583084475e-02 1.1005567121320226e-02
1.0802032746662471e-02 1.0602811831688208e-02 1.0407756070544782e-02 1.0216725810699157e-02 1.0029589433467268e-02
9.8462227798860602e-03 9.6665086187306404e-03 9.4903361536790021e-03 9.3176005668363371e-03 9.1482025960089031e-03
8.9820481433065535e-03 8.8190479128032462e-03 8.6591170751522117e-03 8.5021749571883021e-03 8.3481447546937537e-03
8.1969532666261724e-03 8.0485306492223962e-03 7.9028101885199598e-03 7.7597280899136256e-03 7.6192232834934315e-03
7.4812372439735375e-03 7.3457138241272979e-03 7.2125991007052359e-03 7.0818412319012813e-03 6.9533903254870300e-03
6.8271983168139705e-03 6.7032188559211503e-03 6.5814072030662141e-03 6.4617201320263939e-03 6.3441158405819764e-03
6.2285538676237207e-03 6.1149950163802147e-03 6.0034012832899109e-03 5.8937357920846312e-03 5.7859627326801166e-03
5.6800473044990030e-03 5.5759556638887986e-03 5.4736548753111791e-03 5.3731128660109428e-03 5.2742983838981461e-03
5.1771809583849582e-03 5.0817308639591330e-03 4.9879190862693046e-03 4.8957172905357560e-03 4.8050977921015592e-03
4.7160335289582467e-03 4.6284980360953021e-03 4.5424654215287241e-03 4.4579103438822931e-03 4.3748079913988880e-03
4.2931340622749670e-03 4.2128647462132407e-03 4.1339767071033873e-03 4.0564470667446839e-03 3.9802533895282599e-03
3.9053736680121076e-03 3.8317863093158128e-03 3.7594701222811860e-03 3.6884043053326127e-03 3.6185684349951674e-03
3.5499424550168301e-03 3.4825066660512660e-03 3.4162417158645347e-03 3.3511285900229004e-03 3.2871486030347646e-03
3.2242833899080170e-03 3.1625148980992668e-03 3.1018253798278661e-03 3.0421973847258310e-03 2.9836137528083811e-03
2.9260576077371064e-03 2.8695123503632708e-03 2.8139616525287708e-03 2.7593894511106498e-03 2.7057799422959966e-03
2.6531175760685227e-03 2.6013870509009052e-03 2.5505733086344240e-03 2.5006615295404683e-03 2.4516371275501436e-03
2.4034857456453340e-03 2.3561932514012535e-03 2.3097457326723414e-03 2.2641294934160616e-03 2.2193310496436136e-03
2.1753371254977782e-03 2.1321346494441173e-03 2.0897107505768314e-03 2.0480527550303662e-03 2.0071481824917164e-03
1.9669847428123305e-03 1.9275503327108034e-03 1.8888330325659355e-03 1.8508211032951805e-03 1.8135029833145980e-03
1.7768672855772646e-03 1.7409027946878666e-03 1.7055984640891586e-03 1.6709434133182904e-03 1.6369269253308227e-03
1.6035384438881917e-03 1.5707675710093030e-03 1.5386040644797400e-03 1.5070378354209296e-03 1.4760589459142243e-03
1.4456576066784674e-03 1.4158241748004133e-03 1.3865491515145517e-03 1.3578231800324136e-03 1.3296370434173130e-03
1.3019816625059188e-03 1.2748480938728074e-03 1.2482275278369870e-03 1.2221112865106742e-03 1.1964908218862064e-03
1.1713577139624703e-03 1.1467036689077198e-03 1.1225205172586891e-03 1.0988002121543120e-03 1.0755348276031765e-03
1.0527165567835728e-03 1.0303377103750150e-03 1.0083907149206553e-03 9.8686811121878604e-04 9.6576255274356815e-04
9.4506680409354657e-04 9.2477373946662708e-04 9.0487634116191706e-04 8.8536769810608137e-04 8.6624100440530968e-04
8.4748955791986991e-04 8.2910675886310736e-04 8.1108610842155551e-04 7.9342120739794852e-04 7.7610575487466887e-04
7.5913354689786591e-04 7.4249847518158968e-04 7.2619452583109687e-04 7.1021577808524222e-04 6.9455640307671332e-04
6.7921066261025093e-04 6.6417290795844214e-04 6.4943757867335500e-04 6.3499920141575628e-04 6.2085238879914031e-04
6.0699183824991856e-04 5.9341233088238896e-04 5.8010873038847818e-04 5.6707598194186137e-04 5.5430911111587280e-04
5.4180322281523891e-04 5.2955350022104025e-04 5.1755520374872563e-04 5.0580367001857793e-04 4.9429431083891986e-04
4.8302261220136561e-04 4.7198413328763435e-04 4.6117450548847222e-04 4.5058943143359842e-04 4.4022468403297037e-04
4.3007610552883886e-04 4.2013960655883260e-04 4.1041116522908330e-04 4.0088682619821882e-04 3.9156269977118005e-04
3.8243496100300207e-04 3.7349984881274514e-04 3.6475366510662147e-04 3.5619277391102898e-04 3.4781360051482253e-04
3.3961263062063513e-04 3.3158640950565685e-04 3.2373154119109092e-04 3.1604468762060252e-04 3.0852256784754707e-04
3.0116195723081836e-04 2.9395968663908575e-04 2.8691264166377101e-04 2.8001776184017647e-04 2.7327203987681688e-04
2.6667252089326854e-04 2.6021630166557681e-04 2.5390052988028163e-04 2.4772240339593181e-04 2.4167916951265550e-04
2.3576812424967210e-04 2.2998661163024531e-04 2.2433202297460642e-04 2.1880179620031078e-04 2.1339341513026532e-04
2.0810440880823181e-04 2.0293235082175821e-04 1.9787485863260665e-04 1.9292959291436311e-04 1.8809425689761319e-04
1.8336659572205580e-04 1.7874439579616125e-04 1.7422548416372047e-04 1.6980772787763936e-04 1.6548903338088530e-04
1.6126734589430591e-04 1.5714064881157744e-04 1.5310696310104604e-04 1.4916434671449329e-04 1.4531089400280153e-04
1.4154473513841234e-04 1.3786403554466153e-04 1.3426699533172857e-04 1.3075184873951283e-04 1.2731686358694039e-04
1.2396034072819674e-04 1.2068061351527565e-04 1.1747604726729168e-04 1.1434503874632306e-04 1.1128601563955686e-04
1.0829743604811193e-04 1.0537778798212988e-04 1.0252558886227753e-04 9.9739385027582898e-05 9.7017751249615057e-05
9.4359290252773662e-05 9.1762632240957511e-05 8.9226434430383569e-05 8.6749380588361721e-05 8.4330180578390864e-05
8.1967569911181246e-05 7.9660309301724484e-05 7.7407184232279429e-05 7.5207004521348451e-05 7.3058603898526649e-05
7.0960839585107720e-05 6.8912591880629977e-05 6.6912763755002085e-05 6.4960280446513426e-05 6.3054089065330086e-05
6.1193158202771814e-05 5.9376477546041213e-05 5.7603057498502742e-05 5.5871928805544500e-05 5.4182142185708361e-05
5.2532767967318744e-05 5.0922895730446966e-05 4.9351633954125953e-05 4.7818109668823321e-05 4.6321468114150300e-05
4.4860872401664663e-05 4.3435503182825573e-05 4.2044558321957873e-05 4.0687252574273750e-05 3.9362817268785450e-05
3.8070499996214428e-05 3.6809564301621984e-05 3.5579289382025496e-05 3.4378969788611451e-05 3.3207915133769052e-05
3.2065449802711312e-05 3.0950912669766876e-05 2.9863656819185611e-05 2.8803049270468119e-05 2.7768470708167169e-05
2.6759315216115260e-05 2.5774990015931323e-05 2.4814915209964844e-05 2.3878523528387922e-05 2.2965260080560611e-05
2.2074582110528148e-05 2.1205958756658535e-05 2.0358870815317476e-05 1.9532810508535560e-05 1.8727281255713447e-05
1.7941797449145505e-05 1.7175884233475961e-05 1.6429077288930018e-05 1.5700922618341645e-05 1.4990976337865471e-05
1.4298804471386687e-05 1.3623982748522034e-05 1.2966096406226424e-05 1.2324739993882115e-05 1.1699517181902770e-05
1.1090040573734860e-05 1.0495931521266495e-05 9.9168199435395021e-06 9.3523441487842465e-06 8.8021506596591475e-06
8.2658940417265321e-06 7.7432367350197678e-06 7.2338488887770244e-06 6.7374081991923703e-06 6.2535997501888662e-06
5.7821158571569505e-06 5.3226559136389283e-06 4.8749262408651290e-06 4.4386399401326240e-06 4.0135167480073166e-06
3.5992828942305738e-06 3.1956709623667747e-06 2.8024197531120341e-06 2.4192741502208947e-06 2.0459849890155880e-06
1.6823089274468580e-06 1.3280083196495871e-06 9.8285109196557868e-07 6.4661062138351467e-07 3.1906561636122974e-07
0. 0. 0. 0. 0.

View File

@ -1,303 +0,0 @@
Cu functions (universal 4) - JB Adams et al J. Mater. Res., 4(1), 102 (1989)
29 63.550 3.6150 FCC
500 5.0100200400801306e-04 500 1.0000000000000009e-02 4.9499999999999886e+00
0. -3.1589719908208558e-01 -5.2405175291223927e-01 -6.9885553834123115e-01 -8.5420409172727574e-01
-9.9627285782417374e-01 -1.1284756487892835e+00 -1.2529454148045645e+00 -1.3711252149943363e+00 -1.4840478277127076e+00
-1.5924840805403662e+00 -1.6954285804552853e+00 -1.7942937845174001e+00 -1.8901318213864968e+00 -1.9832501645057476e+00
-2.0739063371790252e+00 -2.1623187777983759e+00 -2.2486748239473968e+00 -2.3331367007241965e+00 -2.4158460932269890e+00
-2.4969276936450484e+00 -2.5764919917168783e+00 -2.6546374977553171e+00 -2.7314525337618534e+00 -2.8070166913917660e+00
-2.8814020298474361e+00 -2.9546740684873072e+00 -3.0268926157701515e+00 -3.0981124665488835e+00 -3.1683839923973949e+00
-3.2377536446699082e+00 -3.3062643854300688e+00 -3.3739560586949437e+00 -3.4408657118035535e+00 -3.5070278749074930e+00
-3.5724748051301987e+00 -3.6372367006631805e+00 -3.7013418893374563e+00 -3.7648169952241943e+00 -3.8276870863304424e+00
-3.8899758061050136e+00 -3.9517054906525857e+00 -4.0128972737054482e+00 -4.0735711808432313e+00 -4.1324020819066334e+00
-4.1903921077370399e+00 -4.2479051536742531e+00 -4.3049572584920952e+00 -4.3615637243846948e+00 -4.4177391662932166e+00
-4.4734975570518145e+00 -4.5288522686579995e+00 -4.5838161101054880e+00 -4.6384013621277234e+00 -4.6926198091528875e+00
-4.7464827687459206e+00 -4.8000011187718314e+00 -4.8531853225280486e+00 -4.9060454519053280e+00 -4.9585912090057604e+00
-5.0108319460781274e+00 -5.0627766840981678e+00 -5.1144341300432075e+00 -5.1658126929883963e+00 -5.2169204991179470e+00
-5.2677654057399081e+00 -5.3183550143895957e+00 -5.3686966830900360e+00 -5.4187975378393958e+00 -5.4686644833797118e+00
-5.5183042133078573e+00 -5.5677232195759814e+00 -5.6169278014231168e+00 -5.6659240737851633e+00 -5.7147179752168427e+00
-5.7633152753617765e+00 -5.8117215820040258e+00 -5.8599423477251662e+00 -5.9079828761981332e+00 -5.9558483281427925e+00
-6.0035437269616239e+00 -6.0510739641062798e+00 -6.0984438040355258e+00 -6.1456578891786364e+00 -6.1927207443901580e+00
-6.2396367812953599e+00 -6.2864103024253382e+00 -6.3330455051271599e+00 -6.3795464852936732e+00 -6.4259172409138614e+00
-6.4721616754587501e+00 -6.5182836011053098e+00 -6.5642867418169999e+00 -6.6101747362826870e+00 -6.6559511407215268e+00
-6.7016194315664848e+00 -6.7471830080289408e+00 -6.7926321787266488e+00 -6.8376565597087335e+00 -6.8825828851093718e+00
-6.9274142262937062e+00 -6.9721535889762833e+00 -7.0168039151812138e+00 -7.0613680851242009e+00 -7.1058489190182854e+00
-7.1502491788078260e+00 -7.1945715698338404e+00 -7.2388187424386103e+00 -7.2829932935017325e+00 -7.3270977679243288e+00
-7.3711346600521210e+00 -7.4151064150498769e+00 -7.4590154302223652e+00 -7.5028640562861142e+00 -7.5466545985988489e+00
-7.5903893183397599e+00 -7.6340704336532212e+00 -7.6777001207475166e+00 -7.7212805149568453e+00 -7.7648137117691078e+00
-7.8083017678126225e+00 -7.8517467018166371e+00 -7.8951504955327323e+00 -7.9385150946301337e+00 -7.9818424095582259e+00
-8.0251343163853335e+00 -8.0683926576019758e+00 -8.1116192429086027e+00 -8.1548158499697934e+00 -8.1979842251487867e+00
-8.2411260842174556e+00 -8.2842431130446244e+00 -8.3273369682627845e+00 -8.3704092779143480e+00 -8.4134616420805628e+00
-8.4564956334858721e+00 -8.4995127980902225e+00 -8.5425146556591471e+00 -8.5855027003218538e+00 -8.6284784011075430e+00
-8.6714432024708685e+00 -8.7143985247980709e+00 -8.7573457649043576e+00 -8.8002862965079203e+00 -8.8432214707042931e+00
-8.8861526164113798e+00 -8.9290810408153902e+00 -8.9720080297988716e+00 -9.0149348483554377e+00 -9.0578627409975070e+00
-9.1007929321486927e+00 -9.1437266265307358e+00 -9.1866650095359432e+00 -9.2296092475897353e+00 -9.2725604885091570e+00
-9.3155198618426311e+00 -9.3584884792123262e+00 -9.4014674346366292e+00 -9.4444578048540961e+00 -9.4874606496305205e+00
-9.5304770120671378e+00 -9.5735079188909253e+00 -9.6165543807549625e+00 -9.6596173924971254e+00 -9.7026979334374914e+00
-9.7457969676344760e+00 -9.7889154441094774e+00 -9.8320542972711564e+00 -9.8749024758928954e+00 -9.9176676449848173e+00
-9.9604518241702067e+00 -1.0003255855809869e+01 -1.0046080568707225e+01 -1.0088926778514065e+01 -1.0131795287896693e+01
-1.0174686886752681e+01 -1.0217602352416293e+01 -1.0260542449857894e+01 -1.0303507931886486e+01 -1.0346499539340471e+01
-1.0389518001277565e+01 -1.0432564035159942e+01 -1.0475638347037830e+01 -1.0518741631724708e+01 -1.0561874572971817e+01
-1.0605037843637035e+01 -1.0648232105854731e+01 -1.0691458011195323e+01 -1.0734716200826654e+01 -1.0778007305671338e+01
-1.0821331946557279e+01 -1.0864690734371720e+01 -1.0908084270204256e+01 -1.0951513145493493e+01 -1.0994977942169044e+01
-1.1038479232788461e+01 -1.1082017580672300e+01 -1.1125593540039802e+01 -1.1169207656138724e+01 -1.1212860465371193e+01
-1.1256552495422056e+01 -1.1300284265381379e+01 -1.1344056285864497e+01 -1.1387869059131503e+01 -1.1431723079201220e+01
-1.1475618831971758e+01 -1.1519556795323240e+01 -1.1563537439236882e+01 -1.1607561225897086e+01 -1.1651628609799957e+01
-1.1695740037856638e+01 -1.1739895949496542e+01 -1.1784096776765693e+01 -1.1828342944444898e+01 -1.1872634870038326e+01
-1.1916972964140655e+01 -1.1961357630185660e+01 -1.2005789264757027e+01 -1.2050268257623998e+01 -1.2094794991829531e+01
-1.2139369843773920e+01 -1.2183993183309894e+01 -1.2228665373815033e+01 -1.2273386772283743e+01 -1.2318157729378811e+01
-1.2362978589648264e+01 -1.2407849691329261e+01 -1.2452771366701370e+01 -1.2497743942026659e+01 -1.2542767737650308e+01
-1.2587843068072686e+01 -1.2632970242029501e+01 -1.2678149562554552e+01 -1.2723381327055449e+01 -1.2768665827383359e+01
-1.2814003349896723e+01 -1.2859394175532202e+01 -1.2904838579871580e+01 -1.2950336833201561e+01 -1.2995889200586475e+01
-1.3041495941923358e+01 -1.3087157312007946e+01 -1.3132873560597147e+01 -1.3178644932465090e+01 -1.3224471667467185e+01
-1.3270354000597138e+01 -1.3316292162042373e+01 -1.3362286377246335e+01 -1.3408336866957768e+01 -1.3454443847292225e+01
-1.3500607529781746e+01 -1.3546828121432384e+01 -1.3593105824775080e+01 -1.3639440837916936e+01 -1.3685833354596070e+01
-1.3732283564231011e+01 -1.3778791651965662e+01 -1.3825357798727850e+01 -1.3871982181271051e+01 -1.3918664972226225e+01
-1.3965406340150423e+01 -1.4012206449566122e+01 -1.4059065461010562e+01 -1.4105983531084178e+01 -1.4152960812497383e+01
-1.4199997454109450e+01 -1.4247093600900882e+01 -1.4294249394311976e+01 -1.4341464971842868e+01 -1.4388740467389482e+01
-1.4436076011212833e+01 -1.4483471729977452e+01 -1.4530927746798284e+01 -1.4578444181270356e+01 -1.4626021149517328e+01
-1.4673658764226502e+01 -1.4721357134688901e+01 -1.4769116366835874e+01 -1.4816936563275590e+01 -1.4864817823335784e+01
-1.4912760243093658e+01 -1.4960763915414873e+01 -1.5008828929991182e+01 -1.5056955373373171e+01 -1.5105143329006637e+01
-1.5153392877265333e+01 -1.5201704095489163e+01 -1.5250077058012721e+01 -1.5298511836202465e+01 -1.5347008498487980e+01
-1.5395567110395177e+01 -1.5444187734579373e+01 -1.5492870430854509e+01 -1.5541615256228738e+01 -1.5590422264928975e+01
-1.5639291508440465e+01 -1.5688223035530768e+01 -1.5737216892280117e+01 -1.5786273122116540e+01 -1.5835391765839859e+01
-1.5884572861650554e+01 -1.5933816445184789e+01 -1.5983122549535665e+01 -1.6032491205288238e+01 -1.6081922440538847e+01
-1.6131416280932740e+01 -1.6180972749683292e+01 -1.6230591867602584e+01 -1.6280273653129598e+01 -1.6330018122352271e+01
-1.6379825289037512e+01 -1.6429695164657005e+01 -1.6479627758410516e+01 -1.6529623077253063e+01 -1.6579681125920047e+01
-1.6629801906948160e+01 -1.6679985420709272e+01 -1.6730231665424185e+01 -1.6780540637196850e+01 -1.6830912330026536e+01
-1.6881346735842499e+01 -1.6931843844523655e+01 -1.6982403643917451e+01 -1.7033026119869078e+01 -1.7083711256241486e+01
-1.7134459034938232e+01 -1.7185269435924170e+01 -1.7236142437252170e+01 -1.7287078015076759e+01 -1.7338076143685498e+01
-1.7389136795514560e+01 -1.7440259941169757e+01 -1.7491445549452351e+01 -1.7542693587371559e+01 -1.7594004020176158e+01
-1.7645376811364258e+01 -1.7696811922712527e+01 -1.7748309314288690e+01 -1.7799868944476657e+01 -1.7851490769996190e+01
-1.7903174745917113e+01 -1.7954920825684781e+01 -1.8006728961136105e+01 -1.8058599102520361e+01 -1.8110531198513968e+01
-1.8162525196246406e+01 -1.8214581041310112e+01 -1.8266698677789350e+01 -1.8318878048276588e+01 -1.8371119093861239e+01
-1.8423421754189917e+01 -1.8475785967356501e+01 -1.8528211670354381e+01 -1.8580698798442995e+01 -1.8633247285599964e+01
-1.8685857064432412e+01 -1.8738528066186518e+01 -1.8791260220768891e+01 -1.8844053456762026e+01 -1.8896907701446821e+01
-1.8949822880805868e+01 -1.9002798919552447e+01 -1.9055835741138708e+01 -1.9108933267775342e+01 -1.9162091420446473e+01
-1.9215310118921138e+01 -1.9268589281777849e+01 -1.9321928826411295e+01 -1.9375328669053715e+01 -1.9428788724780020e+01
-1.9482308907539277e+01 -1.9535889130155283e+01 -1.9589529304345319e+01 -1.9643229340734365e+01 -1.9696989148876355e+01
-1.9750808637254977e+01 -1.9804687713312433e+01 -1.9858626283450008e+01 -1.9912624253055810e+01 -1.9966681526506250e+01
-2.0020798007185476e+01 -2.0074973597498911e+01 -2.0129208198888136e+01 -2.0183501711838630e+01 -2.0237854035899204e+01
-2.0292265069692348e+01 -2.0346734710925716e+01 -2.0401262856409858e+01 -2.0455849402064473e+01 -2.0510494242935920e+01
-2.0565197273209719e+01 -2.0619958386219423e+01 -2.0674777474463212e+01 -2.0729654429611969e+01 -2.0784589142526556e+01
-2.0839581503263844e+01 -2.0894631401093307e+01 -2.0949738724508393e+01 -2.1004903361235051e+01 -2.1060125198247988e+01
-2.1115404121775100e+01 -2.1170740017318053e+01 -2.1226132769657397e+01 -2.1281582262894972e+01 -2.1337088380382852e+01
-2.1392651004661843e+01 -2.1448270018015251e+01 -2.1503945301662043e+01 -2.1559676736299366e+01 -2.1615464201984196e+01
-2.1671307578140954e+01 -2.1727206743568217e+01 -2.1783161576455427e+01 -2.1839171954393919e+01 -2.1895237754379082e+01
-2.1951358852829799e+01 -2.2007535125591289e+01 -2.2063766447950343e+01 -2.2120052694642595e+01 -2.2176393739860259e+01
-2.2232789457272474e+01 -2.2289239719961643e+01 -2.2345744400729018e+01 -2.2402303371517178e+01 -2.2458916504038370e+01
-2.2515583669430725e+01 -2.2572304738325670e+01 -2.2629079581086330e+01 -2.2685908067306855e+01 -2.2742790066346515e+01
-2.2799725447076526e+01 -2.2856714077931429e+01 -2.2913755826927172e+01 -2.2970850561669295e+01 -2.3027998149359064e+01
-2.3085198456800299e+01 -2.3142451350411534e+01 -2.3199756696229770e+01 -2.3257114359926845e+01 -2.3314524206806482e+01
-2.3371986101824632e+01 -2.3429499909581864e+01 -2.3487065494349963e+01 -2.3544682720213359e+01 -2.3602351450489550e+01
-2.3660071548650194e+01 -2.3717842877714475e+01 -2.3775665300345281e+01 -2.3833538678952209e+01 -2.3891462875653588e+01
-2.3949437752298763e+01 -2.4007463170460369e+01 -2.4065538991453877e+01 -2.4123665076338057e+01 -2.4181841285923610e+01
-2.4240067480782272e+01 -2.4298343521253173e+01 -2.4356669267446705e+01 -2.4415044579252253e+01 -2.4473469316351384e+01
-2.4531943338216706e+01 -2.4590466504118467e+01 -2.4649038673143195e+01 -2.4707659704179378e+01 -2.4766329455946106e+01
-2.4825047786983760e+01 -2.4883814555664912e+01 -2.4942629620207981e+01 -2.5001492838670174e+01 -2.5060404068966136e+01
-2.5119363168864538e+01 -2.5178369996001948e+01 -2.5237424407882145e+01 -2.5296526261886129e+01 -2.5355675415276437e+01
-2.5414871725205558e+01 -2.5474115048716612e+01 -2.5533405242759045e+01 -2.5592742164177480e+01 -2.5652125669732186e+01
-2.5711555616102487e+01 -2.5771031859886762e+01 -2.5830554257610174e+01 -2.5890122665731269e+01 -2.5949736940650155e+01
-2.6009396938703958e+01 -2.6069102516181829e+01 -2.6128853529326307e+01 -2.6188649834339685e+01 -2.6248491287389243e+01
-2.6308377744605878e+01 -2.6368309062100934e+01 -2.6428285095962565e+01 -2.6488305702088610e+01 -2.6548370736885317e+01
-2.6608480056235521e+01 -2.6668633516188947e+01 -2.6728830972768947e+01 -2.6789072282060033e+01 -2.6849357300140582e+01
1.0000000000000000e+01 1.0801250630455797e+01 1.0617301586939504e+01 1.0436833263885262e+01 1.0259774441010791e+01
1.0086055417347552e+01 9.9156079783837754e+00 9.7483653639170598e+00 9.5842622365983630e+00 9.4232346511569745e+00
9.2652200242883396e+00 9.1101571051919450e+00 8.9579859467472716e+00 8.8086478773091699e+00 8.6620854731154395e+00
8.5182425312888768e+00 8.3770640434238430e+00 8.2384961697429731e+00 8.1024862138128810e+00 7.9689825978078090e+00
7.8379348383064951e+00 7.7092935226140753e+00 7.5830102855955488e+00 7.4590377870116811e+00 7.3373296893445286e+00
7.2178406361032330e+00 7.1005262306008490e+00 6.9853430151894997e+00 6.8722484509459889e+00 6.7612008977976643e+00
6.6521595950793255e+00 6.5450846425111706e+00 6.4399369815891703e+00 6.3366783773799114e+00 6.2352714007088537e+00
6.1356794107365431e+00 6.0378665379119241e+00 5.9417976672963562e+00 5.8474384222482740e+00 5.7547551484639143e+00
5.6637148983634233e+00 5.5742854158160355e+00 5.4864351211977294e+00 5.4001330967728620e+00 5.3153490723937580e+00
5.2320534115112594e+00 5.1502170974889339e+00 5.0698117202151138e+00 4.9908094630057178e+00 4.9131830897922271e+00
4.8369059325884791e+00 4.7619518792290876e+00 4.6882953613761629e+00 4.6159113427851821e+00 4.5447753078280471e+00
4.4748632502644341e+00 4.4061516622586225e+00 4.3386175236344116e+00 4.2722382913651700e+00 4.2069918892916007e+00
4.1428566980642358e+00 4.0798115453044659e+00 4.0178356959802954e+00 3.9569088429914245e+00 3.8970110979596200e+00
3.8381229822198435e+00 3.7802254180071628e+00 3.7232997198366320e+00 3.6673275860703995e+00 3.6122910906684780e+00
3.5581726751199767e+00 3.5049551405497539e+00 3.4526216399971048e+00 3.4011556708634600e+00 3.3505410675235936e+00
3.3007619940993322e+00 3.2518029373895416e+00 3.2036486999552665e+00 3.1562843933552358e+00 3.1096954315288059e+00
3.0638675243237259e+00 3.0187866711643210e+00 2.9744391548584872e+00 2.9308115355394193e+00 2.8878906447394712e+00
2.8456635795935767e+00 2.8041176971686212e+00 2.7632406089169876e+00 2.7230201752508236e+00 2.6834445002347138e+00
2.6445019263941703e+00 2.6061810296373835e+00 2.5684706142875200e+00 2.5313597082236612e+00 2.4948375581276281e+00
2.4588936248343458e+00 2.4235175787838017e+00 2.3886992955719677e+00 2.3544288515990814e+00 2.3206965198123726e+00
2.2874927655419270e+00 2.2548082424273730e+00 2.2226337884330718e+00 2.1909604219504502e+00 2.1597793379851566e+00
2.1290819044273803e+00 2.0988596584032564e+00 2.0691043027062364e+00 2.0398077023055379e+00 2.0109618809312195e+00
1.9825590177335428e+00 1.9545914440145751e+00 1.9270516400317490e+00 1.8999322318704799e+00 1.8732259883849522e+00
1.8469258182056052e+00 1.8210247668116040e+00 1.7955160136671395e+00 1.7703928694199718e+00 1.7456487731607453e+00
1.7212772897420763e+00 1.6972721071559391e+00 1.6736270339679251e+00 1.6503359968072218e+00 1.6273930379113750e+00
1.6047923127241077e+00 1.5825280875452847e+00 1.5605947372321296e+00 1.5389867429502644e+00 1.5176986899731588e+00
1.4967252655299603e+00 1.4760612566992890e+00 1.4557015483494027e+00 1.4356411211222593e+00 1.4158750494618957e+00
1.3963984996850130e+00 1.3772067280935900e+00 1.3582950791282968e+00 1.3396589835618542e+00 1.3212939567312461e+00
1.3031955968085143e+00 1.2853595831085869e+00 1.2677816744339125e+00 1.2504577074545153e+00 1.2333835951233212e+00
1.2165553251254906e+00 1.1999689583611897e+00 1.1836206274610817e+00 1.1675065353339207e+00 1.1516229537450258e+00
1.1359662219258198e+00 1.1205327452130547e+00 1.1053189937170771e+00 1.0903215010191687e+00 1.0755368628964561e+00
1.0609617360743258e+00 1.0465928370056616e+00 1.0324269406761744e+00 1.0184608794353309e+00 1.0046915418523596e+00
9.9111587159659820e-01 9.7773086634202144e-01 9.6453357669496853e-01 9.5152110514480626e-01 9.3869060503712376e-01
9.2603927956864140e-01 9.1356438080370239e-01 9.0126320871155485e-01 8.8913311022427521e-01 8.7717147831462938e-01
8.6537575109353426e-01 8.5374341092675010e-01 8.4227198357020328e-01 8.3095903732382581e-01 8.1980218220310874e-01
8.0879906912829824e-01 7.9794738913080465e-01 7.8724487257618136e-01 7.7668928840378371e-01 7.6627844338220541e-01
7.5601018138057441e-01 7.4588238265517859e-01 7.3589296315095609e-01 7.2603987381787150e-01 7.1632109994155613e-01
7.0673466048788924e-01 6.9727860746149162e-01 6.8795102527759866e-01 6.7875003014695068e-01 6.6967376947370028e-01
6.6072042126578623e-01 6.5188819355765304e-01 6.4317532384495735e-01 6.3458007853101606e-01 6.2610075238490026e-01
6.1773566801053903e-01 6.0948317532703555e-01 6.0134165105970538e-01 5.9330949824153834e-01 5.8538514572508404e-01
5.7756704770434197e-01 5.6985368324655994e-01 5.6224355583360364e-01 5.5473519291279416e-01 5.4732714545698968e-01
5.4001798753364838e-01 5.3280631588273764e-01 5.2569074950327632e-01 5.1866992924830413e-01 5.1174251742814292e-01
5.0490719742172274e-01 4.9816267329578068e-01 4.9150766943189694e-01 4.8494093016099704e-01 4.7846121940521869e-01
4.7206732032718257e-01 4.6575803498626733e-01 4.5953218400168616e-01 4.5338860622255339e-01 4.4732615840449164e-01
4.4134371489266400e-01 4.3544016731128998e-01 4.2961442425926322e-01 4.2386541101190822e-01 4.1819206922867025e-01
4.1259335666658892e-01 4.0706824689955035e-01 4.0161572904300691e-01 3.9623480748423034e-01 3.9092450161784775e-01
3.8568384558664448e-01 3.8051188802739233e-01 3.7540769182181144e-01 3.7037033385227858e-01 3.6539890476236891e-01
3.6049250872219041e-01 3.5565026319808801e-01 3.5087129872705347e-01 3.4615475869537526e-01 3.4149979912160333e-01
3.3690558844382679e-01 3.3237130731094133e-01 3.2789614837797743e-01 3.2347931610542879e-01 3.1912002656234151e-01
3.1481750723327195e-01 3.1057099682888989e-01 3.0637974510021770e-01 3.0224301265637266e-01 2.9816007078585649e-01
2.9413020128113843e-01 2.9015269626666473e-01 2.8622685803004089e-01 2.8235199885637741e-01 2.7852744086590420e-01
2.7475251585446614e-01 2.7102656513704559e-01 2.6734893939429405e-01 2.6371899852185088e-01 2.6013611148246873e-01
2.5659965616091007e-01 2.5310901922152595e-01 2.4966359596849230e-01 2.4626279020852770e-01 2.4290601411627044e-01
2.3959268810202516e-01 2.3632224068197960e-01 2.3309410835077227e-01 2.2990773545640408e-01 2.2676257407740064e-01
2.2365808390219311e-01 2.2059373211072231e-01 2.1756899325815038e-01 2.1458334916067301e-01 2.1163628878335583e-01
2.0872730813005891e-01 2.0585591013519533e-01 2.0302160455757168e-01 2.0022390787598532e-01 1.9746234318676770e-01
1.9473644010305513e-01 1.9204573465593366e-01 1.8938976919722261e-01 1.8676809230404867e-01 1.8418025868501697e-01
1.8162582908807767e-01 1.7910437020998504e-01 1.7661545460728956e-01 1.7415866060892160e-01 1.7173357223026997e-01
1.6933977908869480e-01 1.6697687632057967e-01 1.6464446449973735e-01 1.6234214955720816e-01 1.6006954270248031e-01
1.5782626034600167e-01 1.5561192402300605e-01 1.5342616031865575e-01 1.5126860079441595e-01 1.4913888191569047e-01
1.4703664498065105e-01 1.4496153605028095e-01 1.4291320587954814e-01 1.4089130984976528e-01 1.3889550790202332e-01
1.3692546447178433e-01 1.3498084842450275e-01 1.3306133299233025e-01 1.3116659571184286e-01 1.2929631836281086e-01
1.2745018690792786e-01 1.2562789143357200e-01 1.2382912609148811e-01 1.2205358904140517e-01 1.2030098239464682e-01
1.1857101215854371e-01 1.1686338818183373e-01 1.1517782410088362e-01 1.1351403728677267e-01 1.1187174879324324e-01
1.1025068330544618e-01 1.0865056908951143e-01 1.0707113794291789e-01 1.0551212514563968e-01 1.0397326941204543e-01
1.0245431284357487e-01 1.0095500088214582e-01 9.9475082264262937e-02 9.8014308975870268e-02 9.6572436207889467e-02
9.5149222312433945e-02 9.3744428759716225e-02 9.2357820095600562e-02 9.0989163899807046e-02 8.9638230744780500e-02
8.8304794155128263e-02 8.6988630567732095e-02 8.5689519292436067e-02 8.4407242473327759e-02 8.3141585050604316e-02
8.1892334723027815e-02 8.0659281910912206e-02 7.9442219719687568e-02 7.8240943904000826e-02 7.7055252832346710e-02
7.5884947452225848e-02 7.4729831255814450e-02 7.3589710246154016e-02 7.2464392903814900e-02 7.1353690154065674e-02
7.0257415334517681e-02 6.9175384163253639e-02 6.8107414707390124e-02 6.7053327352138758e-02 6.6012944770277748e-02
6.4986091892085707e-02 6.3972595875702698e-02 6.2972286077917161e-02 6.1984994025379159e-02 6.1010553386208866e-02
6.0048799942037157e-02 5.9099571560412123e-02 5.8162708167624810e-02 5.7238051721903105e-02 5.6325446186999528e-02
5.5424737506136967e-02 5.4535773576326108e-02 5.3658404223060785e-02 5.2792481175329975e-02 5.1937858041017471e-02
5.1094390282629742e-02 5.0261935193351093e-02 4.9440351873451860e-02 4.8629501207008818e-02 4.7829245838954204e-02
4.7039450152438045e-02 4.6259980246510013e-02 4.5490703914087494e-02 4.4731490620259606e-02 4.3982211480854794e-02
4.3242739241325268e-02 4.2512948255901239e-02 4.1792714467042247e-02 4.1081915385161816e-02 4.0380430068628126e-02
3.9688139104032016e-02 3.9004924586723888e-02 3.8330670101623943e-02 3.7665260704261794e-02 3.7008582902100740e-02
3.6360524636098734e-02 3.5720975262514720e-02 3.5089825534961649e-02 3.4466967586696873e-02 3.3852294913154113e-02
3.3245702354694373e-02 3.2647086079598653e-02 3.2056343567280043e-02 3.1473373591718756e-02 3.0898076205114089e-02
3.0330352721755771e-02 2.9770105702100036e-02 2.9217238937061962e-02 2.8671657432515429e-02 2.8133267393987360e-02
2.7601976211552692e-02 2.7077692444942292e-02 2.6560325808820506e-02 2.6049787158272331e-02 2.5545988474477754e-02
2.5048842850559749e-02 2.4558264477628988e-02 2.4074168631003978e-02 2.3596471656606832e-02 2.3125090957536454e-02
2.2659944980823798e-02 2.2200953204335683e-02 2.1748036123873327e-02 2.1301115240412782e-02 2.0860113047532769e-02
2.0424953018977066e-02 1.9995559596398205e-02 1.9571858177249157e-02 1.9153775102828896e-02 1.8741237646474174e-02
1.8334174001923720e-02 1.7932513271801676e-02 1.7536185456265119e-02 1.7145121441799471e-02 1.6759252990136364e-02
1.6378512727332817e-02 1.6002834132978205e-02 1.5632151529535232e-02 1.5266400071822006e-02 1.4905515736628239e-02
1.4549435312452008e-02 1.4198096389378967e-02 1.3851437349077567e-02 1.3509397354926733e-02 1.3171916342264223e-02
1.2838935008766206e-02 1.2510394804928215e-02 1.2186237924689647e-02 1.1866407296154180e-02 1.1550846572444651e-02
1.1239500122655843e-02 1.0932313022929629e-02 1.0629231047647014e-02 1.0330200660712663e-02 1.0035169006965661e-02
9.7440839036889715e-03 9.4568938322237006e-03 9.1735479296908284e-03 8.8939959808188584e-03 8.6181884098633921e-03
8.3460762726380588e-03 8.0776112486364848e-03 7.8127456332576228e-03 7.5514323301215658e-03 7.2936248434884998e-03
7.0392772707632556e-03 6.7883442950981143e-03 6.5407811780883174e-03 6.2965437525517309e-03 6.0555884154033235e-03
5.8178721206175732e-03 5.5833523722726985e-03 5.3519872176873706e-03 5.1237352406410253e-03 4.8985555546731119e-03
4.6764077964680517e-03 4.4572521193242398e-03 4.2410491867018174e-03 4.0277601658472717e-03 3.8173467214993595e-03
3.6097710096757996e-03 3.4049956715283547e-03 3.2029838272795708e-03 3.0036990702375643e-03 2.8071054608720947e-03
2.6131675209760674e-03 2.4218502278956500e-03 2.2331190088270558e-03 2.0469397351849938e-03 1.8632787170468346e-03
1.6821026976564513e-03 1.5033788479984489e-03 1.3270747614446132e-03 1.1531584484569257e-03 9.8159833136179930e-04
8.1236323918953968e-04 6.4542240257081662e-04 4.8074544870146951e-04 3.1830239637042901e-04 1.5806365104042985e-04
0. 0. 0. 0. 0.
0. 5.4383329664155645e-05 9.3944898415945083e-04 4.3251847212615047e-03 1.2334244035325348e-02
2.7137722173468548e-02 5.0697119791449641e-02 8.4607638668976470e-02 1.3001641279549414e-01 1.8759487452762702e-01
2.5754900895683441e-01 3.3965493779430744e-01 4.3331024634064264e-01 5.3759384878832961e-01 6.5132908316254046e-01
7.7314622535699939e-01 9.0154178511424377e-01 1.0349328562818201e+00 1.1717054897399350e+00 1.3102565818166738e+00
1.4490291582473986e+00 1.5865412121263560e+00 1.7214084470448441e+00 1.8523614026473965e+00 1.9782575145276269e+00
2.0980886961566938e+00 2.2109850373516764e+00 2.3162151996095730e+00 2.4131840597491703e+00 2.5014281146549706e+00
2.5806091153285706e+00 2.6505063508648590e+00 2.7110079545661563e+00 2.7621015568249447e+00 2.8038645637913220e+00
2.8364542979766156e+00 2.8600981973448825e+00 2.8750842333755031e+00 2.8817516761559574e+00 2.8804823057701157e+00
2.8716921439699092e+00 2.8558237581894161e+00 2.8333391711552594e+00 2.8047133934346959e+00 2.7704285829676252e+00
2.7309688247181469e+00 2.6868155147671331e+00 2.6384433262347358e+00 2.5863167291097398e+00 2.5308870321738226e+00
2.4725899125317596e+00 2.4118433966060167e+00 2.3490462556752334e+00 2.2845767789603002e+00 2.2187918877813502e+00
2.1520265552815943e+00 2.0845934975626363e+00 2.0167831036919637e+00 1.9488635738636404e+00 1.8810812369508270e+00
1.8136610207193371e+00 1.7468070500507196e+00 1.6807033505858371e+00 1.6155146372447149e+00 1.5513871690559142e+00
1.4884496536383409e+00 1.4268141864958608e+00 1.3665772120042590e+00 1.3078204945836447e+00 1.2506120900523854e+00
1.1950073085502879e+00 1.1410496616995687e+00 1.0887717878420631e+00 1.0381963502565981e+00 9.8933690422003551e-01
9.4219872964247031e-01 8.9677962677415124e-01 8.5307067316958651e-01 8.1105694069385592e-01 7.7071817188505065e-01
7.3202941544290212e-01 6.9496162100761794e-01 6.5948219372701189e-01 6.2555550939233484e-01 5.9314339115629977e-01
5.6220554903693554e-01 5.3269998356387660e-01 5.0458335504023211e-01 4.7781131998032222e-01 4.5233883634534777e-01
4.2812043923464138e-01 4.0511048870905242e-01 3.8326339142174781e-01 3.6253379771729577e-01 3.4287677583286325e-01
3.2424796479760154e-01 3.0660370758054967e-01 2.8990116598452254e-01 2.7409841872609064e-01 2.5915454407883409e-01
2.4502968839369110e-01 2.3168512174254197e-01 2.1908328186436687e-01 2.0718780752542632e-01 1.9596356233750800e-01
1.8537665001230508e-01 1.7539442196444632e-01 1.6598547811304609e-01 1.5711966166996927e-01 1.4876804864444715e-01
1.4090293273673637e-01 1.3349780623990259e-01 1.2652733751724909e-01 1.1996734557434463e-01 1.1379477219856060e-01
1.0798765209582406e-01 1.0252508141368288e-01 9.7387185001678311e-02 9.2555082724584015e-02 8.8010855111109620e-02
8.3737508589961873e-02 7.9718940536826377e-02 7.5939904329596963e-02 7.2385974585237101e-02 6.9043512729294765e-02
6.5899633029043336e-02 6.2942169202580001e-02 6.0159641699440547e-02 5.7541225732930634e-02 5.5076720130546430e-02
5.2756517056398833e-02 5.0571572648238083e-02 4.8513378601664936e-02 4.6573934725081756e-02 4.4745722480991068e-02
4.3021679522073253e-02 4.1395175224364866e-02 3.9859987214311721e-02 3.8410278881708670e-02 3.7040577866510604e-02
3.5745755503880039e-02 3.4521007208912380e-02 3.3361833779917971e-02 3.2264023597108116e-02 3.1223635691821294e-02
3.0236983660070216e-02 2.9300620393215571e-02 2.8411323597772320e-02 2.7566082075896281e-02 2.6762082737777249e-02
2.5996698317105604e-02 2.5267475760840985e-02 2.4572125264713973e-02 2.3908509926274246e-02 2.3274635987705516e-02
2.2668643641204911e-02 2.2088798370316409e-02 2.1533482801290083e-02 2.1001189039288493e-02 2.0490511464994254e-02
2.0000139967999431e-02 1.9528853594166895e-02 1.9075514584991349e-02 1.8639062787818239e-02 1.8218510416650235e-02
1.7812937144080498e-02 1.7421485505751177e-02 1.7043356599549031e-02 1.6677806062561751e-02 1.6324140309613155e-02
1.5981713017976018e-02 1.5649921843605585e-02 1.5328205354974755e-02 1.5016040171312250e-02 1.4712938292708366e-02
1.4418444610242331e-02 1.4132134584901757e-02 1.3853612084676337e-02 1.3582507369821917e-02 1.3318475216818060e-02
1.3061193172097418e-02 1.2810359927147186e-02 1.2565693807050415e-02 1.2326931365025051e-02 1.2093826075940506e-02
1.1866147122233661e-02 1.1643678266026136e-02 1.1426216801644407e-02 1.1213572583084475e-02 1.1005567121320226e-02
1.0802032746662471e-02 1.0602811831688208e-02 1.0407756070544782e-02 1.0216725810699157e-02 1.0029589433467268e-02
9.8462227798860602e-03 9.6665086187306404e-03 9.4903361536790021e-03 9.3176005668363371e-03 9.1482025960089031e-03
8.9820481433065535e-03 8.8190479128032462e-03 8.6591170751522117e-03 8.5021749571883021e-03 8.3481447546937537e-03
8.1969532666261724e-03 8.0485306492223962e-03 7.9028101885199598e-03 7.7597280899136256e-03 7.6192232834934315e-03
7.4812372439735375e-03 7.3457138241272979e-03 7.2125991007052359e-03 7.0818412319012813e-03 6.9533903254870300e-03
6.8271983168139705e-03 6.7032188559211503e-03 6.5814072030662141e-03 6.4617201320263939e-03 6.3441158405819764e-03
6.2285538676237207e-03 6.1149950163802147e-03 6.0034012832899109e-03 5.8937357920846312e-03 5.7859627326801166e-03
5.6800473044990030e-03 5.5759556638887986e-03 5.4736548753111791e-03 5.3731128660109428e-03 5.2742983838981461e-03
5.1771809583849582e-03 5.0817308639591330e-03 4.9879190862693046e-03 4.8957172905357560e-03 4.8050977921015592e-03
4.7160335289582467e-03 4.6284980360953021e-03 4.5424654215287241e-03 4.4579103438822931e-03 4.3748079913988880e-03
4.2931340622749670e-03 4.2128647462132407e-03 4.1339767071033873e-03 4.0564470667446839e-03 3.9802533895282599e-03
3.9053736680121076e-03 3.8317863093158128e-03 3.7594701222811860e-03 3.6884043053326127e-03 3.6185684349951674e-03
3.5499424550168301e-03 3.4825066660512660e-03 3.4162417158645347e-03 3.3511285900229004e-03 3.2871486030347646e-03
3.2242833899080170e-03 3.1625148980992668e-03 3.1018253798278661e-03 3.0421973847258310e-03 2.9836137528083811e-03
2.9260576077371064e-03 2.8695123503632708e-03 2.8139616525287708e-03 2.7593894511106498e-03 2.7057799422959966e-03
2.6531175760685227e-03 2.6013870509009052e-03 2.5505733086344240e-03 2.5006615295404683e-03 2.4516371275501436e-03
2.4034857456453340e-03 2.3561932514012535e-03 2.3097457326723414e-03 2.2641294934160616e-03 2.2193310496436136e-03
2.1753371254977782e-03 2.1321346494441173e-03 2.0897107505768314e-03 2.0480527550303662e-03 2.0071481824917164e-03
1.9669847428123305e-03 1.9275503327108034e-03 1.8888330325659355e-03 1.8508211032951805e-03 1.8135029833145980e-03
1.7768672855772646e-03 1.7409027946878666e-03 1.7055984640891586e-03 1.6709434133182904e-03 1.6369269253308227e-03
1.6035384438881917e-03 1.5707675710093030e-03 1.5386040644797400e-03 1.5070378354209296e-03 1.4760589459142243e-03
1.4456576066784674e-03 1.4158241748004133e-03 1.3865491515145517e-03 1.3578231800324136e-03 1.3296370434173130e-03
1.3019816625059188e-03 1.2748480938728074e-03 1.2482275278369870e-03 1.2221112865106742e-03 1.1964908218862064e-03
1.1713577139624703e-03 1.1467036689077198e-03 1.1225205172586891e-03 1.0988002121543120e-03 1.0755348276031765e-03
1.0527165567835728e-03 1.0303377103750150e-03 1.0083907149206553e-03 9.8686811121878604e-04 9.6576255274356815e-04
9.4506680409354657e-04 9.2477373946662708e-04 9.0487634116191706e-04 8.8536769810608137e-04 8.6624100440530968e-04
8.4748955791986991e-04 8.2910675886310736e-04 8.1108610842155551e-04 7.9342120739794852e-04 7.7610575487466887e-04
7.5913354689786591e-04 7.4249847518158968e-04 7.2619452583109687e-04 7.1021577808524222e-04 6.9455640307671332e-04
6.7921066261025093e-04 6.6417290795844214e-04 6.4943757867335500e-04 6.3499920141575628e-04 6.2085238879914031e-04
6.0699183824991856e-04 5.9341233088238896e-04 5.8010873038847818e-04 5.6707598194186137e-04 5.5430911111587280e-04
5.4180322281523891e-04 5.2955350022104025e-04 5.1755520374872563e-04 5.0580367001857793e-04 4.9429431083891986e-04
4.8302261220136561e-04 4.7198413328763435e-04 4.6117450548847222e-04 4.5058943143359842e-04 4.4022468403297037e-04
4.3007610552883886e-04 4.2013960655883260e-04 4.1041116522908330e-04 4.0088682619821882e-04 3.9156269977118005e-04
3.8243496100300207e-04 3.7349984881274514e-04 3.6475366510662147e-04 3.5619277391102898e-04 3.4781360051482253e-04
3.3961263062063513e-04 3.3158640950565685e-04 3.2373154119109092e-04 3.1604468762060252e-04 3.0852256784754707e-04
3.0116195723081836e-04 2.9395968663908575e-04 2.8691264166377101e-04 2.8001776184017647e-04 2.7327203987681688e-04
2.6667252089326854e-04 2.6021630166557681e-04 2.5390052988028163e-04 2.4772240339593181e-04 2.4167916951265550e-04
2.3576812424967210e-04 2.2998661163024531e-04 2.2433202297460642e-04 2.1880179620031078e-04 2.1339341513026532e-04
2.0810440880823181e-04 2.0293235082175821e-04 1.9787485863260665e-04 1.9292959291436311e-04 1.8809425689761319e-04
1.8336659572205580e-04 1.7874439579616125e-04 1.7422548416372047e-04 1.6980772787763936e-04 1.6548903338088530e-04
1.6126734589430591e-04 1.5714064881157744e-04 1.5310696310104604e-04 1.4916434671449329e-04 1.4531089400280153e-04
1.4154473513841234e-04 1.3786403554466153e-04 1.3426699533172857e-04 1.3075184873951283e-04 1.2731686358694039e-04
1.2396034072819674e-04 1.2068061351527565e-04 1.1747604726729168e-04 1.1434503874632306e-04 1.1128601563955686e-04
1.0829743604811193e-04 1.0537778798212988e-04 1.0252558886227753e-04 9.9739385027582898e-05 9.7017751249615057e-05
9.4359290252773662e-05 9.1762632240957511e-05 8.9226434430383569e-05 8.6749380588361721e-05 8.4330180578390864e-05
8.1967569911181246e-05 7.9660309301724484e-05 7.7407184232279429e-05 7.5207004521348451e-05 7.3058603898526649e-05
7.0960839585107720e-05 6.8912591880629977e-05 6.6912763755002085e-05 6.4960280446513426e-05 6.3054089065330086e-05
6.1193158202771814e-05 5.9376477546041213e-05 5.7603057498502742e-05 5.5871928805544500e-05 5.4182142185708361e-05
5.2532767967318744e-05 5.0922895730446966e-05 4.9351633954125953e-05 4.7818109668823321e-05 4.6321468114150300e-05
4.4860872401664663e-05 4.3435503182825573e-05 4.2044558321957873e-05 4.0687252574273750e-05 3.9362817268785450e-05
3.8070499996214428e-05 3.6809564301621984e-05 3.5579289382025496e-05 3.4378969788611451e-05 3.3207915133769052e-05
3.2065449802711312e-05 3.0950912669766876e-05 2.9863656819185611e-05 2.8803049270468119e-05 2.7768470708167169e-05
2.6759315216115260e-05 2.5774990015931323e-05 2.4814915209964844e-05 2.3878523528387922e-05 2.2965260080560611e-05
2.2074582110528148e-05 2.1205958756658535e-05 2.0358870815317476e-05 1.9532810508535560e-05 1.8727281255713447e-05
1.7941797449145505e-05 1.7175884233475961e-05 1.6429077288930018e-05 1.5700922618341645e-05 1.4990976337865471e-05
1.4298804471386687e-05 1.3623982748522034e-05 1.2966096406226424e-05 1.2324739993882115e-05 1.1699517181902770e-05
1.1090040573734860e-05 1.0495931521266495e-05 9.9168199435395021e-06 9.3523441487842465e-06 8.8021506596591475e-06
8.2658940417265321e-06 7.7432367350197678e-06 7.2338488887770244e-06 6.7374081991923703e-06 6.2535997501888662e-06
5.7821158571569505e-06 5.3226559136389283e-06 4.8749262408651290e-06 4.4386399401326240e-06 4.0135167480073166e-06
3.5992828942305738e-06 3.1956709623667747e-06 2.8024197531120341e-06 2.4192741502208947e-06 2.0459849890155880e-06
1.6823089274468580e-06 1.3280083196495871e-06 9.8285109196557868e-07 6.4661062138351467e-07 3.1906561636122974e-07
0. 0. 0. 0. 0.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +0,0 @@
2 0 1 0 1 0 1
1 0.1 0.166 0.55 0 1 0 0
1 0.1 0.833 0.45 0 -1 0 0

View File

@ -1,9 +0,0 @@
pbc_x 0
pbc_y 0
pbc_z 0
ntimes 200
dt 0.01
reneigh_every 1
mass 0.1
k_s 100
k_dn 10

View File

@ -0,0 +1,10 @@
#!/bin/bash
# Sweep the thread count from 1 to END and store the LIKWID FLOPS_DP report
# of each MDBench-GCC run in FLOPS_DP/thread_<n>.txt.
END=32
# The redirect at the end of the loop body fails if the directory is missing.
mkdir -p FLOPS_DP
for ((i=1;i<=END;i++)); do
    # Run the command directly; nothing in it needs a second round of parsing by eval.
    output=$(likwid-mpirun -np 1 -t "$i" -m -g FLOPS_DP -omp gnu ./MDBench-GCC -n 50)
    # Progress indicator: comma-separated thread counts on one line.
    echo -n "$i,"
    echo "$output" > "FLOPS_DP/thread_$i.txt"
done
## likwid perf measurements on testfront1:
# srun --nodes=1 --exclusive --nodelist=rome1 --time=00:30:00 --export=NONE -c 64 -C hwperf --pty /bin/bash -l
# likwid-mpirun -np 1 -t 32 -m -g MEM -omp gnu -d ./MDBench-GCC

6
evaluate_cpu_runtime.sh Normal file
View File

@ -0,0 +1,6 @@
#!/bin/bash
# For every OpenMP thread count from 1 to 32: print the count, then the
# "Performance" line(s) reported by MDBench-GCC for a run with -n 50.
for ((threads = 1; threads <= 32; threads++)); do
    printf '%s\n' "$threads"
    OMP_NUM_THREADS=$threads
    export OMP_NUM_THREADS
    ./MDBench-GCC -n 50 | grep "Performance"
done

View File

@ -0,0 +1,5 @@
#!/bin/bash
# Collect a full Nsight Compute (ncu) report of MDBench-NVCC for every
# NUM_THREADS value from 16 to END, writing one report file per value.
END=32
for ((i=16;i<=END;i++)); do
    # Exported so the profiled MDBench-NVCC process sees it in its environment.
    export NUM_THREADS=$i
    # Invoke ncu as a plain command. Wrapping it in $(eval "...") made the shell
    # capture ncu's standard output and then try to execute that text as a command.
    ncu --set full -o "/home/hpc/rzku/ptfs410h/MD-Bench/log/MG/presentation_2/Resources/GPU/Metrics/threads_$i" ./MDBench-NVCC -n 50
done

View File

@ -0,0 +1,6 @@
#!/bin/bash
# Run MDBench-NVCC for 2000 time steps with NUM_THREADS = 1, 2, 4, ... up to END
# and print one CSV row per run: "<NUM_THREADS>,<value>e6".
END=64
for ((i=1;i<=END;i*=2)); do
    # A per-command environment assignment is enough here; eval is not needed.
    output=$(NUM_THREADS=$i ./MDBench-NVCC -n 2000)
    echo -n "$i,"
    # Keep only digits, dots and commas of the "atom updates per second" line,
    # then append the "e6" suffix to the first field.
    echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.,]//g' | awk '{print $1"e6"}'
done

14
include_CLANG.mk Normal file
View File

@ -0,0 +1,14 @@
# Build settings that use the generic `cc` compiler driver for compiling and linking.
CC = cc
LINKER = $(CC)
# Language-conformance and warning flags; they are folded into CFLAGS below.
# NOTE(review): -ansi and -std=c99 both select a language standard - confirm which one is intended.
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
# The OpenMP-related flags on this line are commented out.
CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
ASFLAGS = -masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE
INCLUDES =
# Only the math library is linked; -lomp is commented out.
LIBS = -lm #-lomp

15
include_GCC.mk Normal file
View File

@ -0,0 +1,15 @@
# Build settings for gcc with LIKWID marker support enabled (-DLIKWID_PERFMON, -llikwid).
CC = gcc
LINKER = $(CC)
# NOTE(review): ANSI_CFLAGS is assembled here but not referenced by CFLAGS in this fragment.
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
# The optimised build is tuned for -march=znver1; OpenMP (-fopenmp) is commented out.
CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
ASFLAGS = -masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE -DLIKWID_PERFMON
# LIKWID_INC / LIKWID_LIB are not set in this fragment; they must come from elsewhere.
INCLUDES = $(LIKWID_INC)
LIBS = -lm $(LIKWID_LIB) -llikwid

17
include_ICC.mk Normal file
View File

@ -0,0 +1,17 @@
# Build settings for icc. OpenMP, profiling and LIKWID support are all present
# but commented out, so OPENMP and PROFILE expand to nothing.
CC = icc
LINKER = $(CC)
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
# Active optimisation set; the commented OPTS lines below are alternative targets.
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
#OPTS = -fast -xCORE-AVX2 $(PROFILE)
#OPTS = -fast -xAVX $(PROFILE)
#OPTS = -fast -xSSE4.2 $(PROFILE)
#OPTS = -fast -no-vec $(PROFILE)
#OPTS = -fast -xHost $(PROFILE)
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
# The link step receives the same optimisation, profiling and OpenMP flags as compilation.
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON
INCLUDES = #$(LIKWID_INC)
LIBS = -lm #$(LIKWID_LIB) -llikwid

15
include_NVCC.mk Normal file
View File

@ -0,0 +1,15 @@
# Build settings for nvcc: device code is generated for sm_61 and the CUDA
# driver/runtime libraries are linked alongside LIKWID.
CC = nvcc
LINKER = $(CC)
# NOTE(review): ANSI_CFLAGS is assembled here but not referenced by CFLAGS in this fragment.
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
# NOTE(review): -masm=intel is a host-compiler style option - confirm ASFLAGS is actually passed to nvcc.
ASFLAGS = -masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE -DLIKWID_PERFMON
# LIKWID_INC / LIKWID_LIB are not set in this fragment; they must come from elsewhere.
INCLUDES = $(LIKWID_INC)
LIBS = -lm $(LIKWID_LIB) -llikwid -lcuda -lcudart

View File

@ -1,18 +0,0 @@
# Build settings for a Homebrew-installed LLVM clang (absolute path, version 18.1.5)
# with OpenMP enabled through libomp.
CC = /opt/homebrew/Cellar/llvm/18.1.5/bin/clang
LINKER = $(CC)
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
# ANSI_CFLAGS += -Wextra
# Active flag set; the commented CFLAGS lines below are alternative targets.
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) -Xpreprocessor -fopenmp #-g
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
ASFLAGS = #-masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE
# MacOSX with Apple Silicon and homebrew
# The libomp header and library paths are pinned to the 18.1.5 Cellar directory.
INCLUDES = -I/opt/homebrew/Cellar/libomp/18.1.5/include/
LIBS = -lm -L/opt/homebrew/Cellar/libomp/18.1.5/lib/ -lomp

View File

@ -1,36 +0,0 @@
# Build settings for gcc where CFLAGS is chosen by the ISA variable
# (AVX512, AVX2, AVX or SSE).
# NOTE(review): CFLAGS stays unset here when ISA matches none of these values.
CC = gcc
LINKER = $(CC)
# NOTE(review): ANSI_CFLAGS is assembled here but not referenced by any CFLAGS line in this fragment.
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
ifeq ($(ISA),AVX512)
CFLAGS = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),AVX2)
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
CFLAGS = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),AVX)
CFLAGS = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
endif
ifeq ($(ISA),SSE)
CFLAGS = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
endif
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
ASFLAGS = #-masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
# LIKWID_INC is used for the include path, but no LIKWID library is linked here.
INCLUDES = $(LIKWID_INC)
LIBS = -lm

View File

@ -1,32 +0,0 @@
# Build settings for icc with OpenMP enabled; OPTS is chosen by the ISA variable
# (AVX512, AVX2, AVX or SSE).
# NOTE(review): OPTS stays unset here when ISA matches none of these values.
CC = icc
LINKER = $(CC)
OPENMP = -qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
endif
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
# The link step receives the same optimisation, profiling and OpenMP flags as compilation.
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
# The language standard (-std=c11 -pedantic-errors) is passed through DEFINES rather than CFLAGS.
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE
INCLUDES =
LIBS = -lm

View File

@ -1,33 +0,0 @@
# Build settings for icx with OpenMP commented out; OPTS is chosen by the ISA
# variable (AVX512, AVX2, AVX or SSE).
# NOTE(review): OPTS stays unset here when ISA matches none of these values.
CC = icx
LINKER = $(CC)
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
#OPTS = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
endif
ifeq ($(ISA),AVX2)
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
endif
ifeq ($(ISA),AVX)
OPTS = -Ofast -xAVX $(PROFILE)
endif
ifeq ($(ISA),SSE)
OPTS = -Ofast -xSSE4.2 $(PROFILE)
endif
#OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
# The language standard (-std=c11 -pedantic-errors) is passed through DEFINES rather than CFLAGS.
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
INCLUDES =
LIBS = -lm

View File

@ -1,29 +0,0 @@
# Translate the ISA setting into internal __ISA_*__ switches and the SIMD width
# expressed in double-precision lanes (__SIMD_WIDTH_DBL__), then derive VECTOR_WIDTH.
ifeq ($(strip $(ISA)), SSE)
__ISA_SSE__=true
__SIMD_WIDTH_DBL__=2
else ifeq ($(strip $(ISA)), AVX)
__ISA_AVX__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX_FMA)
# AVX_FMA sets both the AVX switch and the additional FMA switch.
__ISA_AVX__=true
__ISA_AVX_FMA__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX2)
#__SIMD_KERNEL__=true
__ISA_AVX2__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX512)
__ISA_AVX512__=true
__SIMD_WIDTH_DBL__=8
# __SIMD_KERNEL__ is only set for AVX512 combined with DATA_TYPE=DP.
ifeq ($(strip $(DATA_TYPE)), DP)
__SIMD_KERNEL__=true
endif
endif
# SIMD width is specified in double-precision, hence it may
# need to be adjusted for single-precision
# NOTE(review): if ISA matches none of the branches above, __SIMD_WIDTH_DBL__ is empty
# and the shell arithmetic used for DATA_TYPE=SP receives an empty operand - confirm ISA is validated elsewhere.
ifeq ($(strip $(DATA_TYPE)), SP)
VECTOR_WIDTH=$(shell echo $$(( $(__SIMD_WIDTH_DBL__) * 2 )))
else
VECTOR_WIDTH=$(__SIMD_WIDTH_DBL__)
endif

View File

@ -1,23 +0,0 @@
# Build settings for nvcc with -DCUDA_TARGET. Host-compiler options are passed
# through nvcc by means of --forward-unknown-to-host-compiler.
CC = nvcc
LINKER = $(CC)
# NOTE(review): ANSI_CFLAGS is assembled here but not referenced by CFLAGS in this fragment.
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
#
# A100 + Native
CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# A40 + Native
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# Cascade Lake
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# For GROMACS kernels, we need at least sm_61 due to atomicAdd with doubles
# TODO: Check if this is required for full neighbor-lists and just compile kernel for that case if not
#CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
ASFLAGS = -masm=intel
LFLAGS =
# LIKWID marker support is commented out, both the define and the library.
DEFINES = -D_GNU_SOURCE -DCUDA_TARGET -DNO_ZMM_INTRIN #-DLIKWID_PERFMON
INCLUDES = $(LIKWID_INC)
LIBS = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid

View File

@ -1,17 +0,0 @@
# Build settings for icx using OpenMP SIMD directives only (-qopenmp-simd)
# and the NOCHUNK define.
CC = icx
LINKER = $(CC)
OPENMP = -qopenmp-simd
PROFILE = #-g -pg
# Exactly one OPTS line is active (AVX-512); the others are alternative targets.
#OPTS = -Ofast -no-vec
#OPTS = -Ofast -xSSE4.2
#OPTS = -Ofast -xAVX
#OPTS = -Ofast -xCORE-AVX2
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high
#OPTS = -Ofast -xHost
CFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
ASFLAGS = -masm=intel
# Unlike CFLAGS, the link flags do not include $(OPENMP).
LFLAGS = $(PROFILE) $(OPTS)
DEFINES = -D_GNU_SOURCE -DNOCHUNK
INCLUDES =
LIBS = -lm

89
src/allocate.c Normal file
View File

@ -0,0 +1,89 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <cuda_runtime.h>
/*
 * Abort the program if a CUDA runtime call did not succeed.
 *
 * msg: short label identifying the call site; it is embedded in the diagnostic.
 * err: status code returned by the CUDA runtime call being checked.
 *
 * Returns normally when err is cudaSuccess. Otherwise the human-readable error
 * string is written to stderr (the stream used for error reporting elsewhere in
 * this code base) and the process exits with status -1.
 */
void checkCUDAError(const char *msg, cudaError_t err)
{
    if (err == cudaSuccess) {
        return;
    }

    // print a human readable error message
    fprintf(stderr, "[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
    exit(-1);
}
/*
 * Allocate bytesize bytes of host memory through cudaMallocHost().
 *
 * alignment: kept for interface compatibility with the former posix_memalign()
 *            based implementation; it is not used, since cudaMallocHost()
 *            takes no alignment argument.
 * bytesize:  number of bytes to allocate.
 *
 * Returns the pointer produced by cudaMallocHost(). A failing allocation is
 * reported by checkCUDAError(), which terminates the program, so callers never
 * see an error return. Memory obtained here must be released with
 * cudaFreeHost(), not free().
 */
void* allocate (int alignment, size_t bytesize)
{
    void* ptr = NULL;

    (void) alignment; /* intentionally unused, see above */

    checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
    return ptr;
}
/*
 * Replace a buffer obtained from allocate() with a new one of a different size.
 *
 * ptr:         existing buffer, or NULL if there is nothing to carry over.
 * alignment:   forwarded to allocate().
 * newBytesize: size of the buffer to create.
 * oldBytesize: number of valid bytes in ptr.
 *
 * A new buffer is always allocated. If ptr is non-NULL, the first
 * min(oldBytesize, newBytesize) bytes are copied into it and ptr is released
 * with cudaFreeHost(). Returns the new buffer; the old pointer must not be
 * used afterwards.
 */
void* reallocate (
        void* ptr,
        int alignment,
        size_t newBytesize,
        size_t oldBytesize)
{
    void* newarray = allocate(alignment, newBytesize);

    if(ptr != NULL) {
        /* Never copy more than the new buffer can hold (shrinking case). */
        size_t copyBytes = (oldBytesize < newBytesize) ? oldBytesize : newBytesize;

        if(copyBytes > 0) {
            memcpy(newarray, ptr, copyBytes);
        }

        /* Do not ignore the status of the release call. */
        checkCUDAError( "reallocate cudaFreeHost", cudaFreeHost(ptr) );
    }

    return newarray;
}

185
src/atom.c Normal file
View File

@ -0,0 +1,185 @@
/*
* =======================================================================================
*
* Authors: Jan Eitzinger (je), jan.eitzinger@fau.de
* Rafael Ravedutti (rr), rafaelravedutti@gmail.com
*
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <atom.h>
#include <allocate.h>
#include <util.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define DELTA 20000
/*
 * Put an Atom structure into its empty state: every array pointer is NULL and
 * every counter is zero. No memory is allocated or released here.
 */
void initAtom(Atom *atom)
{
    /* Per-atom arrays: positions, velocities, forces. */
    atom->x  = atom->y  = atom->z  = NULL;
    atom->vx = atom->vy = atom->vz = NULL;
    atom->fx = atom->fy = atom->fz = NULL;

    /* Atom counters and current capacity. */
    atom->Natoms = 0;
    atom->Nlocal = 0;
    atom->Nghost = 0;
    atom->Nmax   = 0;

    /* Atom types and the per-type-pair parameter tables. */
    atom->type   = NULL;
    atom->ntypes = 0;
    atom->epsilon    = NULL;
    atom->sigma6     = NULL;
    atom->cutforcesq = NULL;
    atom->cutneighsq = NULL;
}
/*
 * Generate the initial atoms on a regular lattice inside the box
 * [0, xprd) x [0, yprd) x [0, zprd).
 *
 * atom:  structure to fill; Natoms, Nlocal, ntypes, the per-type-pair tables
 *        and the per-atom position/velocity/type arrays are written.
 * param: supplies the lattice dimensions (nx, ny, nz), box lengths, density
 *        (rho), number of types and the epsilon/sigma6/cutoff values.
 *
 * Candidate sites are visited in sub-boxes of 8x8x8 lattice indices; a site is
 * used when i + j + k is even and its coordinates lie inside the box. The
 * per-atom arrays are grown with growAtom() whenever Nlocal reaches Nmax.
 */
void createAtom(Atom *atom, Parameter *param)
{
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
atom->Natoms = 4 * param->nx * param->ny * param->nz;
atom->Nlocal = 0;
atom->ntypes = param->ntypes;
// One ntypes x ntypes table per parameter, allocated with cudaMallocHost.
checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
// Every type pair receives the same global parameters; cutoffs are stored squared.
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
// Lattice constant derived from the density; sites are spaced 0.5 * alat apart.
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
// Index range covering the box, widened by one on each side ...
int ilo = (int) (xlo / (0.5 * alat) - 1);
int ihi = (int) (xhi / (0.5 * alat) + 1);
int jlo = (int) (ylo / (0.5 * alat) - 1);
int jhi = (int) (yhi / (0.5 * alat) + 1);
int klo = (int) (zlo / (0.5 * alat) - 1);
int khi = (int) (zhi / (0.5 * alat) + 1);
// ... then clamped to [0, 2*n - 1] in each dimension.
ilo = MAX(ilo, 0);
ihi = MIN(ihi, 2 * param->nx - 1);
jlo = MAX(jlo, 0);
jhi = MIN(jhi, 2 * param->ny - 1);
klo = MAX(klo, 0);
khi = MIN(khi, 2 * param->nz - 1);
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
int i, j, k, m, n;
// (sx, sy, sz): offset inside the current sub-box; (ox, oy, oz): sub-box index.
int sx = 0; int sy = 0; int sz = 0;
int ox = 0; int oy = 0; int oz = 0;
int subboxdim = 8;
while(oz * subboxdim <= khi) {
k = oz * subboxdim + sz;
j = oy * subboxdim + sy;
i = ox * subboxdim + sx;
if(((i + j + k) % 2 == 0) &&
(i >= ilo) && (i <= ihi) &&
(j >= jlo) && (j <= jhi) &&
(k >= klo) && (k <= khi)) {
xtmp = 0.5 * alat * i;
ytmp = 0.5 * alat * j;
ztmp = 0.5 * alat * k;
if( xtmp >= xlo && xtmp < xhi &&
ytmp >= ylo && ytmp < yhi &&
ztmp >= zlo && ztmp < zhi ) {
// n is computed from the lattice indices only and passed by address to myrandom();
// presumably it serves as a per-site random seed - confirm against myrandom().
n = k * (2 * param->ny) * (2 * param->nx) +
j * (2 * param->nx) +
i + 1;
// Five myrandom() results are discarded before each velocity component is drawn.
for(m = 0; m < 5; m++) {
myrandom(&n);
}
vxtmp = myrandom(&n);
for(m = 0; m < 5; m++){
myrandom(&n);
}
vytmp = myrandom(&n);
for(m = 0; m < 5; m++) {
myrandom(&n);
}
vztmp = myrandom(&n);
// Grow the per-atom arrays before writing slot Nlocal if they are full.
if(atom->Nlocal == atom->Nmax) {
growAtom(atom);
}
atom_x(atom->Nlocal) = xtmp;
atom_y(atom->Nlocal) = ytmp;
atom_z(atom->Nlocal) = ztmp;
atom_vx(atom->Nlocal) = vxtmp;
atom_vy(atom->Nlocal) = vytmp;
atom_vz(atom->Nlocal) = vztmp;
// The type comes from the C library rand(), independent of the per-site value n.
atom->type[atom->Nlocal] = rand() % atom->ntypes;
atom->Nlocal++;
}
}
// Advance to the next site: x offset first, then y, then z inside the sub-box,
// then move on to the next sub-box in x, y and finally z.
sx++;
if(sx == subboxdim) { sx = 0; sy++; }
if(sy == subboxdim) { sy = 0; sz++; }
if(sz == subboxdim) { sz = 0; ox++; }
if(ox * subboxdim > ihi) { ox = 0; oy++; }
if(oy * subboxdim > jhi) { oy = 0; oz++; }
}
}
/*
 * Enlarge the capacity of all per-atom arrays by DELTA entries.
 *
 * Nmax is increased first; every array is then replaced through reallocate(),
 * which carries over the contents of the previous capacity. With AOS defined,
 * x, fx and vx each hold three MD_FLOAT values per atom and the y/z arrays are
 * not touched; otherwise all nine coordinate arrays hold one value per atom.
 */
void growAtom(Atom *atom)
{
    const int previousMax = atom->Nmax;
    atom->Nmax += DELTA;

    /* Byte sizes for one MD_FLOAT per atom, after and before growing. */
    const size_t grownBytes = atom->Nmax * sizeof(MD_FLOAT);
    const size_t priorBytes = previousMax * sizeof(MD_FLOAT);

#ifdef AOS
    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, grownBytes * 3, priorBytes * 3);
    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, grownBytes * 3, priorBytes * 3);
    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, grownBytes * 3, priorBytes * 3);
#else
    atom->x  = (MD_FLOAT*) reallocate(atom->x,  ALIGNMENT, grownBytes, priorBytes);
    atom->y  = (MD_FLOAT*) reallocate(atom->y,  ALIGNMENT, grownBytes, priorBytes);
    atom->z  = (MD_FLOAT*) reallocate(atom->z,  ALIGNMENT, grownBytes, priorBytes);
    atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, grownBytes, priorBytes);
    atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, grownBytes, priorBytes);
    atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, grownBytes, priorBytes);
    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, grownBytes, priorBytes);
    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, grownBytes, priorBytes);
    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, grownBytes, priorBytes);
#endif

    atom->type = (int *) reallocate(atom->type, ALIGNMENT,
                                    atom->Nmax * sizeof(int),
                                    previousMax * sizeof(int));
}

View File

@ -1,532 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <atom.h>
#include <allocate.h>
#include <util.h>
/*
 * Put an Atom structure into its empty state: all array pointers NULL, all
 * counters zero, followed by initMasks(), which sets up the mask tables.
 */
void initAtom(Atom *atom) {
    /* Per-atom positions and velocities. */
    atom->x  = atom->y  = atom->z  = NULL;
    atom->vx = atom->vy = atom->vz = NULL;

    /* Cluster-based copies of positions, velocities, forces and types. */
    atom->cl_x    = NULL;
    atom->cl_v    = NULL;
    atom->cl_f    = NULL;
    atom->cl_type = NULL;

    /* Atom counters and capacity. */
    atom->Natoms = 0;
    atom->Nlocal = 0;
    atom->Nghost = 0;
    atom->Nmax   = 0;

    /* Cluster counters and capacity. */
    atom->Nclusters       = 0;
    atom->Nclusters_local = 0;
    atom->Nclusters_ghost = 0;
    atom->Nclusters_max   = 0;

    /* Atom types and per-type-pair parameter tables. */
    atom->type   = NULL;
    atom->ntypes = 0;
    atom->epsilon    = NULL;
    atom->sigma6     = NULL;
    atom->cutforcesq = NULL;
    atom->cutneighsq = NULL;

    /* Cluster bookkeeping. */
    atom->iclusters    = NULL;
    atom->jclusters    = NULL;
    atom->icluster_bin = NULL;

    initMasks(atom);
}
/*
 * Generate the initial atoms on a regular lattice inside the box
 * [0, xprd) x [0, yprd) x [0, zprd).
 *
 * atom:  structure to fill; Natoms, Nlocal, ntypes, the per-type-pair tables
 *        and the per-atom position/velocity/type arrays are written.
 * param: supplies the lattice dimensions (nx, ny, nz), box lengths, density
 *        (rho), number of types and the epsilon/sigma6/cutoff values.
 *
 * Candidate sites are visited in sub-boxes of 8x8x8 lattice indices; a site is
 * used when i + j + k is even and its coordinates lie inside the box. The
 * per-atom arrays are grown with growAtom() whenever Nlocal reaches Nmax.
 */
void createAtom(Atom *atom, Parameter *param) {
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
atom->Natoms = 4 * param->nx * param->ny * param->nz;
atom->Nlocal = 0;
atom->ntypes = param->ntypes;
// One ntypes x ntypes table per parameter.
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
// Every type pair receives the same global parameters; cutoffs are stored squared.
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
// Lattice constant derived from the density; sites are spaced 0.5 * alat apart.
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
// Index range covering the box, widened by one on each side ...
int ilo = (int) (xlo / (0.5 * alat) - 1);
int ihi = (int) (xhi / (0.5 * alat) + 1);
int jlo = (int) (ylo / (0.5 * alat) - 1);
int jhi = (int) (yhi / (0.5 * alat) + 1);
int klo = (int) (zlo / (0.5 * alat) - 1);
int khi = (int) (zhi / (0.5 * alat) + 1);
// ... then clamped to [0, 2*n - 1] in each dimension.
ilo = MAX(ilo, 0);
ihi = MIN(ihi, 2 * param->nx - 1);
jlo = MAX(jlo, 0);
jhi = MIN(jhi, 2 * param->ny - 1);
klo = MAX(klo, 0);
khi = MIN(khi, 2 * param->nz - 1);
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
int i, j, k, m, n;
// (sx, sy, sz): offset inside the current sub-box; (ox, oy, oz): sub-box index.
int sx = 0; int sy = 0; int sz = 0;
int ox = 0; int oy = 0; int oz = 0;
int subboxdim = 8;
while(oz * subboxdim <= khi) {
k = oz * subboxdim + sz;
j = oy * subboxdim + sy;
i = ox * subboxdim + sx;
if(((i + j + k) % 2 == 0) && (i >= ilo) && (i <= ihi) && (j >= jlo) && (j <= jhi) && (k >= klo) && (k <= khi)) {
xtmp = 0.5 * alat * i;
ytmp = 0.5 * alat * j;
ztmp = 0.5 * alat * k;
if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
// n is computed from the lattice indices only and passed by address to myrandom();
// presumably it serves as a per-site random seed - confirm against myrandom().
n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
// Five myrandom() results are discarded before each velocity component is drawn.
for(m = 0; m < 5; m++) { myrandom(&n); }
vxtmp = myrandom(&n);
for(m = 0; m < 5; m++){ myrandom(&n); }
vytmp = myrandom(&n);
for(m = 0; m < 5; m++) { myrandom(&n); }
vztmp = myrandom(&n);
// Grow the per-atom arrays before writing slot Nlocal if they are full.
if(atom->Nlocal == atom->Nmax) { growAtom(atom); }
atom_x(atom->Nlocal) = xtmp;
atom_y(atom->Nlocal) = ytmp;
atom_z(atom->Nlocal) = ztmp;
atom->vx[atom->Nlocal] = vxtmp;
atom->vy[atom->Nlocal] = vytmp;
atom->vz[atom->Nlocal] = vztmp;
// The type comes from the C library rand(), independent of the per-site value n.
atom->type[atom->Nlocal] = rand() % atom->ntypes;
atom->Nlocal++;
}
}
// Advance to the next site: x offset first, then y, then z inside the sub-box,
// then move on to the next sub-box in x, y and finally z.
sx++;
if(sx == subboxdim) { sx = 0; sy++; }
if(sy == subboxdim) { sy = 0; sz++; }
if(sz == subboxdim) { sz = 0; ox++; }
if(ox * subboxdim > ihi) { ox = 0; oy++; }
if(oy * subboxdim > jhi) { oy = 0; oz++; }
}
}
/*
 * Map an atom type label to its numeric type id.
 *
 * type: label to translate; only its first two characters are compared.
 *       May be NULL, e.g. when a tokenizer ran out of fields.
 *
 * Returns 0 for labels starting with "Ar" (Argon). Any other label, including
 * a missing one, is reported on stderr and terminates the program with
 * exit status -1.
 */
int type_str2int(const char *type) {
    /* A missing token must not reach strncmp() or the %s conversion below. */
    if(type == NULL) {
        fprintf(stderr, "Invalid atom type: (missing)\n");
        exit(-1);
        return -1;
    }

    if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon

    fprintf(stderr, "Invalid atom type: %s\n", type);
    exit(-1);
    return -1;
}
/*
 * Read the atom data from param->input_file, choosing the reader by the last
 * four characters of the file name (".pdb", ".gro" or ".dmp").
 *
 * Returns whatever the selected reader returns. An unsupported extension, or a
 * name too short to carry one, is reported on stderr and terminates the
 * program with exit status -1.
 */
int readAtom(Atom* atom, Parameter* param) {
    const size_t len = strlen(param->input_file);

    /* Only inspect the suffix when the name is long enough; otherwise
     * input_file[len - 4] would address memory before the string. */
    if(len >= 4) {
        const char *ext = &param->input_file[len - 4];

        if(strncmp(ext, ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
        if(strncmp(ext, ".gro", 4) == 0) { return readAtom_gro(atom, param); }
        if(strncmp(ext, ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
    }

    fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
    exit(-1);
    return -1;
}
/*
 * Read atoms from the PDB-style file param->input_file.
 *
 * Records are split on single spaces. Recognised records:
 *   CRYST1  - the next three fields become xhi/yhi/zhi (lower bounds are 0);
 *   ATOM    - id, type label, label, component id, x, y, z; velocities are zeroed;
 *   HEADER, REMARK, MODEL, TER, ENDMDL - ignored.
 * Any other record terminates the program.
 *
 * After all records are read, the ntypes x ntypes parameter tables are
 * allocated and filled from param.
 *
 * Returns the number of ATOM records read. All error conditions are reported on
 * stderr and end the program with exit status -1.
 */
int readAtom_pdb(Atom* atom, Parameter* param) {
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int read_atoms = 0;

    if(!fp) {
        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }

    while(!feof(fp)) {
        /* Start from an empty buffer so that a read which only hits end of file
         * cannot cause the previous record to be processed a second time. */
        line[0] = '\0';
        readline(line, fp);
        char *item = strtok(line, " ");

        /* Nothing to tokenize (empty line or end of file): skip it. */
        if(item == NULL) {
            continue;
        }

        if(strncmp(item, "CRYST1", 6) == 0) {
            param->xlo = 0.0;
            param->xhi = atof(strtok(NULL, " "));
            param->ylo = 0.0;
            param->yhi = atof(strtok(NULL, " "));
            param->zlo = 0.0;
            param->zhi = atof(strtok(NULL, " "));
            param->xprd = param->xhi - param->xlo;
            param->yprd = param->yhi - param->ylo;
            param->zprd = param->zhi - param->zlo;
            // alpha, beta, gamma, sGroup, z
        } else if(strncmp(item, "ATOM", 4) == 0) {
            /* Ids in the file are 1-based. */
            int atom_id = atoi(strtok(NULL, " ")) - 1;

            /* A non-positive id would index before the start of the arrays. */
            if(atom_id < 0) {
                fprintf(stderr, "Invalid atom id: %d\n", atom_id + 1);
                exit(-1);
                return -1;
            }

            while(atom_id + 1 >= atom->Nmax) {
                growAtom(atom);
            }

            atom->type[atom_id] = type_str2int(strtok(NULL, " "));
            /* The label and component id fields are not stored; the tokens are
             * only consumed so that the coordinates come next. */
            strtok(NULL, " ");
            strtok(NULL, " ");
            atom_x(atom_id) = atof(strtok(NULL, " "));
            atom_y(atom_id) = atof(strtok(NULL, " "));
            atom_z(atom_id) = atof(strtok(NULL, " "));
            atom->vx[atom_id] = 0.0;
            atom->vy[atom_id] = 0.0;
            atom->vz[atom_id] = 0.0;
            /* The trailing occupancy and charge fields are not used and are
             * therefore not parsed; records lacking them are accepted. */
            atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
            atom->Natoms++;
            atom->Nlocal++;
            read_atoms++;
        } else if(strncmp(item, "HEADER", 6) == 0 ||
                  strncmp(item, "REMARK", 6) == 0 ||
                  strncmp(item, "MODEL", 5) == 0 ||
                  strncmp(item, "TER", 3) == 0 ||
                  strncmp(item, "ENDMDL", 6) == 0) {
            // Do nothing
        } else {
            fprintf(stderr, "Invalid item: %s\n", item);
            exit(-1);
            return -1;
        }
    }

    if(!read_atoms) {
        fprintf(stderr, "Input error: No atoms read!\n");
        exit(-1);
        return -1;
    }

    /* Per-type-pair tables: every pair receives the same global parameters. */
    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param->epsilon;
        atom->sigma6[i] = param->sigma6;
        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
    fclose(fp);
    return read_atoms;
}
/*
 * Read atoms from the GRO-style file param->input_file.
 *
 * Layout as consumed here: a description line, a line whose first field is the
 * atom count, one line per atom (label, type label, atom number, x, y, z,
 * vx, vy, vz; split on single spaces) and finally a line with the three box
 * lengths. Atoms are stored in file order; the atom number column is skipped.
 *
 * After reading, the ntypes x ntypes parameter tables are allocated and filled
 * from param.
 *
 * Returns the number of atoms read. A missing file or an atom count mismatch
 * is reported on stderr and ends the program with exit status -1.
 */
int readAtom_gro(Atom* atom, Parameter* param) {
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    char desc[MAXLINE];
    int read_atoms = 0;
    int atoms_to_read = 0;

    if(!fp) {
        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }

    readline(desc, fp);
    /* Cut the description at its newline. strcspn() also stops at the string
     * terminator, so a line without a newline cannot run the scan off the buffer. */
    desc[strcspn(desc, "\n")] = '\0';
    readline(line, fp);
    atoms_to_read = atoi(strtok(line, " "));
    fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);

    while(!feof(fp) && read_atoms < atoms_to_read) {
        readline(line, fp);
        strtok(line, " ");                               /* label: not stored */
        int type = type_str2int(strtok(NULL, " "));
        strtok(NULL, " ");                               /* atom number: skipped */
        int atom_id = read_atoms;                        /* atoms are stored in file order */

        while(atom_id + 1 >= atom->Nmax) {
            growAtom(atom);
        }

        atom->type[atom_id] = type;
        atom_x(atom_id) = atof(strtok(NULL, " "));
        atom_y(atom_id) = atof(strtok(NULL, " "));
        atom_z(atom_id) = atof(strtok(NULL, " "));
        atom->vx[atom_id] = atof(strtok(NULL, " "));
        atom->vy[atom_id] = atof(strtok(NULL, " "));
        atom->vz[atom_id] = atof(strtok(NULL, " "));
        atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
        atom->Natoms++;
        atom->Nlocal++;
        read_atoms++;
    }

    /* The line after the atoms holds the box lengths; lower bounds are 0. */
    if(!feof(fp)) {
        readline(line, fp);
        param->xlo = 0.0;
        param->xhi = atof(strtok(line, " "));
        param->ylo = 0.0;
        param->yhi = atof(strtok(NULL, " "));
        param->zlo = 0.0;
        param->zhi = atof(strtok(NULL, " "));
        param->xprd = param->xhi - param->xlo;
        param->yprd = param->yhi - param->ylo;
        param->zprd = param->zhi - param->zlo;
    }

    if(read_atoms != atoms_to_read) {
        fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
        exit(-1);
        return -1;
    }

    /* Per-type-pair tables: every pair receives the same global parameters. */
    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param->epsilon;
        atom->sigma6[i] = param->sigma6;
        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

    fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
    fclose(fp);
    return read_atoms;
}
/*
 * Read atoms from the dump-style file param->input_file.
 *
 * The file is a sequence of "ITEM: <name>" headers, each followed by its data:
 *   TIMESTEP                      - one line with the time step;
 *   NUMBER OF ATOMS               - one line with the atom count; the per-atom
 *                                   arrays are grown to hold that many atoms;
 *   BOX BOUNDS pp pp pp           - three lines with lo/hi per dimension;
 *   ATOMS id type x y z vx vy vz  - one line per atom, placed at index id - 1.
 * Reading stops once the atom section has been read. Any other item, or a line
 * that is not an item header where one is expected, ends the program.
 *
 * After reading, the ntypes x ntypes parameter tables are allocated and filled
 * from param.
 *
 * Returns the atom count announced by the file. All error conditions are
 * reported on stderr and end the program with exit status -1.
 */
int readAtom_dmp(Atom* atom, Parameter* param) {
    FILE *fp = fopen(param->input_file, "r");
    char line[MAXLINE];
    int natoms = 0;
    int read_atoms = 0;
    int atom_id = -1;
    int ts = -1;

    if(!fp) {
        fprintf(stderr, "Could not open input file: %s\n", param->input_file);
        exit(-1);
        return -1;
    }

    while(!feof(fp) && ts < 1 && !read_atoms) {
        readline(line, fp);
        if(strncmp(line, "ITEM: ", 6) == 0) {
            char *item = &line[6];

            if(strncmp(item, "TIMESTEP", 8) == 0) {
                readline(line, fp);
                ts = atoi(line);
            } else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
                readline(line, fp);
                natoms = atoi(line);
                atom->Natoms = natoms;
                atom->Nlocal = natoms;
                while(atom->Nlocal >= atom->Nmax) {
                    growAtom(atom);
                }
            } else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
                readline(line, fp);
                param->xlo = atof(strtok(line, " "));
                param->xhi = atof(strtok(NULL, " "));
                param->xprd = param->xhi - param->xlo;

                readline(line, fp);
                param->ylo = atof(strtok(line, " "));
                param->yhi = atof(strtok(NULL, " "));
                param->yprd = param->yhi - param->ylo;

                readline(line, fp);
                param->zlo = atof(strtok(line, " "));
                param->zhi = atof(strtok(NULL, " "));
                param->zprd = param->zhi - param->zlo;
            } else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
                for(int i = 0; i < natoms; i++) {
                    readline(line, fp);
                    /* Ids in the file are 1-based. */
                    atom_id = atoi(strtok(line, " ")) - 1;

                    /* The id comes straight from the file; refuse anything that
                     * would index outside the allocated per-atom arrays. */
                    if(atom_id < 0 || atom_id >= atom->Nmax) {
                        fprintf(stderr, "Input error: atom id %d is out of range!\n", atom_id + 1);
                        exit(-1);
                        return -1;
                    }

                    atom->type[atom_id] = atoi(strtok(NULL, " "));
                    atom_x(atom_id) = atof(strtok(NULL, " "));
                    atom_y(atom_id) = atof(strtok(NULL, " "));
                    atom_z(atom_id) = atof(strtok(NULL, " "));
                    atom->vx[atom_id] = atof(strtok(NULL, " "));
                    atom->vy[atom_id] = atof(strtok(NULL, " "));
                    atom->vz[atom_id] = atof(strtok(NULL, " "));
                    /* NOTE(review): the pdb and gro readers use "type + 1" here;
                     * confirm whether types in this format are 1-based. */
                    atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
                    read_atoms++;
                }
            } else {
                fprintf(stderr, "Invalid item: %s\n", item);
                exit(-1);
                return -1;
            }
        } else {
            fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
            exit(-1);
            return -1;
        }
    }

    if(ts < 0 || !natoms || !read_atoms) {
        fprintf(stderr, "Input error: atom data was not read!\n");
        exit(-1);
        return -1;
    }

    /* Per-type-pair tables: every pair receives the same global parameters. */
    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param->epsilon;
        atom->sigma6[i] = param->sigma6;
        atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
        atom->cutforcesq[i] = param->cutforce * param->cutforce;
    }

    fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
    fclose(fp);
    return natoms;
}
/*
 * Allocate and fill the lookup tables stored in *atom:
 *   exclusion_filter         - entry i holds the single bit (1U << i);
 *   diagonal_4xn_j_minus_i,
 *   diagonal_2xnn_j_minus_i  - index-derived offsets (j - 0.5 and j - 1.5);
 *   masks_2xnn_hn / _fn,
 *   masks_4xn_hn  / _fn      - bit masks indexed by one (CLUSTER_M == CLUSTER_N)
 *                              or two (otherwise) 0/1 conditions.
 *
 * The 2xnn masks pack two per-row masks into one word: the second one is
 * shifted left by half the vector width (half_mask_bits).
 *
 * NOTE(review): the diagonal arrays are sized with sizeof(MD_UINT) but receive
 * MD_FLOAT expressions - confirm the element type of these fields.
 * NOTE(review): (1U << i) is only defined for i < 32 - confirm that
 * CLUSTER_M * VECTOR_WIDTH never exceeds 32.
 */
void initMasks(Atom *atom) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int mask0, mask1, mask2, mask3;

    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));
    //atom->masks_2xnn = allocate(ALIGNMENT, 8 * sizeof(unsigned int));

    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
    }

    /* The upper half of the 2xnn table repeats the lower half shifted by one. */
    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
    }

    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
        atom->exclusion_filter[i] = (1U << i);
    }

#if CLUSTER_M == CLUSTER_N
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x3 * cond0);
        mask2 = (unsigned int)(0xf - 0x7 * cond0);
        mask3 = (unsigned int)(0xf - 0xf * cond0);
        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x2 * cond0);
        mask2 = (unsigned int)(0xf - 0x4 * cond0);
        mask3 = (unsigned int)(0xf - 0x8 * cond0);
        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);

        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
    }
#else
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
#if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif
            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

#if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

#if CLUSTER_M < CLUSTER_N
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);

            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);

            /* Each of the four rows gets its own slot (+0 .. +3), mirroring the
             * mask0..mask3 values of the 2xnn_fn case above. Previously all four
             * stores targeted slot +0, leaving slots +1..+3 unset. */
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
#endif
        }
    }
#endif
}
/*
 * Raise the capacity of the per-atom arrays by DELTA entries.
 *
 * Positions live in a single interleaved array when AOS is defined and in
 * three separate arrays otherwise; velocities and types always use separate
 * arrays. Every array is passed through reallocate() together with its new
 * and its previous size in bytes.
 */
void growAtom(Atom *atom) {
    const int prev_capacity = atom->Nmax;
    atom->Nmax += DELTA;

    const size_t new_scalar_bytes = atom->Nmax * sizeof(MD_FLOAT);
    const size_t old_scalar_bytes = prev_capacity * sizeof(MD_FLOAT);
    const size_t new_type_bytes = atom->Nmax * sizeof(int);
    const size_t old_type_bytes = prev_capacity * sizeof(int);

#ifdef AOS
    /* x holds interleaved x/y/z triples, hence three scalars per atom */
    atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, new_scalar_bytes * 3, old_scalar_bytes * 3);
#else
    atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, new_scalar_bytes, old_scalar_bytes);
    atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, new_scalar_bytes, old_scalar_bytes);
    atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, new_scalar_bytes, old_scalar_bytes);
#endif

    atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, new_scalar_bytes, old_scalar_bytes);
    atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, new_scalar_bytes, old_scalar_bytes);
    atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, new_scalar_bytes, old_scalar_bytes);
    atom->type = (int *) reallocate(atom->type, ALIGNMENT, new_type_bytes, old_type_bytes);
}
/*
 * Raise the capacity of all cluster-format arrays by DELTA clusters.
 *
 * The j-cluster array gets CLUSTER_M / CLUSTER_N entries per i-cluster when
 * M > N (at least one otherwise). Coordinate, force and velocity buffers hold
 * CLUSTER_M * 3 values per cluster; the type buffer holds CLUSTER_M values.
 */
void growClusters(Atom *atom) {
    const int prev_max = atom->Nclusters_max;
    /* If M>N, we need to allocate more j-clusters */
    const int jterm = MAX(1, CLUSTER_M / CLUSTER_N);

    atom->Nclusters_max += DELTA;
    const int next_max = atom->Nclusters_max;

    /* cluster descriptors and bin assignment */
    atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT,
                                            next_max * sizeof(Cluster),
                                            prev_max * sizeof(Cluster));
    atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT,
                                            next_max * jterm * sizeof(Cluster),
                                            prev_max * jterm * sizeof(Cluster));
    atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT,
                                           next_max * sizeof(int),
                                           prev_max * sizeof(int));

    /* per-atom data stored in cluster layout */
    atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT,
                                        next_max * CLUSTER_M * 3 * sizeof(MD_FLOAT),
                                        prev_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT,
                                        next_max * CLUSTER_M * 3 * sizeof(MD_FLOAT),
                                        prev_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT,
                                        next_max * CLUSTER_M * 3 * sizeof(MD_FLOAT),
                                        prev_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT,
                                      next_max * CLUSTER_M * sizeof(int),
                                      prev_max * CLUSTER_M * sizeof(int));
}

View File

@ -1,171 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#ifndef __ATOM_H_
#define __ATOM_H_
// Number of entries by which growAtom() and growClusters() enlarge their arrays.
#define DELTA 20000
// Nbnxn layouts (as of GROMACS):
// Simd4xN: M=4, N=VECTOR_WIDTH
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
// Cuda: M=8, N=VECTOR_WIDTH
//
// This section selects, at compile time, the cluster dimensions
// (CLUSTER_M x CLUSTER_N), the reported KERNEL_NAME and the functions that the
// generic names computeForceLJ / initialIntegrate / finalIntegrate / updatePbc
// resolve to.
#ifdef CUDA_TARGET
// CUDA build: VECTOR_WIDTH is overridden to 8, giving 8x8 clusters.
# undef VECTOR_WIDTH
# define VECTOR_WIDTH 8
# define KERNEL_NAME "CUDA"
# define CLUSTER_M 8
# define CLUSTER_N VECTOR_WIDTH
# define UNROLL_J 1
# define computeForceLJ computeForceLJ_cuda
# define initialIntegrate cudaInitialIntegrate
# define finalIntegrate cudaFinalIntegrate
# define updatePbc cudaUpdatePbc
#else
// CPU build: M is fixed to 4, N depends on VECTOR_WIDTH.
# define CLUSTER_M 4
// Simd2xNN (here used for single-precision)
// Chosen when VECTOR_WIDTH is larger than 2 * CLUSTER_M.
# if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2)
# define UNROLL_I 4
# define UNROLL_J 2
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN
# else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH
# define UNROLL_I 4
# define UNROLL_J 1
# define computeForceLJ computeForceLJ_4xn
# endif
// USE_REFERENCE_VERSION replaces the kernel chosen above with the reference
// kernel; the cluster dimensions chosen above stay in effect.
# ifdef USE_REFERENCE_VERSION
# undef KERNEL_NAME
# undef computeForceLJ
# define KERNEL_NAME "Reference"
# define computeForceLJ computeForceLJ_ref
# endif
# define initialIntegrate cpuInitialIntegrate
# define finalIntegrate cpuFinalIntegrate
# define updatePbc cpuUpdatePbc
#endif
// Index helpers for the cluster-format arrays.
//   CJ0_FROM_CI(a) / CJ1_FROM_CI(a): j-cluster index (or indices) derived from i-cluster a.
//   CI_BASE_INDEX(a,b) / CJ_BASE_INDEX(a,b): offset of cluster a in an array that
//   stores b values per atom (b = 1 for scalars, b = 3 for vectors, see the
//   *_SCALAR_* / *_VECTOR_* wrappers below).
// Only M == N, M == 2N and M == N/2 are supported; anything else is a compile error.
#if CLUSTER_M == CLUSTER_N
# define CJ0_FROM_CI(a) (a)
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
// One i-cluster maps to two j-clusters (2a and 2a+1); a j-cluster addresses one
// half of the storage of i-cluster a/2.
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
// Two i-clusters map to one j-cluster (a/2); an i-cluster addresses one half of
// the storage of that j-cluster.
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#else
# error "Invalid cluster configuration!"
#endif
#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
# error "Cluster N dimension can be only 2, 4 and 8"
#endif
// Base offsets for arrays with one value per atom (scalar) or three (vector).
#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
// Offsets of the x, y and z component groups inside one cluster's vector data:
// components are stored as consecutive groups of max(CLUSTER_M, CLUSTER_N) values.
#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#endif
// Per-cluster metadata.
typedef struct {
// Number of atoms in the cluster; the integrate loops iterate cii < natoms.
int natoms;
// Min/max pairs per axis; presumably the cluster's bounding box — confirm against the code that fills them.
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
} Cluster;
// Container for all atom data, both per-atom arrays and the cluster-format copies.
typedef struct {
// Atom counts. Nmax is the capacity of the per-atom arrays and is raised by growAtom().
// NOTE(review): Natoms/Nlocal/Nghost look like total/local/ghost counts — confirm.
int Natoms, Nlocal, Nghost, Nmax;
// Cluster counts. Nclusters_max is the capacity of the cluster arrays and is raised by growClusters().
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
// Per-atom positions; with AOS defined only x is used and holds interleaved triples (see atom_x/atom_y/atom_z).
MD_FLOAT *x, *y, *z;
// Per-atom velocities, one array per component.
MD_FLOAT *vx, *vy, *vz;
// Per ghost cluster: index of the cluster whose coordinates are copied (see the PBC update code).
int *border_map;
// Per-atom type index.
int *type;
int ntypes;
// Per type-pair parameter tables (ntypes * ntypes entries where they are allocated).
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
// Per ghost cluster: integer multipliers of the box lengths applied by the PBC update.
int *PBCx, *PBCy, *PBCz;
// Data in cluster format
// cl_x / cl_v / cl_f are grown to Nclusters_max * CLUSTER_M * 3 values, cl_type to Nclusters_max * CLUSTER_M.
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *cl_type;
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
MD_UINT *exclusion_filter;
MD_FLOAT *diagonal_4xn_j_minus_i;
MD_FLOAT *diagonal_2xnn_j_minus_i;
// Precomputed mask tables, filled by initMasks().
// NOTE(review): "hn"/"fn" presumably stand for half/full neighbor lists — confirm.
unsigned int masks_2xnn_hn[8];
unsigned int masks_2xnn_fn[8];
unsigned int masks_4xn_hn[16];
unsigned int masks_4xn_fn[16];
} Atom;
// Functions operating on Atom; only growAtom and growClusters are defined in
// the part of the sources shown alongside this header.
extern void initAtom(Atom*);
extern void initMasks(Atom*);
extern void createAtom(Atom*, Parameter*);
extern int readAtom(Atom*, Parameter*);
extern int readAtom_pdb(Atom*, Parameter*);
extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);
// Position accessors. They expand to expressions on a variable named `atom`,
// which therefore has to be in scope (as an Atom pointer) wherever they are used.
#ifdef AOS
// Array-of-structures layout: x, y and z of atom i are adjacent in atom->x.
# define POS_DATA_LAYOUT "AoS"
# define atom_x(i) atom->x[(i) * 3 + 0]
# define atom_y(i) atom->x[(i) * 3 + 1]
# define atom_z(i) atom->x[(i) * 3 + 2]
/*
# define atom_vx(i) atom->vx[(i) * 3 + 0]
# define atom_vy(i) atom->vx[(i) * 3 + 1]
# define atom_vz(i) atom->vx[(i) * 3 + 2]
# define atom_fx(i) atom->fx[(i) * 3 + 0]
# define atom_fy(i) atom->fx[(i) * 3 + 1]
# define atom_fz(i) atom->fx[(i) * 3 + 2]
*/
#else
// Structure-of-arrays layout: one array per component.
# define POS_DATA_LAYOUT "SoA"
# define atom_x(i) atom->x[i]
# define atom_y(i) atom->y[i]
# define atom_z(i) atom->z[i]
#endif
// TODO: allow to switch velocities and forces to AoS
// Velocity and force accessors always use one array per component.
// NOTE(review): the Atom struct in this header declares no fx/fy/fz members, so
// atom_fx/atom_fy/atom_fz only compile where `atom` has such members — confirm they are still used.
# define atom_vx(i) atom->vx[i]
# define atom_vy(i) atom->vy[i]
# define atom_vz(i) atom->vz[i]
# define atom_fx(i) atom->fx[i]
# define atom_fy(i) atom->fy[i]
# define atom_fz(i) atom->fz[i]
#endif

View File

@ -1,317 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
extern "C" {
#include <stdio.h>
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>
}
// File-scope state shared by the host wrappers in this file.
extern "C" {
// Device buffers for cluster-format positions, velocities and forces (allocated in initDevice).
MD_FLOAT *cuda_cl_x;
MD_FLOAT *cuda_cl_v;
MD_FLOAT *cuda_cl_f;
// Device copies of the neighbor lists and of the per-cluster neighbor counts.
int *cuda_neighbors;
int *cuda_numneigh;
// Device copy of the per-i-cluster atom counts.
int *cuda_natoms;
// Host staging buffers (malloc'ed in initDevice) used to gather the atom counts of
// local i-clusters and ghost j-clusters before they are copied to the device.
int *natoms;
int *ngatoms;
// Device copies of the ghost-cluster data used by the PBC update kernel.
int *cuda_border_map;
int *cuda_jclusters_natoms;
// NOTE(review): the cuda_bb* pointers are declared but neither allocated nor used in the code shown here.
MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
// Non-zero when the neighbor lists must be (re-)uploaded; set to 1 in initDevice and
// cleared by computeForceLJ_cuda after the upload. Presumably set again elsewhere after reneighboring.
int isReneighboured;
}
/*
 * Reset and select device 0, then allocate every device buffer and the two
 * host staging arrays used by this file. All sizes are derived from
 * atom->Nclusters_max (and neighbor->maxneighs for the neighbor lists) as they
 * are at the time of the call. Marks the neighbor lists as needing an upload.
 */
extern "C"
void initDevice(Atom *atom, Neighbor *neighbor) {
    cuda_assert("cudaDeviceSetup", cudaDeviceReset());
    cuda_assert("cudaDeviceSetup", cudaSetDevice(0));

    const size_t vec_bytes   = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
    const size_t int_bytes   = atom->Nclusters_max * sizeof(int);
    const size_t neigh_bytes = atom->Nclusters_max * neighbor->maxneighs * sizeof(int);

    /* positions, velocities, forces in cluster layout */
    cuda_cl_x = (MD_FLOAT *) allocateGPU(vec_bytes);
    cuda_cl_v = (MD_FLOAT *) allocateGPU(vec_bytes);
    cuda_cl_f = (MD_FLOAT *) allocateGPU(vec_bytes);

    /* one int per cluster */
    cuda_natoms           = (int *) allocateGPU(int_bytes);
    cuda_jclusters_natoms = (int *) allocateGPU(int_bytes);
    cuda_border_map       = (int *) allocateGPU(int_bytes);
    cuda_PBCx             = (int *) allocateGPU(int_bytes);
    cuda_PBCy             = (int *) allocateGPU(int_bytes);
    cuda_PBCz             = (int *) allocateGPU(int_bytes);
    cuda_numneigh         = (int *) allocateGPU(int_bytes);

    /* maxneighs slots per cluster */
    cuda_neighbors = (int *) allocateGPU(neigh_bytes);

    /* host-side staging arrays for the atom counts */
    natoms  = (int *) malloc(int_bytes);
    ngatoms = (int *) malloc(int_bytes);

    isReneighboured = 1;
}
/*
 * Upload the cluster-format positions, velocities and forces (full capacity,
 * Nclusters_max clusters) plus the per-cluster metadata the kernels need:
 * atom counts of the local i-clusters, atom counts of the ghost j-clusters,
 * the border map and the three PBC multiplier arrays.
 */
extern "C"
void copyDataToCUDADevice(Atom *atom) {
    const size_t vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
    memcpyToGPU(cuda_cl_x, atom->cl_x, vec_bytes);
    memcpyToGPU(cuda_cl_v, atom->cl_v, vec_bytes);
    memcpyToGPU(cuda_cl_f, atom->cl_f, vec_bytes);

    /* gather the atom counts of the local i-clusters into the staging array */
    const int nlocal = atom->Nclusters_local;
    for(int c = 0; c < nlocal; c++) {
        natoms[c] = atom->iclusters[c].natoms;
    }
    memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int));

    /* ghost j-clusters are stored after the j-clusters of the local clusters */
    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    const int first_ghost_cj = atom->Nclusters_local / jfac;
    const int nghost = atom->Nclusters_ghost;
    for(int g = 0; g < nghost; g++) {
        ngatoms[g] = atom->jclusters[first_ghost_cj + g].natoms;
    }

    const size_t ghost_bytes = atom->Nclusters_ghost * sizeof(int);
    memcpyToGPU(cuda_jclusters_natoms, ngatoms, ghost_bytes);
    memcpyToGPU(cuda_border_map, atom->border_map, ghost_bytes);
    memcpyToGPU(cuda_PBCx, atom->PBCx, ghost_bytes);
    memcpyToGPU(cuda_PBCy, atom->PBCy, ghost_bytes);
    memcpyToGPU(cuda_PBCz, atom->PBCz, ghost_bytes);
}
/*
 * Download the cluster-format positions, velocities and forces from the
 * device into the host arrays of `atom` (full capacity, Nclusters_max clusters).
 */
extern "C"
void copyDataFromCUDADevice(Atom *atom) {
    const size_t vec_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);

    memcpyFromGPU(atom->cl_x, cuda_cl_x, vec_bytes);
    memcpyFromGPU(atom->cl_v, cuda_cl_v, vec_bytes);
    memcpyFromGPU(atom->cl_f, cuda_cl_f, vec_bytes);
}
/*
 * Release every device buffer allocated by initDevice() and the two host
 * staging arrays. The pointers themselves are left unchanged.
 */
extern "C"
void cudaDeviceFree() {
    /* same release order as the allocations are listed in the original code */
    void *device_buffers[] = {
        cuda_cl_x, cuda_cl_v, cuda_cl_f,
        cuda_numneigh, cuda_neighbors,
        cuda_natoms, cuda_border_map, cuda_jclusters_natoms,
        cuda_PBCx, cuda_PBCy, cuda_PBCz
    };
    const int nbuffers = (int)(sizeof(device_buffers) / sizeof(device_buffers[0]));

    for(int b = 0; b < nbuffers; b++) {
        cuda_assert("cudaDeviceFree", cudaFree(device_buffers[b]));
    }

    free(natoms);
    free(ngatoms);
}
/*
 * First half of the time step: for every atom of every local i-cluster,
 * advance the velocity by dtforce * force and then the position by
 * dt * (updated velocity).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per i-cluster; threads whose
 * index is >= Nclusters_local do nothing.
 */
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                          int *cuda_natoms,
                                          int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
    const unsigned int ci = blockDim.x * blockIdx.x + threadIdx.x;
    if (ci >= Nclusters_local) {
        return;
    }

    const int base = CI_VECTOR_BASE_INDEX(ci);
    MD_FLOAT *pos = &cuda_cl_x[base];
    MD_FLOAT *vel = &cuda_cl_v[base];
    MD_FLOAT *frc = &cuda_cl_f[base];

    const int atoms_in_cluster = cuda_natoms[ci];
    for (int a = 0; a < atoms_in_cluster; a++) {
        const int ix = CL_X_OFFSET + a;
        const int iy = CL_Y_OFFSET + a;
        const int iz = CL_Z_OFFSET + a;

        /* velocity update first: the position update uses the new velocity */
        vel[ix] += dtforce * frc[ix];
        vel[iy] += dtforce * frc[iy];
        vel[iz] += dtforce * frc[iz];

        pos[ix] += dt * vel[ix];
        pos[iy] += dt * vel[iy];
        pos[iz] += dt * vel[iz];
    }
}
/*
 * Refresh the coordinates of the ghost j-clusters: each ghost cluster copies
 * the coordinates of the cluster named by cuda_border_map and shifts them by
 * an integer multiple (cuda_PBCx/y/z) of the box length per axis.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per ghost cluster; threads
 * whose index is >= Nclusters_ghost do nothing.
 */
__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
                                   int *cuda_jclusters_natoms,
                                   int *cuda_PBCx,
                                   int *cuda_PBCy,
                                   int *cuda_PBCz,
                                   int Nclusters_local,
                                   int Nclusters_ghost,
                                   MD_FLOAT param_xprd,
                                   MD_FLOAT param_yprd,
                                   MD_FLOAT param_zprd) {
    const unsigned int ghost = blockDim.x * blockIdx.x + threadIdx.x;
    if (ghost >= Nclusters_ghost) {
        return;
    }

    /* ghost j-clusters follow the j-clusters that belong to the local clusters */
    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    const int first_ghost_cj = Nclusters_local / jfac;
    const int cj = first_ghost_cj + ghost;

    const int dst_base = CJ_VECTOR_BASE_INDEX(cj);
    const int src_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[ghost]);
    MD_FLOAT *dst = &cuda_cl_x[dst_base];
    MD_FLOAT *src = &cuda_cl_x[src_base];

    const int shift_x = cuda_PBCx[ghost];
    const int shift_y = cuda_PBCy[ghost];
    const int shift_z = cuda_PBCz[ghost];

    const int atoms_in_cluster = cuda_jclusters_natoms[ghost];
    for (int a = 0; a < atoms_in_cluster; a++) {
        dst[CL_X_OFFSET + a] = src[CL_X_OFFSET + a] + shift_x * param_xprd;
        dst[CL_Y_OFFSET + a] = src[CL_Y_OFFSET + a] + shift_y * param_yprd;
        dst[CL_Z_OFFSET + a] = src[CL_Z_OFFSET + a] + shift_z * param_zprd;
    }
}
/*
 * Lennard-Jones force kernel on cluster pairs.
 *
 * Thread mapping: x index -> i-cluster, y index -> atom slot inside the
 * i-cluster, z index -> atom slot inside the j-cluster. Each thread walks the
 * neighbor list of its i-cluster and handles one (i-atom, j-atom) slot pair per
 * neighbor cluster. Contributions are accumulated into cuda_cl_f with
 * atomicAdd; with half_neigh set, the opposite force is also added to the
 * j-atom.
 *
 * The neighbor list of cluster ci starts at cuda_neighs[ci * maxneighs] and has
 * cuda_numneigh[ci] entries. Nclusters_max is not used by the kernel.
 *
 * NOTE(review): `cond` is only assigned for CLUSTER_M <= CLUSTER_N; the literals
 * 1.0 / 48.0 / 0.5 are double constants, so the arithmetic is done in double
 * when MD_FLOAT is float.
 */
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
                                         int Nclusters_local, int Nclusters_max,
                                         int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
                                         MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
    const unsigned int ci_pos  = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    const unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;

    const bool in_range = (ci_pos < Nclusters_local) && (cii_pos < CLUSTER_M) && (cjj_pos < CLUSTER_N);
    if (!in_range) {
        return;
    }

    const int ci_cj0 = CJ0_FROM_CI(ci_pos);
    const int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    /* the kernel never writes positions, so the i-atom position is loaded once */
    const MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
    const MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
    const MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];

    const int *neigh_row = &cuda_neighs[ci_pos * maxneighs];
    const int numneighs = cuda_numneigh[ci_pos];

    for (int k = 0; k < numneighs; k++) {
        const int cj = neigh_row[k];
        const int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];

        /* Skip self-interaction inside the cluster's own j-cluster; with half
           neighbor lists additionally keep only one ordering of each pair. */
        int cond;
#if CLUSTER_M == CLUSTER_N
        cond = (ci_cj0 != cj) ||
               (half_neigh ? (cii_pos < cjj_pos) : (cii_pos != cjj_pos));
#elif CLUSTER_M < CLUSTER_N
        cond = (ci_cj0 != cj) ||
               (half_neigh ? (cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos)
                           : (cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos));
#endif
        if (!cond) {
            continue;
        }

        const MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
        const MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
        const MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
        const MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

        /* written as !(a < b) so that a NaN distance is skipped as before */
        if (!(rsq < cutforcesq)) {
            continue;
        }

        const MD_FLOAT sr2 = 1.0 / rsq;
        const MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
        const MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;

        if (half_neigh) {
            atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
            atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
            atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
        }

        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;
        fix += delx * force;
        fiy += dely * force;
        fiz += delz * force;
        atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
        atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
        atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
    }
}
/*
 * Second half of the time step: for every atom of every local i-cluster,
 * advance the velocity by dtforce * force.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per i-cluster; threads whose
 * index is >= Nclusters_local do nothing.
 */
__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                        int *cuda_natoms,
                                        int Nclusters_local, MD_FLOAT dtforce) {
    const unsigned int ci = blockDim.x * blockIdx.x + threadIdx.x;
    if (ci >= Nclusters_local) {
        return;
    }

    const int base = CI_VECTOR_BASE_INDEX(ci);
    MD_FLOAT *vel = &cuda_cl_v[base];
    MD_FLOAT *frc = &cuda_cl_f[base];

    const int atoms_in_cluster = cuda_natoms[ci];
    for (int a = 0; a < atoms_in_cluster; a++) {
        vel[CL_X_OFFSET + a] += dtforce * frc[CL_X_OFFSET + a];
        vel[CL_Y_OFFSET + a] += dtforce * frc[CL_Y_OFFSET + a];
        vel[CL_Z_OFFSET + a] += dtforce * frc[CL_Z_OFFSET + a];
    }
}
/*
 * Host wrapper for cudaInitialIntegrate_warp: launches one thread per local
 * i-cluster (16 threads per block) on the device buffers owned by this file
 * and waits for completion. Launch and execution errors go through cuda_assert.
 */
extern "C"
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
    const int threads_per_block = 16;
    /* "+ 1" guarantees at least one block; surplus threads exit in the kernel */
    const int num_blocks = atom->Nclusters_local / threads_per_block + 1;

    const dim3 block_size(threads_per_block, 1, 1);
    const dim3 grid_size(num_blocks, 1, 1);

    cudaInitialIntegrate_warp<<<grid_size, block_size>>>(
        cuda_cl_x, cuda_cl_v, cuda_cl_f,
        cuda_natoms,
        atom->Nclusters_local, param->dtforce, param->dt);

    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
}
/*
 * Update the coordinates of the ghost clusters on the device.
 * Uses the mapping created in setupPbc (border map and PBC multipliers, which
 * must already have been copied to the device). Launches one thread per ghost
 * cluster (512 threads per block) and waits for completion.
 */
extern "C"
void cudaUpdatePbc(Atom *atom, Parameter *param) {
    const int threads_per_block = 512;
    /* "+ 1" guarantees at least one block; surplus threads exit in the kernel */
    const int num_blocks = atom->Nclusters_ghost / threads_per_block + 1;

    const dim3 block_size(threads_per_block, 1, 1);
    const dim3 grid_size(num_blocks, 1, 1);

    cudaUpdatePbc_warp<<<grid_size, block_size>>>(
        cuda_cl_x, cuda_border_map,
        cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
        atom->Nclusters_local, atom->Nclusters_ghost,
        param->xprd, param->yprd, param->zprd);

    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
}
/*
 * Host wrapper for the Lennard-Jones force kernel.
 *
 * Zeroes the device force buffer, uploads the neighbor lists if they are
 * flagged as stale (isReneighboured), launches computeForceLJ_cuda_warp with
 * one block of CLUSTER_M x CLUSTER_N threads per i-cluster and waits for it.
 *
 * Returns the wall-clock time (getTimeStamp difference) spent in the kernel
 * launch and synchronisation; the memset and the neighbor upload are not
 * included. `stats` is not used.
 *
 * The neighbor upload is done with two bulk transfers (all neighbor counts,
 * then the first Nclusters_local rows of the neighbor table) instead of two
 * small transfers per cluster. Host and device tables share the row stride
 * neighbor->maxneighs, so the layout on the device is unchanged; unused slots
 * at the end of a row are copied along and never read by the kernel.
 */
extern "C"
double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));

    if (isReneighboured) {
        if (atom->Nclusters_local > 0) {
            const size_t nlocal = (size_t) atom->Nclusters_local;
            memcpyToGPU(cuda_numneigh, neighbor->numneigh, nlocal * sizeof(int));
            memcpyToGPU(cuda_neighbors, neighbor->neighbors,
                        nlocal * (size_t) neighbor->maxneighs * sizeof(int));
        }
        isReneighboured = 0;
    }

    const int threads_num = 1;
    dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
    /* "+ 1" guarantees at least one block; surplus threads exit in the kernel */
    dim3 grid_size = dim3(atom->Nclusters_local / threads_num + 1, 1, 1);

    double S = getTimeStamp();
    LIKWID_MARKER_START("force");
    computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
                                                        atom->Nclusters_local, atom->Nclusters_max,
                                                        cuda_numneigh, cuda_neighbors,
                                                        neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
                                                        sigma6, epsilon);
    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();

    return E - S;
}
/*
 * Host wrapper for cudaFinalIntegrate_warp: launches one thread per local
 * i-cluster (16 threads per block) and waits for completion.
 *
 * The kernel's last parameter is the force time-step factor, so it receives
 * param->dtforce, the same value the CPU implementation (cpuFinalIntegrate)
 * and cudaInitialIntegrate use for the velocity update. Passing param->dt
 * here would apply a different factor than the CPU path.
 */
extern "C"
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
    const int threads_num = 16;
    dim3 block_size = dim3(threads_num, 1, 1);
    /* "+ 1" guarantees at least one block; surplus threads exit in the kernel */
    dim3 grid_size = dim3(atom->Nclusters_local / (threads_num) + 1, 1, 1);

    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
                                                       atom->Nclusters_local, param->dtforce);

    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
}

File diff suppressed because it is too large Load Diff

View File

@ -1,56 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdbool.h>
//---
#include <atom.h>
#include <parameter.h>
#include <util.h>
/*
 * First half of the time step on the CPU: for every atom of every local
 * i-cluster, advance the velocity by dtforce * force and then the position by
 * dt * (updated velocity). Operates on the cluster-format arrays of `atom`.
 */
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
    DEBUG_MESSAGE("cpuInitialIntegrate start\n");

    const int nclusters = atom->Nclusters_local;
    for(int c = 0; c < nclusters; c++) {
        const int base = CI_VECTOR_BASE_INDEX(c);
        MD_FLOAT *pos = &atom->cl_x[base];
        MD_FLOAT *vel = &atom->cl_v[base];
        MD_FLOAT *frc = &atom->cl_f[base];
        const int natoms_c = atom->iclusters[c].natoms;

        for(int a = 0; a < natoms_c; a++) {
            const int ix = CL_X_OFFSET + a;
            const int iy = CL_Y_OFFSET + a;
            const int iz = CL_Z_OFFSET + a;

            /* velocity update first: the position update uses the new velocity */
            vel[ix] += param->dtforce * frc[ix];
            vel[iy] += param->dtforce * frc[iy];
            vel[iz] += param->dtforce * frc[iz];

            pos[ix] += param->dt * vel[ix];
            pos[iy] += param->dt * vel[iy];
            pos[iz] += param->dt * vel[iz];
        }
    }

    DEBUG_MESSAGE("cpuInitialIntegrate end\n");
}
/*
 * Second half of the time step on the CPU: for every atom of every local
 * i-cluster, advance the velocity by dtforce * force.
 */
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
    DEBUG_MESSAGE("cpuFinalIntegrate start\n");

    const int nclusters = atom->Nclusters_local;
    for(int c = 0; c < nclusters; c++) {
        const int base = CI_VECTOR_BASE_INDEX(c);
        MD_FLOAT *vel = &atom->cl_v[base];
        MD_FLOAT *frc = &atom->cl_f[base];
        const int natoms_c = atom->iclusters[c].natoms;

        for(int a = 0; a < natoms_c; a++) {
            vel[CL_X_OFFSET + a] += param->dtforce * frc[CL_X_OFFSET + a];
            vel[CL_Y_OFFSET + a] += param->dtforce * frc[CL_Y_OFFSET + a];
            vel[CL_Z_OFFSET + a] += param->dtforce * frc[CL_Z_OFFSET + a];
        }
    }

    DEBUG_MESSAGE("cpuFinalIntegrate end\n");
}
#ifdef CUDA_TARGET
void cudaInitialIntegrate(Parameter*, Atom*);
void cudaFinalIntegrate(Parameter*, Atom*);
#endif

View File

@ -1,344 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
//---
#include <likwid-marker.h>
//---
#include <timing.h>
#include <allocate.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <thermo.h>
#include <eam.h>
#include <pbc.h>
#include <timers.h>
#include <util.h>
#define HLINE "----------------------------------------------------------------------------\n"
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
// Patterns
#define P_SEQ 0
#define P_FIX 1
#define P_RAND 2
/*
 * Fill `param` with the default settings of the stub benchmark.
 * Every field assigned here can afterwards be overridden by main().
 */
void init(Parameter *param) {
    /* input files and force field */
    param->input_file = NULL;
    param->eam_file = NULL;
    param->force_field = FF_LJ;

    /* Lennard-Jones parameters and system description */
    param->epsilon = 1.0;
    param->sigma6 = 1.0;
    param->rho = 0.8442;
    param->mass = 1.0;
    param->ntypes = 4;
    param->lattice = 1.0;
    param->nx = 1;
    param->ny = 1;
    param->nz = 1;

    /* cutoffs: the neighbor cutoff equals the (very large) force cutoff */
    param->cutforce = 1000000.0;
    param->cutneigh = param->cutforce;
    param->half_neigh = 0;

    /* run control */
    param->ntimes = 200;
    param->proc_freq = 2.4;

    /* marked "Unused" by the original author for this stub */
    param->dt = 0.005;
    param->dtforce = 0.5 * param->dt;
    param->nstat = 100;
    param->temp = 1.44;
    param->reneigh_every = 20;
}
/*
 * Build synthetic neighbor lists for the stub benchmark.
 *
 * Every local i-cluster gets `nneighs` j-cluster neighbors, chosen according to
 * `pattern`, and that list is repeated `nreps` times:
 *   P_SEQ : consecutive j-clusters starting at the cluster's own j-cluster,
 *           wrapping around at the number of local j-clusters;
 *   P_FIX : the same list 0 .. nneighs-1 for every cluster;
 *   P_RAND: nneighs distinct j-clusters drawn with rand().
 * All entries get the interaction mask NBNXN_INTERACTION_MASK_ALL.
 * numneigh_masked is set to the full list length when `masked` is 1, else 0.
 *
 * Rows of the neighbor table are addressed with the stride neighbor->maxneighs
 * (here and in the force kernels), so the tables are sized with that stride
 * and the stride is raised to nneighs * nreps if it is smaller. Previously the
 * tables were sized with nneighs * nreps regardless of the stride, which could
 * overflow the buffers or make rows overlap.
 *
 * Exits with an error message on invalid pattern/size combinations or when
 * memory cannot be allocated.
 */
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
    const int maxneighs = nneighs * nreps;
    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    const int ncj = atom->Nclusters_local / jfac;
    const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;

    /* a row must be able to hold the replicated list */
    if(neighbor->maxneighs < maxneighs) {
        neighbor->maxneighs = maxneighs;
    }

    const size_t nrows = (size_t) atom->Nclusters_max;
    const size_t nslots = nrows * (size_t) neighbor->maxneighs;

    neighbor->numneigh = (int*) malloc(nrows * sizeof(int));
    neighbor->numneigh_masked = (int*) malloc(nrows * sizeof(int));
    neighbor->neighbors = (int*) malloc(nslots * sizeof(int));
    neighbor->neighbors_imask = (unsigned int*) malloc(nslots * sizeof(unsigned int));

    /* malloc(0) may legitimately return NULL, so only non-empty requests are checked */
    if((nrows > 0 && (neighbor->numneigh == NULL || neighbor->numneigh_masked == NULL)) ||
       (nslots > 0 && (neighbor->neighbors == NULL || neighbor->neighbors_imask == NULL))) {
        fprintf(stderr, "Error: Could not allocate memory for neighbor lists!\n");
        exit(-1);
    }

    if(pattern == P_RAND && ncj <= nneighs) {
        fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
        exit(-1);
    }

    /* P_SEQ wraps the j-cluster index modulo ncj, which must not be zero */
    if(pattern == P_SEQ && ncj < 1 && atom->Nclusters_local > 0 && nneighs > 0) {
        fprintf(stderr, "Error: P_SEQ: Number of j-clusters should be at least one!\n");
        exit(-1);
    }

    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
        unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
        int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
        int m = (pattern == P_SEQ) ? ncj : nneighs;

        for(int k = 0; k < nneighs; k++) {
            if(pattern == P_RAND) {
                /* redraw until the candidate differs from all earlier entries */
                int found = 0;
                do {
                    int cj = rand() % ncj;
                    neighptr[k] = cj;
                    neighptr_imask[k] = imask;
                    found = 0;
                    for(int l = 0; l < k; l++) {
                        if(neighptr[l] == cj) {
                            found = 1;
                            break;
                        }
                    }
                } while(found == 1);
            } else {
                neighptr[k] = j;
                neighptr_imask[k] = imask;
                j = (j + 1) % m;
            }
        }

        /* replicate the first nneighs entries nreps - 1 more times */
        for(int r = 1; r < nreps; r++) {
            for(int k = 0; k < nneighs; k++) {
                neighptr[r * nneighs + k] = neighptr[k];
                neighptr_imask[r * nneighs + k] = neighptr_imask[k];
            }
        }

        neighbor->numneigh[ci] = nneighs * nreps;
        neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
    }
}
/*
 * Return the value that follows the option at argv[*i] and advance *i to it.
 * Exits with an error message when the option is the last command-line token.
 */
static const char *stubOptionValue(int argc, const char *argv[], int *i) {
    if(*i + 1 >= argc) {
        fprintf(stderr, "Missing value for option %s!\n", argv[*i]);
        exit(-1);
    }

    *i += 1;
    return argv[*i];
}

/*
 * Stub benchmark driver: builds a synthetic set of i-clusters and neighbor
 * lists, runs the selected force kernel param.ntimes times and reports the
 * accumulated kernel time (plain text or CSV).
 *
 * Compared to the previous version, options that need a value are rejected
 * with a message when the value is missing (instead of reading past argv),
 * -na is validated against the cluster size (values outside 0..CLUSTER_M made
 * the initialisation loops write outside the cluster), the timer array handed
 * to displayStatistics is fully initialised, and the help text lists -e and -m.
 */
int main(int argc, const char *argv[]) {
    Eam eam;
    Atom atom_data;
    Atom *atom = (Atom *)(&atom_data);
    Neighbor neighbor;
    Stats stats;
    Parameter param;
    char *pattern_str = NULL;
    int pattern = P_SEQ;
    int niclusters = 256; // Number of local i-clusters
    int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
    int nneighs = 9; // Number of j-cluster neighbors per i-cluster
    int masked = 0; // Use masked loop
    int nreps = 1;
    int csv = 0;

    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("force");
    DEBUG_MESSAGE("Initializing parameters...\n");
    init(&param);

    /* argv[0] is the program name, options start at index 1 */
    for(int i = 1; i < argc; i++) {
        if((strcmp(argv[i], "-f") == 0)) {
            if((param.force_field = str2ff(stubOptionValue(argc, argv, &i))) < 0) {
                fprintf(stderr, "Invalid force field!\n");
                exit(-1);
            }
            continue;
        }
        if((strcmp(argv[i], "-p") == 0)) {
            pattern_str = strdup(stubOptionValue(argc, argv, &i));
            if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
            else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
            else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
            else {
                fprintf(stderr, "Invalid pattern!\n");
                exit(-1);
            }
            continue;
        }
        if((strcmp(argv[i], "-e") == 0)) {
            param.eam_file = strdup(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-m") == 0)) {
            masked = 1;
            continue;
        }
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
            param.ntimes = atoi(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-ni") == 0)) {
            niclusters = atoi(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-na") == 0)) {
            iclusters_natoms = atoi(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nn") == 0)) {
            nneighs = atoi(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nr") == 0)) {
            nreps = atoi(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--freq") == 0)) {
            param.proc_freq = atof(stubOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--csv") == 0)) {
            csv = 1;
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-f <string>: force field (lj or eam), default lj\n");
            printf("-p <string>: pattern for data accesses (seq, fix or rand)\n");
            printf("-e <string>: EAM input file\n");
            printf("-m: use masked loop\n");
            printf("-n / --nsteps <int>: number of timesteps for simulation\n");
            printf("-ni <int>: number of i-clusters (default 256)\n");
            printf("-na <int>: number of atoms per i-cluster (default %d)\n", CLUSTER_M);
            printf("-nn <int>: number of j-cluster neighbors per i-cluster (default 9)\n");
            printf("-nr <int>: number of times neighbor lists should be replicated (default 1)\n");
            printf("--freq <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
            printf("--csv: set output as CSV style\n");
            printf(HLINE);
            exit(EXIT_SUCCESS);
        }
    }

    /* a cluster has CLUSTER_M atom slots; anything else would index outside of it */
    if(iclusters_natoms < 0 || iclusters_natoms > CLUSTER_M) {
        fprintf(stderr, "Number of atoms per i-cluster must be between 0 and %d!\n", CLUSTER_M);
        exit(-1);
    }

    if(pattern_str == NULL) {
        pattern_str = strdup("seq\0");
    }

    if(param.force_field == FF_EAM) {
        DEBUG_MESSAGE("Initializing EAM parameters...\n");
        initEam(&eam, &param);
    }

    DEBUG_MESSAGE("Initializing atoms...\n");
    initAtom(atom);
    initStats(&stats);

    /* per type-pair parameter tables, all pairs get the same values */
    atom->ntypes = param.ntypes;
    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param.epsilon;
        atom->sigma6[i] = param.sigma6;
        atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
        atom->cutforcesq[i] = param.cutforce * param.cutforce;
    }

    DEBUG_MESSAGE("Creating atoms...\n");
    while(atom->Nmax < niclusters * iclusters_natoms) {
        growAtom(atom);
    }

    while(atom->Nclusters_max < niclusters) {
        growClusters(atom);
    }

    for(int ci = 0; ci < niclusters; ++ci) {
        int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
        int *ci_type = &atom->cl_type[ci_sca_base];

        /* valid atoms: all three coordinates get the same small, unique value */
        for(int cii = 0; cii < iclusters_natoms; ++cii) {
            ci_x[CL_X_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
            ci_x[CL_Y_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
            ci_x[CL_Z_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
            ci_v[CL_X_OFFSET + cii] = 0.0;
            ci_v[CL_Y_OFFSET + cii] = 0.0;
            ci_v[CL_Z_OFFSET + cii] = 0.0;
            ci_type[cii] = rand() % atom->ntypes;
            atom->Nlocal++;
        }

        /* unused slots of the cluster are placed at infinity */
        for(int cii = iclusters_natoms; cii < CLUSTER_M; cii++) {
            ci_x[CL_X_OFFSET + cii] = INFINITY;
            ci_x[CL_Y_OFFSET + cii] = INFINITY;
            ci_x[CL_Z_OFFSET + cii] = INFINITY;
        }

        atom->iclusters[ci].natoms = iclusters_natoms;
        atom->Nclusters_local++;
    }

    const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
    const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
    const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);

    if(!csv) {
        printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
        printf("Floating-point precision: %s\n", PRECISION_STRING);
        printf("Pattern: %s\n", pattern_str);
        printf("Number of timesteps: %d\n", param.ntimes);
        printf("Number of i-clusters: %d\n", niclusters);
        printf("Number of atoms per i-cluster: %d\n", iclusters_natoms);
        printf("Number of j-cluster neighbors per i-cluster: %d\n", nneighs);
        printf("Number of times to replicate neighbor lists: %d\n", nreps);
        printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
        printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
    }

    DEBUG_MESSAGE("Defining j-clusters...\n");
    defineJClusters(atom);
    DEBUG_MESSAGE("Initializing neighbor lists...\n");
    initNeighbor(&neighbor, &param);
    DEBUG_MESSAGE("Creating neighbor lists...\n");
    createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
    DEBUG_MESSAGE("Computing forces...\n");

    double T_accum = 0.0;
    for(int i = 0; i < param.ntimes; i++) {
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
        traceAddresses(&param, atom, &neighbor, i + 1);
#endif
        if(param.force_field == FF_EAM) {
            T_accum += computeForceEam(&eam, &param, atom, &neighbor, &stats);
        } else {
            T_accum += computeForceLJ(&param, atom, &neighbor, &stats);
        }
    }

    double freq_hz = param.proc_freq * 1.e9;
    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
    const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);

    if(!csv) {
        printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
        if(param.proc_freq > 0.0) {
            printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
        }
    } else {
        printf("steps,pattern,niclusters,iclusters_natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
        if(param.proc_freq > 0.0) {
            printf(",cy/atom,cy/neigh");
        }
        printf("\n");

        printf("%d,%s,%d,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
            param.ntimes, pattern_str, niclusters, iclusters_natoms, nneighs, nreps,
            estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);

        if(param.proc_freq > 0.0) {
            printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
        }
        printf("\n");
    }

    /* only the force timer is measured here; the others are reported as zero */
    double timer[NUMTIMER];
    for(int t = 0; t < NUMTIMER; t++) {
        timer[t] = 0.0;
    }
    timer[FORCE] = T_accum;
    displayStatistics(atom, &param, &stats, timer);
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

View File

@ -1,344 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <omp.h>
//--
#include <likwid-marker.h>
//--
#include <atom.h>
#include <allocate.h>
#include <device.h>
#include <eam.h>
#include <integrate.h>
#include <neighbor.h>
#include <parameter.h>
#include <pbc.h>
#include <stats.h>
#include <thermo.h>
#include <timers.h>
#include <timing.h>
#include <util.h>
#include <vtk.h>
#include <xtc.h>
#define HLINE "----------------------------------------------------------------------------\n"
/*
 * Force kernels implemented in other translation units. main() adds the
 * double returned by the selected kernel to timer[FORCE].
 * NOTE(review): main() calls computeForceLJ(), which is not declared here;
 * presumably it is a macro that selects one of the variants below - confirm.
 */
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
#ifdef CUDA_TARGET
/* Set to 1 by main() after each reneighbouring step once data was copied back to the device. */
extern int isReneighboured;
extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
/* Host <-> device transfer helpers; main() calls them around reneighbour() and after the time loop. */
extern void copyDataToCUDADevice(Atom *atom);
extern void copyDataFromCUDADevice(Atom *atom);
/* Called once by main() after the time loop has finished. */
extern void cudaDeviceFree();
#endif
/*
 * Prepares the whole simulation state: optional EAM tables, box dimensions,
 * atoms, thermo data, clusters, periodic images, neighbor lists and the
 * device state.
 *
 * Returns the elapsed time reported by getTimeStamp() between the start of
 * atom initialization and the end of initDevice(). The EAM initialization
 * and the box-size computation are not part of the measured interval.
 */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
    double t_begin, t_end;

    if(param->force_field == FF_EAM) {
        initEam(eam, param);
    }

    /* Lattice spacing follows from the density; box extents from the cell counts. */
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    t_begin = getTimeStamp();

    initAtom(atom);
    initPbc(atom);
    initStats(stats);
    initNeighbor(neighbor, param);

    /* Atoms are either read from the input file or generated. */
    if(param->input_file != NULL) {
        readAtom(atom, param);
    } else {
        createAtom(atom, param);
    }

    setupNeighbor(param, atom);
    setupThermo(param, atom->Natoms);

    /* Thermo adjustment is only applied to generated atoms. */
    if(param->input_file == NULL) {
        adjustThermo(param, atom);
    }

    buildClusters(atom);
    defineJClusters(atom);
    setupPbc(atom, param);
    binClusters(atom);
    buildNeighbor(atom, neighbor);
    initDevice(atom, neighbor);

    t_end = getTimeStamp();
    return t_end - t_begin;
}
/*
 * Rebuilds clusters, periodic images and neighbor lists from the current
 * atom positions. The work is wrapped in the "reneighbour" LIKWID region.
 *
 * Returns the elapsed time between the two getTimeStamp() calls that
 * enclose the region.
 */
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
    const double t_begin = getTimeStamp();

    LIKWID_MARKER_START("reneighbour");

    /* Bring per-atom data and periodic wrapping up to date first ... */
    updateSingleAtoms(atom);
    updateAtomsPbc(atom, param);

    /* ... then rebuild everything that depends on the positions. */
    buildClusters(atom);
    defineJClusters(atom);
    setupPbc(atom, param);
    binClusters(atom);
    buildNeighbor(atom, neighbor);

    LIKWID_MARKER_STOP("reneighbour");

    const double t_end = getTimeStamp();
    return t_end - t_begin;
}
/*
 * Prints the atom counters (total, local, ghost, capacity) on one line of stdout.
 */
void printAtomState(Atom *atom) {
    const int n_total = atom->Natoms;
    const int n_local = atom->Nlocal;
    const int n_ghost = atom->Nghost;
    const int n_max = atom->Nmax;

    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
           n_total, n_local, n_ghost, n_max);

    /* Disabled per-atom position dump, kept for debugging: */
    /* int nall = atom->Nlocal + atom->Nghost; */
    /* for (int i=0; i<nall; i++) { */
    /* printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
    /* } */
}
/*
 * Returns the value that follows the option at argv[*i] and advances *i to
 * it. Terminates the program with an error message when the option is the
 * last command-line word, instead of handing a NULL pointer to the caller.
 */
static char *requireOptionValue(int argc, char **argv, int *i) {
    if(*i + 1 >= argc) {
        fprintf(stderr, "Missing value for option %s!\n", argv[*i]);
        exit(-1);
    }

    return argv[++(*i)];
}

/*
 * Program entry point: parses the command line, sets up the simulation,
 * runs param.ntimes time steps (integration, PBC update / reneighbouring,
 * force computation, optional output) and prints timing results.
 *
 * Returns EXIT_SUCCESS; invalid options terminate the program via exit().
 */
int main(int argc, char** argv) {
    double timer[NUMTIMER];
    Eam eam;
    Atom atom;
    Neighbor neighbor;
    Stats stats;
    Parameter param;

    LIKWID_MARKER_INIT;
    #pragma omp parallel
    {
        LIKWID_MARKER_REGISTER("force");
        //LIKWID_MARKER_REGISTER("reneighbour");
        //LIKWID_MARKER_REGISTER("pbc");
    }

    initParameter(&param);

    /* argv[0] is the program name, so option parsing starts at index 1. */
    for(int i = 1; i < argc; i++) {
        if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
            readParameter(&param, requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-f") == 0)) {
            if((param.force_field = str2ff(requireOptionValue(argc, argv, &i))) < 0) {
                fprintf(stderr, "Invalid force field!\n");
                exit(-1);
            }
            continue;
        }
        if((strcmp(argv[i], "-i") == 0)) {
            param.input_file = strdup(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-e") == 0)) {
            param.eam_file = strdup(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
            param.ntimes = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nx") == 0)) {
            param.nx = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-ny") == 0)) {
            param.ny = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nz") == 0)) {
            param.nz = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-half") == 0)) {
            param.half_neigh = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
            param.mass = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
            param.cutforce = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
            param.skin = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--freq") == 0)) {
            param.proc_freq = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--vtk") == 0)) {
            param.vtk_file = strdup(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--xtc") == 0)) {
            #ifndef XTC_OUTPUT
            fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!");
            exit(-1);
            #else
            param.xtc_file = strdup(requireOptionValue(argc, argv, &i));
            #endif
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-p <string>: file to read parameters from (can be specified more than once)\n");
            printf("-f <string>: force field (lj or eam), default lj\n");
            printf("-i <string>: input file with atom positions (dump)\n");
            printf("-e <string>: input file for EAM\n");
            printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
            printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
            printf("-r / --radius <real>: set cutoff radius\n");
            printf("-s / --skin <real>: set skin (verlet buffer)\n");
            printf("--freq <real>: processor frequency (GHz)\n");
            printf("--vtk <string>: VTK file for visualization\n");
            printf("--xtc <string>: XTC file for visualization\n");
            printf(HLINE);
            exit(EXIT_SUCCESS);
        }
    }

    /* The neighbor cutoff is the force cutoff plus the skin. */
    param.cutneigh = param.cutforce + param.skin;
    setup(&param, &eam, &atom, &neighbor, &stats);
    printParameter(&param);
    printf(HLINE);
    printf("step\ttemp\t\tpressure\n");
    computeThermo(0, &param, &atom);

    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
    /* No loop counter exists yet; this traces the initial force evaluation.
     * NOTE(review): step index 0 is assumed for the initial state (the time
     * loop below uses n + 1) - confirm against traceAddresses(). */
    traceAddresses(&param, &atom, &neighbor, 0);
    #endif

    #ifdef CUDA_TARGET
    copyDataToCUDADevice(&atom);
    #endif

    /* Initial force evaluation before the first integration step. */
    if(param.force_field == FF_EAM) {
        timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
    } else {
        timer[FORCE] = computeForceLJ(&param, &atom, &neighbor, &stats);
    }

    timer[NEIGH] = 0.0;
    timer[TOTAL] = getTimeStamp();

    if(param.vtk_file != NULL) {
        write_data_to_vtk_file(param.vtk_file, &atom, 0);
    }

    if(param.xtc_file != NULL) {
        xtc_init(param.xtc_file, &atom, 0);
    }

    for(int n = 0; n < param.ntimes; n++) {
        initialIntegrate(&param, &atom);

        if((n + 1) % param.reneigh_every) {
            /* Regular step: optionally prune the lists and refresh periodic images. */
            if(!((n + 1) % param.prune_every)) {
                pruneNeighbor(&param, &atom, &neighbor);
            }

            updatePbc(&atom, &param, 0);
        } else {
            /* Reneighbouring step: runs on the host, so data is copied back and forth. */
            #ifdef CUDA_TARGET
            copyDataFromCUDADevice(&atom);
            #endif

            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);

            #ifdef CUDA_TARGET
            copyDataToCUDADevice(&atom);
            isReneighboured = 1;
            #endif
        }

        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
        traceAddresses(&param, &atom, &neighbor, n + 1);
        #endif

        if(param.force_field == FF_EAM) {
            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
        } else {
            timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
        }

        finalIntegrate(&param, &atom);

        /* The last step is reported separately after the loop. */
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
            computeThermo(n + 1, &param, &atom);
        }

        int write_pos = !((n + 1) % param.x_out_every);
        int write_vel = !((n + 1) % param.v_out_every);
        if(write_pos || write_vel) {
            if(param.vtk_file != NULL) {
                write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
            }

            if(param.xtc_file != NULL) {
                xtc_write(&atom, n + 1, write_pos, write_vel);
            }
        }
    }

    #ifdef CUDA_TARGET
    copyDataFromCUDADevice(&atom);
    #endif

    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    updateSingleAtoms(&atom);
    computeThermo(-1, &param, &atom);

    if(param.xtc_file != NULL) {
        xtc_end();
    }

    #ifdef CUDA_TARGET
    cudaDeviceFree();
    #endif

    printf(HLINE);
    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
        timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
    printf(HLINE);

    int nthreads = 0;
    int chunkSize = 0;
    omp_sched_t schedKind;
    /* Pre-filled so the report below never prints an uninitialized buffer when
     * the runtime returns a kind that none of the cases matches. */
    char schedType[10] = "unknown";

    #pragma omp parallel
    #pragma omp master
    {
        omp_get_schedule(&schedKind, &chunkSize);

        switch (schedKind)
        {
            case omp_sched_static: strcpy(schedType, "static"); break;
            case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
            case omp_sched_guided: strcpy(schedType, "guided"); break;
            case omp_sched_auto: strcpy(schedType, "auto"); break;
            default: break;
        }

        nthreads = omp_get_max_threads();
    }

    printf("Num threads: %d\n", nthreads);
    printf("Schedule: (%s,%d)\n", schedType, chunkSize);
    printf("Performance: %.2f million atom updates per second\n",
        1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);

    #ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
    #endif

    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

View File

@ -1,939 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <util.h>
/* Fraction of the box length added as safety margin to the binned region in setupNeighbor(). */
#define SMALL 1.0e-6
/* Tolerance used in setupNeighbor() when deciding whether the stencil extent must grow by one bin. */
#define FACTOR 0.999
/* Box lengths; set in initNeighbor() and overwritten in setupNeighbor() when an input file is used. */
static MD_FLOAT xprd, yprd, zprd;
/* Reciprocals of the bin sizes (1 / binsizex, 1 / binsizey). */
static MD_FLOAT bininvx, bininvy;
/* Lowest bin index in x and y, including the margin added in setupNeighbor(). */
static int mbinxlo, mbinylo;
/* Number of bins covering the box itself in x and y. */
static int nbinx, nbiny;
static int mbinx, mbiny; // n bins in x, y
/* Number of atoms currently stored per bin (mbins entries), filled by binAtoms(). */
static int *bincount;
/* Atom indices per bin, atoms_per_bin slots for each bin. */
static int *bins;
/* Number of j-clusters per bin and their indices (clusters_per_bin slots for each bin). */
static int *bin_nclusters;
static int *bin_clusters;
static int mbins; //total number of bins
static int atoms_per_bin; // max atoms per bin
static int clusters_per_bin; // max clusters per bin
/* Neighbor cutoff, copied from param->cutneigh in initNeighbor(). */
static MD_FLOAT cutneigh;
static MD_FLOAT cutneighsq; // neighbor cutoff squared
/* Number of i-clusters the neighbor arrays are currently allocated for (see buildNeighbor()). */
static int nmax;
static int nstencil; // # of bins in stencil
static int* stencil; // stencil list of bin offsets
/* Edge lengths of one bin in x and y. */
static MD_FLOAT binsizex, binsizey;
/* Forward declarations of the internal helpers defined further below. */
static int coord2bin(MD_FLOAT, MD_FLOAT);
static MD_FLOAT bindist(int, int);
/* exported subroutines */
/*
 * Resets the module-level binning state and the fields of `neighbor` to
 * their initial values. No memory is allocated here: every pointer is set
 * to NULL.
 */
void initNeighbor(Neighbor *neighbor, Parameter *param) {
    /* Box dimensions and cutoff taken from the parameters. */
    xprd = param->nx * param->lattice;
    yprd = param->ny * param->lattice;
    zprd = param->nz * param->lattice;
    cutneigh = param->cutneigh;

    /* Initial capacities of the binning structures. */
    nmax = 0;
    atoms_per_bin = 8;
    clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;

    /* Binning buffers start out unallocated. */
    stencil = NULL;
    bins = NULL;
    bincount = NULL;
    bin_clusters = NULL;
    bin_nclusters = NULL;

    /* Neighbor list bookkeeping; the arrays are created in buildNeighbor(). */
    neighbor->half_neigh = param->half_neigh;
    neighbor->maxneighs = 100;
    neighbor->numneigh = NULL;
    neighbor->numneigh_masked = NULL;
    neighbor->neighbors = NULL;
    neighbor->neighbors_imask = NULL;
}
/*
 * Derives the 2D (x/y) bin grid from the box size and the atom density,
 * builds the stencil of bin offsets lying within the neighbor cutoff and
 * (re)allocates the per-bin atom and cluster buffers.
 */
void setupNeighbor(Parameter *param, Atom *atom) {
    /* With an input file the box lengths come from the parameters. */
    if(param->input_file != NULL) {
        xprd = param->xprd;
        yprd = param->yprd;
        zprd = param->zprd;
    }

    // TODO: update lo and hi for standard case and use them here instead
    MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
    MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
    MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;

    /* Target bin edge: a cube that holds one cluster's worth of atoms on average. */
    MD_FLOAT density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
    MD_FLOAT cell_atoms = MAX(CLUSTER_M, CLUSTER_N);
    MD_FLOAT target_x = cbrt(cell_atoms / density);
    MD_FLOAT target_y = cbrt(cell_atoms / density);
    MD_FLOAT edge;
    int mbinxhi, mbinyhi;
    int nextx, nexty;

    cutneighsq = cutneigh * cutneigh;

    /* ---- x direction ---- */
    nbinx = MAX(1, (int)ceil((xhi - xlo) / target_x));
    binsizex = (xhi - xlo) / nbinx;
    bininvx = 1.0 / binsizex;

    edge = xlo - cutneigh - SMALL * xprd;
    mbinxlo = (int)(edge * bininvx);
    if(edge < 0.0) {
        /* The cast truncates towards zero; step one bin further down. */
        mbinxlo -= 1;
    }

    edge = xhi + cutneigh + SMALL * xprd;
    mbinxhi = (int)(edge * bininvx);

    /* One extra bin on each side. */
    mbinxlo -= 1;
    mbinxhi += 1;
    mbinx = mbinxhi - mbinxlo + 1;

    nextx = (int)(cutneigh * bininvx);
    if(nextx * binsizex < FACTOR * cutneigh) {
        nextx++;
    }

    /* ---- y direction ---- */
    nbiny = MAX(1, (int)ceil((yhi - ylo) / target_y));
    binsizey = (yhi - ylo) / nbiny;
    bininvy = 1.0 / binsizey;

    edge = ylo - cutneigh - SMALL * yprd;
    mbinylo = (int)(edge * bininvy);
    if(edge < 0.0) {
        mbinylo -= 1;
    }

    edge = yhi + cutneigh + SMALL * yprd;
    mbinyhi = (int)(edge * bininvy);

    mbinylo -= 1;
    mbinyhi += 1;
    mbiny = mbinyhi - mbinylo + 1;

    nexty = (int)(cutneigh * bininvy);
    if(nexty * binsizey < FACTOR * cutneigh) {
        nexty++;
    }

    /* ---- stencil: offsets of all bins whose closest distance is below the cutoff ---- */
    free(stencil);
    stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
    nstencil = 0;

    for(int dy = -nexty; dy <= nexty; dy++) {
        for(int dx = -nextx; dx <= nextx; dx++) {
            if(bindist(dx, dy) < cutneighsq) {
                stencil[nstencil] = dy * mbinx + dx;
                nstencil++;
            }
        }
    }

    /* ---- per-bin storage (free(NULL) is a no-op, so no guards are needed) ---- */
    free(bincount);
    free(bins);
    free(bin_nclusters);
    free(bin_clusters);

    mbins = mbinx * mbiny;
    bincount = (int*) malloc(mbins * sizeof(int));
    bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
    bin_nclusters = (int*) malloc(mbins * sizeof(int));
    bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));

    /*
    DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
    DEBUG_MESSAGE("binsize = %e, %e\n", binsizex, binsizey);
    DEBUG_MESSAGE("mbin lo, hi = (%d, %d), (%d, %d)\n", mbinxlo, mbinylo, mbinxhi, mbinyhi);
    DEBUG_MESSAGE("mbins = %d (%d x %d)\n", mbins, mbinx, mbiny);
    DEBUG_MESSAGE("nextx = %d, nexty = %d\n", nextx, nexty);
    */
}
/*
 * Squared distance between the bounding box of i-cluster `ci` and the
 * bounding box of j-cluster `cj`. For each axis the gap between the boxes
 * is clamped at zero (overlapping boxes contribute nothing) and the squared
 * gaps are summed.
 */
MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
    MD_FLOAT gap_lo, gap_hi, gap, sep;
    MD_FLOAT dist_sq;

    /* x axis */
    gap_lo = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
    gap_hi = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
    gap = MAX(gap_lo, gap_hi);
    sep = MAX(gap, 0.0);
    dist_sq = sep * sep;

    /* y axis */
    gap_lo = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
    gap_hi = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
    gap = MAX(gap_lo, gap_hi);
    sep = MAX(gap, 0.0);
    dist_sq += sep * sep;

    /* z axis */
    gap_lo = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
    gap_hi = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
    gap = MAX(gap_lo, gap_hi);
    sep = MAX(gap, 0.0);
    dist_sq += sep * sep;

    return dist_sq;
}
/*
 * Tests whether any atom of i-cluster `ci` lies closer than sqrt(rsq) to
 * any atom of j-cluster `cj`.
 *
 * Returns 1 as soon as one such pair is found, 0 when there is none.
 */
int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
    MD_FLOAT *pos_i = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
    MD_FLOAT *pos_j = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
    const int natoms_i = atom->iclusters[ci].natoms;
    const int natoms_j = atom->jclusters[cj].natoms;

    for(int a = 0; a < natoms_i; a++) {
        for(int b = 0; b < natoms_j; b++) {
            MD_FLOAT delx = pos_i[CL_X_OFFSET + a] - pos_j[CL_X_OFFSET + b];
            MD_FLOAT dely = pos_i[CL_Y_OFFSET + a] - pos_j[CL_Y_OFFSET + b];
            MD_FLOAT delz = pos_i[CL_Z_OFFSET + a] - pos_j[CL_Z_OFFSET + b];

            if(delx * delx + dely * dely + delz * delz < rsq) {
                return 1;
            }
        }
    }

    return 0;
}
/*
 * Interaction mask for plain C lists: the diagonal mask when `rdiag` is set
 * and i-cluster and j-cluster have the same index, the full mask otherwise.
 */
static unsigned int get_imask(int rdiag, int ci, int cj) {
    if(rdiag && ci == cj) {
        return NBNXN_INTERACTION_MASK_DIAG;
    }

    return NBNXN_INTERACTION_MASK_ALL;
}
/*
 * Interaction mask for cj-size=2: j-clusters 2*ci and 2*ci+1 receive the
 * first and second diagonal mask respectively when `rdiag` is set; every
 * other pair receives the full mask.
 */
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
    if(rdiag && ci * 2 == cj) {
        return NBNXN_INTERACTION_MASK_DIAG_J2_0;
    }

    if(rdiag && ci * 2 + 1 == cj) {
        return NBNXN_INTERACTION_MASK_DIAG_J2_1;
    }

    return NBNXN_INTERACTION_MASK_ALL;
}
/*
 * Interaction mask for cj-size=4: the diagonal mask when `rdiag` is set and
 * both cluster indices are equal, the full mask otherwise.
 */
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
    if(rdiag && ci == cj) {
        return NBNXN_INTERACTION_MASK_DIAG;
    }

    return NBNXN_INTERACTION_MASK_ALL;
}
/*
 * Interaction mask for cj-size=8: i-clusters 2*cj and 2*cj+1 receive the
 * first and second diagonal mask respectively when `rdiag` is set; every
 * other pair receives the full mask.
 */
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
    if(rdiag && ci == cj * 2) {
        return NBNXN_INTERACTION_MASK_DIAG_J8_0;
    }

    if(rdiag && ci == cj * 2 + 1) {
        return NBNXN_INTERACTION_MASK_DIAG_J8_1;
    }

    return NBNXN_INTERACTION_MASK_ALL;
}
/*
 * Maps the kernel-specific names get_imask_simd_4xn / get_imask_simd_2xnn
 * onto the mask helper matching the configured VECTOR_WIDTH.
 * Only width 8 defines both names: widths 2 and 4 define the 4xn name only,
 * width 16 defines the 2xnn name only. Any other width is rejected at
 * compile time.
 */
#if VECTOR_WIDTH == 2
# define get_imask_simd_4xn get_imask_simd_j2
#elif VECTOR_WIDTH== 4
# define get_imask_simd_4xn get_imask_simd_j4
#elif VECTOR_WIDTH == 8
# define get_imask_simd_4xn get_imask_simd_j8
# define get_imask_simd_2xnn get_imask_simd_j4
#elif VECTOR_WIDTH == 16
# define get_imask_simd_2xnn get_imask_simd_j8
#else
# error "Invalid cluster configuration"
#endif
/*
 * Builds the cluster-pair neighbor list for all local i-clusters.
 *
 * For every i-cluster the j-clusters of all stencil bins are tested, first
 * by bounding-box distance and, when the boxes are not close enough to be
 * accepted outright, by exact atom distances. Entries with a non-trivial
 * interaction mask are kept at the front of the list (their count is stored
 * in numneigh_masked), the remaining entries follow. If any list exceeds
 * neighbor->maxneighs the arrays are enlarged and the whole build is
 * repeated.
 */
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("buildNeighbor start\n");

    /* extend atom arrays if necessary */
    if(atom->Nclusters_local > nmax) {
        nmax = atom->Nclusters_local;
        if(neighbor->numneigh) free(neighbor->numneigh);
        if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
        if(neighbor->neighbors) free(neighbor->neighbors);
        if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
        neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
        neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
    }

    /* Pairs whose bounding boxes are closer than rbb are accepted without the per-atom check. */
    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
    MD_FLOAT bby = 0.5 * (binsizey + binsizey);
    MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
    rbb_sq = rbb_sq * rbb_sq;
    int resize = 1;

    /* loop over each atom, storing neighbors */
    while(resize) {
        int new_maxneighs = neighbor->maxneighs;
        resize = 0;

        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            int ci_cj1 = CJ1_FROM_CI(ci);
            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
            unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
            int n = 0, nmasked = 0;
            int ibin = atom->icluster_bin[ci];
            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
            MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
            MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
            MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
            MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;

            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
                int cj, m = -1;
                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
                const int c = bin_nclusters[jbin];

                if(c > 0) {
                    /* d_bb_sq starts at a defined value: the `continue` below jumps
                     * straight to the loop condition, which would otherwise read it
                     * uninitialized on the first iteration. Leaving the scan early is
                     * harmless because the second loop re-checks every cluster. */
                    MD_FLOAT dl, dh, dm, dm0, d_bb_sq = 0.0;

                    /* Skip leading clusters of the bin that are out of range in z. */
                    do {
                        m++;
                        cj = loc_bin[m];
                        if(neighbor->half_neigh && ci_cj1 > cj) {
                            continue;
                        }
                        jbb_zmin = atom->jclusters[cj].bbminz;
                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                        dl = ibb_zmin - jbb_zmax;
                        dh = jbb_zmin - ibb_zmax;
                        dm = MAX(dl, dh);
                        dm0 = MAX(dm, 0.0);
                        d_bb_sq = dm0 * dm0;
                    } while(m + 1 < c && d_bb_sq > cutneighsq);

                    jbb_xmin = atom->jclusters[cj].bbminx;
                    jbb_xmax = atom->jclusters[cj].bbmaxx;
                    jbb_ymin = atom->jclusters[cj].bbminy;
                    jbb_ymax = atom->jclusters[cj].bbmaxy;

                    while(m < c) {
                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
                            dl = ibb_zmin - jbb_zmax;
                            dh = jbb_zmin - ibb_zmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq = dm0 * dm0;

                            /*if(d_bb_sq > cutneighsq) {
                                break;
                            }*/

                            dl = ibb_ymin - jbb_ymax;
                            dh = jbb_ymin - ibb_ymax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            dl = ibb_xmin - jbb_xmax;
                            dh = jbb_xmin - ibb_xmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            if(d_bb_sq < cutneighsq) {
                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
                                    // We use true (1) for rdiag because we only care if there are masks
                                    // at all, and when this is set to false (0) the self-exclusions are
                                    // not accounted for, which makes the optimized version to not work!
                                    unsigned int imask;
                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
                                    imask = get_imask_simd_2xnn(1, ci, cj);
                                    #else // 4xn
                                    imask = get_imask_simd_4xn(1, ci, cj);
                                    #endif

                                    /* Only store while there is room; n keeps counting so the
                                     * required capacity is known for the resize. */
                                    if(n < neighbor->maxneighs) {
                                        if(imask == NBNXN_INTERACTION_MASK_ALL) {
                                            neighptr[n] = cj;
                                            neighptr_imask[n] = imask;
                                        } else {
                                            /* Masked entries live at the front: move the first
                                             * unmasked entry to the end and take its slot. */
                                            neighptr[n] = neighptr[nmasked];
                                            neighptr_imask[n] = neighptr_imask[nmasked];
                                            neighptr[nmasked] = cj;
                                            neighptr_imask[nmasked] = imask;
                                            nmasked++;
                                        }
                                    }

                                    n++;
                                }
                            }
                        }

                        m++;
                        if(m < c) {
                            cj = loc_bin[m];
                            jbb_xmin = atom->jclusters[cj].bbminx;
                            jbb_xmax = atom->jclusters[cj].bbmaxx;
                            jbb_ymin = atom->jclusters[cj].bbminy;
                            jbb_ymax = atom->jclusters[cj].bbmaxy;
                            jbb_zmin = atom->jclusters[cj].bbminz;
                            jbb_zmax = atom->jclusters[cj].bbmaxz;
                        }
                    }
                }
            }

            // Fill neighbor list with dummy values to fit vector width
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
                    /* Never write past this cluster's row: when the list has already
                     * overflowed, only the count matters for the upcoming resize. */
                    if(n < neighbor->maxneighs) {
                        neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
                        neighptr_imask[n] = 0;
                    }
                    n++;
                }
            }

            neighbor->numneigh[ci] = n;
            neighbor->numneigh_masked[ci] = nmasked;

            if(n >= neighbor->maxneighs) {
                resize = 1;

                if(n >= new_maxneighs) {
                    new_maxneighs = n;
                }
            }
        }

        if(resize) {
            /* Grow with 20% headroom and rebuild all lists from scratch. */
            neighbor->maxneighs = new_maxneighs * 1.2;
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
            free(neighbor->neighbors);
            free(neighbor->neighbors_imask);
            neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
            neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
        }
    }

    /*
    DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
    for(int ci = 0; ci < 6; ci++) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
        int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
        DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
            ci,
            atom->iclusters[ci].bbminx,
            atom->iclusters[ci].bbmaxx,
            atom->iclusters[ci].bbminy,
            atom->iclusters[ci].bbmaxy,
            atom->iclusters[ci].bbminz,
            atom->iclusters[ci].bbmaxz);
        for(int cii = 0; cii < CLUSTER_M; cii++) {
            DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
        }
        DEBUG_MESSAGE("Neighbors:\n");
        for(int k = 0; k < neighbor->numneigh[ci]; k++) {
            int cj = neighptr[k];
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
            DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
                cj,
                atom->jclusters[cj].bbminx,
                atom->jclusters[cj].bbmaxx,
                atom->jclusters[cj].bbminy,
                atom->jclusters[cj].bbmaxy,
                atom->jclusters[cj].bbminz,
                atom->jclusters[cj].bbmaxz);
            for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
                DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
            }
        }
    }
    */

    DEBUG_MESSAGE("buildNeighbor end\n");
}
/*
 * Removes from every i-cluster's neighbor list the j-clusters that no
 * longer have any atom within the neighbor cutoff.
 *
 * Trailing dummy clusters are stripped before pruning and re-appended
 * afterwards so the list length stays a multiple of the vector width.
 * Neighbor indices and interaction masks are moved together, and masked
 * entries are kept at the front of the list (the layout buildNeighbor()
 * produces), so numneigh_masked remains valid after pruning.
 *
 * `param` is currently unused; the cutoff is the module-level cutneighsq.
 */
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("pruneNeighbor start\n");
    //MD_FLOAT cutsq = param->cutforce * param->cutforce;
    MD_FLOAT cutsq = cutneighsq;

    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
        unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[ci];
        int numneighs_masked = neighbor->numneigh_masked[ci];
        int k = 0;

        // Remove dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
            /* The count is checked first so an empty list never reads neighs[-1]. */
            while(numneighs > 0 && neighs[numneighs - 1] == atom->dummy_cj) {
                numneighs--;
            }
        }

        while(k < numneighs) {
            int cj = neighs[k];
            if(atomDistanceInRange(atom, ci, cj, cutsq)) {
                k++;
            } else {
                /* Slot k becomes free; `hole` tracks which slot the last entry must fill. */
                int hole = k;
                numneighs--;

                if(k < numneighs_masked) {
                    /* A masked entry is removed: close the gap with the last masked
                     * entry so the masked block stays contiguous at the front. */
                    numneighs_masked--;
                    neighs[k] = neighs[numneighs_masked];
                    neighs_imask[k] = neighs_imask[numneighs_masked];
                    hole = numneighs_masked;
                }

                /* Fill the remaining hole with the last entry, mask included. */
                neighs[hole] = neighs[numneighs];
                neighs_imask[hole] = neighs_imask[numneighs];
            }
        }

        // Readd dummy clusters if necessary
        if(CLUSTER_N < VECTOR_WIDTH) {
            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
                neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
                neighs_imask[numneighs] = 0;
                numneighs++;
            }
        }

        neighbor->numneigh[ci] = numneighs;
        neighbor->numneigh_masked[ci] = numneighs_masked;
    }

    DEBUG_MESSAGE("pruneNeighbor end\n");
}
/* internal subroutines */
/*
 * Squared closest distance in the x/y plane between a bin and the bin that
 * is `i` bins away in x and `j` bins away in y. Adjacent bins (offset -1,
 * 0 or +1) have distance zero along that axis.
 */
MD_FLOAT bindist(int i, int j) {
    MD_FLOAT delx = 0.0;
    MD_FLOAT dely = 0.0;

    if(i > 0) {
        delx = (i - 1) * binsizex;
    } else if(i < 0) {
        delx = (i + 1) * binsizex;
    }

    if(j > 0) {
        dely = (j - 1) * binsizey;
    } else if(j < 0) {
        dely = (j + 1) * binsizey;
    }

    return (delx * delx + dely * dely);
}
/*
 * Maps an (x, y) position to a flat bin index.
 *
 * Positions at or beyond the upper box edge are measured relative to that
 * edge; negative positions get one extra bin subtracted because the integer
 * cast truncates towards zero. The result is iy * mbinx + ix + 1.
 */
int coord2bin(MD_FLOAT xin, MD_FLOAT yin) {
    const int ix = (xin >= xprd) ? (int)((xin - xprd) * bininvx) + nbinx - mbinxlo
                 : (xin >= 0.0)  ? (int)(xin * bininvx) - mbinxlo
                                 : (int)(xin * bininvx) - mbinxlo - 1;

    const int iy = (yin >= yprd) ? (int)((yin - yprd) * bininvy) + nbiny - mbinylo
                 : (yin >= 0.0)  ? (int)(yin * bininvy) - mbinylo
                                 : (int)(yin * bininvy) - mbinylo - 1;

    return (iy * mbinx + ix + 1);
}
/*
 * Maps an (x, y) position to separate bin coordinates, written to *ix and
 * *iy. It applies the same three-way case split per axis as coord2bin(),
 * but does not combine the two coordinates into a flat index.
 */
void coord2bin2D(MD_FLOAT xin, MD_FLOAT yin, int *ix, int *iy) {
    *ix = (xin >= xprd) ? (int)((xin - xprd) * bininvx) + nbinx - mbinxlo
        : (xin >= 0.0)  ? (int)(xin * bininvx) - mbinxlo
                        : (int)(xin * bininvx) - mbinxlo - 1;

    *iy = (yin >= yprd) ? (int)((yin - yprd) * bininvy) + nbiny - mbinylo
        : (yin >= 0.0)  ? (int)(yin * bininvy) - mbinylo
                        : (int)(yin * bininvy) - mbinylo - 1;
}
/*
 * Sorts all local atoms into the x/y bins. When a bin runs out of slots
 * the per-bin capacity is doubled, the bin array is reallocated and the
 * whole binning pass is repeated.
 */
void binAtoms(Atom *atom) {
    DEBUG_MESSAGE("binAtoms start\n");

    for(;;) {
        int overflow = 0;

        for(int b = 0; b < mbins; b++) {
            bincount[b] = 0;
        }

        for(int i = 0; i < atom->Nlocal; i++) {
            const int ibin = coord2bin(atom_x(i), atom_y(i));
            const int slot = bincount[ibin];

            if(slot >= atoms_per_bin) {
                /* Bin is full: remember it, keep scanning, retry with more room. */
                overflow = 1;
                continue;
            }

            bins[ibin * atoms_per_bin + slot] = i;
            bincount[ibin] = slot + 1;
        }

        if(!overflow) {
            break;
        }

        free(bins);
        atoms_per_bin *= 2;
        bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
    }

    DEBUG_MESSAGE("binAtoms end\n");
}
// TODO: Use pigeonhole sorting
/*
 * Orders the atom indices inside every bin by ascending z coordinate using
 * an in-place selection sort.
 */
void sortAtomsByZCoord(Atom *atom) {
    DEBUG_MESSAGE("sortAtomsByZCoord start\n");

    for(int bin = 0; bin < mbins; bin++) {
        const int count = bincount[bin];
        int *slots = &bins[bin * atoms_per_bin];

        for(int pos = 0; pos < count; pos++) {
            const int current = slots[pos];
            int best_pos = pos;
            int best_atom = current;
            MD_FLOAT best_z = atom_z(current);

            /* Find the atom with the smallest z among the unsorted remainder. */
            for(int scan = pos + 1; scan < count; scan++) {
                const int candidate = slots[scan];
                const MD_FLOAT cand_z = atom_z(candidate);

                if(cand_z < best_z) {
                    best_pos = scan;
                    best_atom = candidate;
                    best_z = cand_z;
                }
            }

            /* Swap it into place (a self-swap when it already is there). */
            slots[pos] = best_atom;
            slots[best_pos] = current;
        }
    }

    DEBUG_MESSAGE("sortAtomsByZCoord end\n");
}
/*
 * Groups the local atoms into i-clusters of CLUSTER_M atoms.
 *
 * Atoms are binned and z-sorted first. Each bin is then cut into
 * consecutive clusters. Positions, velocities and types are copied into the
 * cluster arrays and a bounding box is computed per cluster. Unused slots
 * of a cluster get INFINITY as position. When CLUSTER_N > CLUSTER_M the
 * number of clusters per bin is rounded up to an even value.
 */
void buildClusters(Atom *atom) {
    DEBUG_MESSAGE("buildClusters start\n");
    atom->Nclusters_local = 0;

    /* bin local atoms */
    binAtoms(atom);
    sortAtomsByZCoord(atom);

    for(int bin = 0; bin < mbins; bin++) {
        const int natoms_bin = bincount[bin];
        int next = 0;   /* next atom slot of this bin to consume */
        int nclusters = ((natoms_bin + CLUSTER_M - 1) / CLUSTER_M);

        if(CLUSTER_N > CLUSTER_M && nclusters % 2) {
            nclusters++;
        }

        for(int cl = 0; cl < nclusters; cl++) {
            const int ci = atom->Nclusters_local;

            /* Grow first: the base pointers below are taken from the cluster arrays afterwards. */
            if(ci >= atom->Nclusters_max) {
                growClusters(atom);
            }

            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
            MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(ci)];
            int *ci_type = &atom->cl_type[CI_SCALAR_BASE_INDEX(ci)];
            MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
            MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
            MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

            atom->iclusters[ci].natoms = 0;

            for(int cii = 0; cii < CLUSTER_M; cii++, next++) {
                if(next >= natoms_bin) {
                    /* Padding slot: no atom left in this bin. */
                    ci_x[CL_X_OFFSET + cii] = INFINITY;
                    ci_x[CL_Y_OFFSET + cii] = INFINITY;
                    ci_x[CL_Z_OFFSET + cii] = INFINITY;
                    continue;
                }

                const int i = bins[bin * atoms_per_bin + next];
                const MD_FLOAT xtmp = atom_x(i);
                const MD_FLOAT ytmp = atom_y(i);
                const MD_FLOAT ztmp = atom_z(i);

                ci_x[CL_X_OFFSET + cii] = xtmp;
                ci_x[CL_Y_OFFSET + cii] = ytmp;
                ci_x[CL_Z_OFFSET + cii] = ztmp;
                ci_v[CL_X_OFFSET + cii] = atom->vx[i];
                ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
                ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

                // TODO: To create the bounding boxes faster, we can use SIMD operations
                bbminx = (xtmp < bbminx) ? xtmp : bbminx;
                bbmaxx = (xtmp > bbmaxx) ? xtmp : bbmaxx;
                bbminy = (ytmp < bbminy) ? ytmp : bbminy;
                bbmaxy = (ytmp > bbmaxy) ? ytmp : bbmaxy;
                bbminz = (ztmp < bbminz) ? ztmp : bbminz;
                bbmaxz = (ztmp > bbmaxz) ? ztmp : bbmaxz;

                ci_type[cii] = atom->type[i];
                atom->iclusters[ci].natoms++;
            }

            atom->icluster_bin[ci] = bin;
            atom->iclusters[ci].bbminx = bbminx;
            atom->iclusters[ci].bbmaxx = bbmaxx;
            atom->iclusters[ci].bbminy = bbminy;
            atom->iclusters[ci].bbmaxy = bbmaxy;
            atom->iclusters[ci].bbminz = bbminz;
            atom->iclusters[ci].bbmaxz = bbmaxz;
            atom->Nclusters_local++;
        }
    }

    DEBUG_MESSAGE("buildClusters end\n");
}
/*
 * Derives the j-cluster descriptors (bounding box and atom count) from the
 * local i-clusters.
 *
 *  - CLUSTER_M == CLUSTER_N: every j-cluster is a copy of its i-cluster.
 *  - CLUSTER_M >  CLUSTER_N: every i-cluster is split into two j-clusters.
 *    The first covers atoms [0, min(natoms, CLUSTER_N)), the second covers
 *    atoms [CLUSTER_N, natoms) and is empty when the i-cluster has at most
 *    CLUSTER_N atoms.
 *  - CLUSTER_M <  CLUSTER_N: every even i-cluster and its successor are
 *    merged into one j-cluster.
 */
void defineJClusters(Atom *atom) {
    DEBUG_MESSAGE("defineJClusters start\n");

    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        int cj0 = CJ0_FROM_CI(ci);

        if(CLUSTER_M == CLUSTER_N) {
            atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
            atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
            atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
            atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
            atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
            atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
            atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
        } else if(CLUSTER_M > CLUSTER_N) {
            int cj1 = CJ1_FROM_CI(ci);
            int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
            MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
            /* The first half holds at most CLUSTER_N atoms. Using MIN (not MAX) keeps
             * the loop away from padding slots and from the atoms of the second half. */
            const int natoms_first = MIN(atom->iclusters[ci].natoms, CLUSTER_N);
            MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
            MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
            MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

            for(int cii = 0; cii < natoms_first; cii++) {
                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];

                // TODO: To create the bounding boxes faster, we can use SIMD operations
                if(bbminx > xtmp) { bbminx = xtmp; }
                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                if(bbminy > ytmp) { bbminy = ytmp; }
                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                if(bbminz > ztmp) { bbminz = ztmp; }
                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
            }

            atom->jclusters[cj0].bbminx = bbminx;
            atom->jclusters[cj0].bbmaxx = bbmaxx;
            atom->jclusters[cj0].bbminy = bbminy;
            atom->jclusters[cj0].bbmaxy = bbmaxy;
            atom->jclusters[cj0].bbminz = bbminz;
            atom->jclusters[cj0].bbmaxz = bbmaxz;
            atom->jclusters[cj0].natoms = natoms_first;

            /* Second half: atoms from index CLUSTER_N onwards, if there are any. */
            bbminx = INFINITY, bbmaxx = -INFINITY;
            bbminy = INFINITY, bbmaxy = -INFINITY;
            bbminz = INFINITY, bbmaxz = -INFINITY;

            for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];

                // TODO: To create the bounding boxes faster, we can use SIMD operations
                if(bbminx > xtmp) { bbminx = xtmp; }
                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                if(bbminy > ytmp) { bbminy = ytmp; }
                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                if(bbminz > ztmp) { bbminz = ztmp; }
                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
            }

            atom->jclusters[cj1].bbminx = bbminx;
            atom->jclusters[cj1].bbmaxx = bbmaxx;
            atom->jclusters[cj1].bbminy = bbminy;
            atom->jclusters[cj1].bbmaxy = bbmaxy;
            atom->jclusters[cj1].bbminz = bbminz;
            atom->jclusters[cj1].bbmaxz = bbmaxz;
            /* Clamp at zero from below (MAX, not MIN): the count must be positive
             * whenever the second half really contains atoms. */
            atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);
        } else {
            /* Two consecutive i-clusters form one j-cluster; handled on the even index. */
            if(ci % 2 == 0) {
                const int ci1 = ci + 1;
                atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
                atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
                atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
                atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
                atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
                atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
                atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
            }
        }
    }

    DEBUG_MESSAGE("defineJClusters end\n");
}
/*
 * Distribute the j-clusters (local ones first, then ghost ones) over the bins.
 *
 * The per-bin counters (bin_nclusters), the per-bin cluster lists
 * (bin_clusters), their capacity (clusters_per_bin) and the grid dimensions
 * (mbins, mbinx, mbiny) are file-level variables defined outside this function.
 * Whenever a bin runs out of capacity the whole binning is discarded,
 * clusters_per_bin is doubled, bin_clusters is reallocated and the procedure
 * is repeated from scratch.
 */
void binClusters(Atom *atom) {
DEBUG_MESSAGE("binClusters start\n");
/*
DEBUG_MESSAGE("Nghost = %d\n", atom->Nclusters_ghost);
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + 4; ci++) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
DEBUG_MESSAGE("Cluster %d:\n", ci);
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
atom->icluster_bin[ci],
atom->clusters[ci].natoms,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
atom->clusters[ci].bbminy,
atom->clusters[ci].bbmaxy,
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
}
}
*/
const int nlocal = atom->Nclusters_local;
// Number of i-clusters that make up one j-cluster (at least 1)
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
// Number of local j-clusters; ghost j-clusters are indexed starting at ncj
const int ncj = atom->Nclusters_local / jfac;
int resize = 1;
// Repeat the complete binning until every cluster fits into its bin
while(resize > 0) {
resize = 0;
for(int bin = 0; bin < mbins; bin++) {
bin_nclusters[bin] = 0;
}
// Pass 1: local clusters are appended to the bin recorded in icluster_bin
for(int ci = 0; ci < nlocal && !resize; ci++) {
// Assure we add this j-cluster only once in the bin
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
int bin = atom->icluster_bin[ci];
int c = bin_nclusters[bin];
// Require room for two entries, since a second j-cluster may be added below
if(c + 1 < clusters_per_bin) {
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
bin_nclusters[bin]++;
if(CLUSTER_M > CLUSTER_N) {
// An i-cluster maps to two j-clusters here; the second one is only
// added when it actually holds atoms
int cj1 = CJ1_FROM_CI(ci);
if(atom->jclusters[cj1].natoms > 0) {
bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
bin_nclusters[bin]++;
}
}
} else {
resize = 1;
}
}
}
// Pass 2: ghost clusters are binned from their atom coordinates
for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
const int cj = ncj + cg;
int ix = -1, iy = -1;
MD_FLOAT xtmp, ytmp;
if(atom->jclusters[cj].natoms > 0) {
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;
// Start with the bin of the first atom, clamped to the grid
xtmp = cj_x[CL_X_OFFSET + 0];
ytmp = cj_x[CL_Y_OFFSET + 0];
coord2bin2D(xtmp, ytmp, &ix, &iy);
ix = MAX(MIN(ix, mbinx - 1), 0);
iy = MAX(MIN(iy, mbiny - 1), 0);
for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
int nix, niy;
xtmp = cj_x[CL_X_OFFSET + cjj];
ytmp = cj_x[CL_Y_OFFSET + cjj];
coord2bin2D(xtmp, ytmp, &nix, &niy);
nix = MAX(MIN(nix, mbinx - 1), 0);
niy = MAX(MIN(niy, mbiny - 1), 0);
// Always put the cluster on the bin of its innermost atom so
// the cluster should be closer to local clusters
// (positive PBC shift -> smallest bin index, negative shift -> largest)
if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; }
if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; }
if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; }
if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; }
}
// NOTE(review): the "+ 1" offset presumably matches the bin numbering used
// by the other binning routines of this file -- confirm against coord2bin
int bin = iy * mbinx + ix + 1;
int c = bin_nclusters[bin];
if(c < clusters_per_bin) {
// Insert the current ghost cluster in the bin keeping clusters
// sorted by z coordinate
int inserted = 0;
for(int i = 0; i < c; i++) {
int last_cl = bin_clusters[bin * clusters_per_bin + i];
if(atom->jclusters[last_cl].bbminz > cj_minz) {
bin_clusters[bin * clusters_per_bin + i] = cj;
// Shift the remaining entries one slot towards the end of the bin
for(int j = i + 1; j <= c; j++) {
int tmp = bin_clusters[bin * clusters_per_bin + j];
bin_clusters[bin * clusters_per_bin + j] = last_cl;
last_cl = tmp;
}
inserted = 1;
break;
}
}
// No entry with a larger bbminz was found: append at the end
if(!inserted) {
bin_clusters[bin * clusters_per_bin + c] = cj;
}
bin_nclusters[bin]++;
} else {
resize = 1;
}
}
}
if(resize) {
// Old contents are not needed because the binning restarts from scratch
// NOTE(review): the result of malloc is not checked for NULL
free(bin_clusters);
clusters_per_bin *= 2;
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
}
}
/*
DEBUG_MESSAGE("bin_nclusters\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
DEBUG_MESSAGE("\n");
*/
DEBUG_MESSAGE("binClusters stop\n");
}
/*
 * Copy positions and velocities of all local clusters back into the
 * per-atom arrays, in cluster order. Prints a warning to stderr when the
 * number of copied atoms differs from atom->Nlocal.
 */
void updateSingleAtoms(Atom *atom) {
    DEBUG_MESSAGE("updateSingleAtoms start\n");

    int written = 0;
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
        const int base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *pos = &atom->cl_x[base];
        MD_FLOAT *vel = &atom->cl_v[base];
        const int count = atom->iclusters[ci].natoms;

        for(int cii = 0; cii < count; cii++, written++) {
            /* positions */
            atom_x(written) = pos[CL_X_OFFSET + cii];
            atom_y(written) = pos[CL_Y_OFFSET + cii];
            atom_z(written) = pos[CL_Z_OFFSET + cii];
            /* velocities */
            atom->vx[written] = vel[CL_X_OFFSET + cii];
            atom->vy[written] = vel[CL_Y_OFFSET + cii];
            atom->vz[written] = vel[CL_Z_OFFSET + cii];
        }
    }

    if(written != atom->Nlocal) {
        fprintf(stderr, "updateSingleAtoms(): Number of atoms changed!\n");
    }

    DEBUG_MESSAGE("updateSingleAtoms stop\n");
}

View File

@ -1,49 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_
// Interaction masks taken from GROMACS. Things to remember (maybe these confused just me):
// 1. These are not "exclusion" masks as their name in GROMACS suggests, but rather
//    interaction masks (1 = interaction, 0 = no interaction)
// 2. The bit order is inverted (maybe because that is how they are used in AVX2/AVX512
//    masking), so read them from right to left (least significant to most significant bit)
// The all-interactions mask is the same for all kernels
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
// 4x4 kernel diagonal mask
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
// 4x2 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
// 4x8 kernel diagonal masks
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
// Neighbor list storage.
// NOTE(review): field meanings below are partly inferred from usage -- confirm.
typedef struct {
int every; // presumably the reneighboring interval in timesteps -- confirm
int ncalls; // presumably counts neighbor list builds -- confirm
int maxneighs; // row stride of `neighbors`: entry k of row i is neighbors[i * maxneighs + k]
int* numneigh; // number of valid entries in row i of `neighbors`
int* numneigh_masked; // presumably number of entries that need an interaction mask -- confirm
int half_neigh; // presumably non-zero when half neighbor lists are used -- confirm
int* neighbors; // flattened neighbor lists, maxneighs entries per row
unsigned int* neighbors_imask; // presumably one NBNXN_INTERACTION_MASK_* value per neighbor entry -- confirm
} Neighbor;
// Neighbor list construction
extern void initNeighbor(Neighbor*, Parameter*);
extern void setupNeighbor(Parameter*, Atom*);
extern void binatoms(Atom*);
extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
// Cluster construction and binning
extern void buildClusters(Atom*);
extern void defineJClusters(Atom*);
// Assigns local and ghost j-clusters to bins
extern void binClusters(Atom*);
// Copies cluster positions/velocities back to the per-atom arrays
extern void updateSingleAtoms(Atom*);
#endif

View File

@ -1,231 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <pbc.h>
#include <atom.h>
#include <allocate.h>
#include <neighbor.h>
#include <util.h>
// Number of entries by which growPbc() enlarges the ghost mapping arrays
#define DELTA 20000
// Current capacity (in entries) of border_map and PBCx/PBCy/PBCz
static int NmaxGhost;
static void growPbc(Atom*);
/* exported subroutines */
/* Reset the ghost bookkeeping: no mapping arrays are allocated yet and the
 * recorded capacity is zero. */
void initPbc(Atom* atom) {
    atom->PBCx = NULL;
    atom->PBCy = NULL;
    atom->PBCz = NULL;
    atom->border_map = NULL;
    NmaxGhost = 0;
}
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
/*
 * Recompute the positions of all ghost clusters from the local clusters they
 * map to (atom->border_map), shifted by the periodic image offsets
 * (atom->PBCx/PBCy/PBCz times the box lengths).
 *
 * When firstUpdate is non-zero, the bounding box of every ghost cluster is
 * computed and stored as well, and the unused slots of the cluster are filled
 * with INFINITY.
 */
void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
    DEBUG_MESSAGE("updatePbc start\n");
    const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    const int ncj = atom->Nclusters_local / jfac;
    const MD_FLOAT xprd = param->xprd;
    const MD_FLOAT yprd = param->yprd;
    const MD_FLOAT zprd = param->zprd;

    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
        const int cj = ncj + cg;
        MD_FLOAT *ghost_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
        MD_FLOAT *source_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(atom->border_map[cg])];
        const int natoms = atom->jclusters[cj].natoms;

        /* Bounding box accumulators, only meaningful on the first update */
        MD_FLOAT minx = INFINITY, maxx = -INFINITY;
        MD_FLOAT miny = INFINITY, maxy = -INFINITY;
        MD_FLOAT minz = INFINITY, maxz = -INFINITY;

        for(int cjj = 0; cjj < natoms; cjj++) {
            const MD_FLOAT px = source_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            const MD_FLOAT py = source_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            const MD_FLOAT pz = source_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

            ghost_x[CL_X_OFFSET + cjj] = px;
            ghost_x[CL_Y_OFFSET + cjj] = py;
            ghost_x[CL_Z_OFFSET + cjj] = pz;

            if(!firstUpdate) {
                continue;
            }

            // TODO: To create the bounding boxes faster, we can use SIMD operations
            minx = (minx > px) ? px : minx;
            maxx = (maxx < px) ? px : maxx;
            miny = (miny > py) ? py : miny;
            maxy = (maxy < py) ? py : maxy;
            minz = (minz > pz) ? pz : minz;
            maxz = (maxz < pz) ? pz : maxz;
        }

        if(firstUpdate) {
            /* Pad the unused tail of the cluster */
            for(int cjj = natoms; cjj < CLUSTER_N; cjj++) {
                ghost_x[CL_X_OFFSET + cjj] = INFINITY;
                ghost_x[CL_Y_OFFSET + cjj] = INFINITY;
                ghost_x[CL_Z_OFFSET + cjj] = INFINITY;
            }

            atom->jclusters[cj].bbminx = minx;
            atom->jclusters[cj].bbmaxx = maxx;
            atom->jclusters[cj].bbminy = miny;
            atom->jclusters[cj].bbmaxy = maxy;
            atom->jclusters[cj].bbminz = minz;
            atom->jclusters[cj].bbmaxz = maxz;
        }
    }

    DEBUG_MESSAGE("updatePbc end\n");
}
/* relocate atoms that have left domain according
* to periodic boundary conditions */
/*
 * Wrap every local atom back into the box: a coordinate below zero is shifted
 * up by one box length, a coordinate at or beyond the box length is shifted
 * down by one box length. At most one shift is applied per coordinate.
 */
void updateAtomsPbc(Atom *atom, Parameter *param) {
    const MD_FLOAT xprd = param->xprd;
    const MD_FLOAT yprd = param->yprd;
    const MD_FLOAT zprd = param->zprd;

    for(int i = 0; i < atom->Nlocal; i++) {
        const MD_FLOAT px = atom_x(i);
        if(px < 0.0) {
            atom_x(i) = px + xprd;
        } else if(px >= xprd) {
            atom_x(i) = px - xprd;
        }

        const MD_FLOAT py = atom_y(i);
        if(py < 0.0) {
            atom_y(i) = py + yprd;
        } else if(py >= yprd) {
            atom_y(i) = py - yprd;
        }

        const MD_FLOAT pz = atom_z(i);
        if(pz < 0.0) {
            atom_z(i) = pz + zprd;
        } else if(pz >= zprd) {
            atom_z(i) = pz - zprd;
        }
    }
}
/* setup periodic boundary conditions by
* defining ghost atoms around domain
* only creates mapping and coordinate corrections
* that are then enforced in updatePbc */
/*
 * Register one periodic image of local j-cluster `cj`, shifted by (dx,dy,dz)
 * box lengths, as a new ghost cluster.
 *
 * The macro captures the following names from the expansion site:
 * atom, cj, ncj, Nghost (pre-incremented, so it is the index of the new
 * ghost) and Nghost_atoms. It records the mapping and the image offsets,
 * copies the atom count and the atom types of the source cluster, and
 * accumulates the number of ghost atoms. Positions are not written here.
 *
 * The body is wrapped in do { } while(0) so that the macro expands to exactly
 * one statement and can be used safely with or without enclosing braces.
 */
#define ADDGHOST(dx,dy,dz)                                                          \
    do {                                                                            \
        Nghost++;                                                                   \
        const int cg = ncj + Nghost;                                                \
        const int cj_natoms = atom->jclusters[cj].natoms;                           \
        atom->border_map[Nghost] = cj;                                              \
        atom->PBCx[Nghost] = dx;                                                    \
        atom->PBCy[Nghost] = dy;                                                    \
        atom->PBCz[Nghost] = dz;                                                    \
        atom->jclusters[cg].natoms = cj_natoms;                                     \
        Nghost_atoms += cj_natoms;                                                  \
        int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj);                                 \
        int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg);                                 \
        for(int cjj = 0; cjj < cj_natoms; cjj++) {                                  \
            atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj];    \
        }                                                                           \
    } while(0)
/* internal subroutines */
/* Enlarge the ghost mapping arrays (border_map, PBCx, PBCy, PBCz) by DELTA
 * entries; existing contents are carried over by reallocate(). */
void growPbc(Atom* atom) {
    const size_t old_bytes = NmaxGhost * sizeof(int);
    NmaxGhost += DELTA;
    const size_t new_bytes = NmaxGhost * sizeof(int);

    atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, new_bytes, old_bytes);
    atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, new_bytes, old_bytes);
    atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, new_bytes, old_bytes);
    atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, new_bytes, old_bytes);
}
/*
 * Create the ghost clusters for periodic boundary conditions.
 *
 * For every non-empty local j-cluster whose bounding box lies within Cutneigh
 * of a box face, one ghost cluster is registered (via ADDGHOST) per periodic
 * image direction: faces, corners and edges. Only the mapping, the image
 * offsets, atom counts and atom types are set up here; ghost positions and
 * bounding boxes are filled in by the final cpuUpdatePbc(atom, param, 1) call.
 * A dummy cluster with all coordinates set to INFINITY is placed after the
 * last ghost cluster.
 */
void setupPbc(Atom *atom, Parameter *param) {
DEBUG_MESSAGE("setupPbc start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
// Number of i-clusters per j-cluster (at least 1) and number of local j-clusters
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
// Index of the last ghost created so far; ADDGHOST pre-increments it, so -1 means "none"
int Nghost = -1;
int Nghost_atoms = 0;
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
// Make sure there is room before adding the images of this cluster.
// NOTE(review): the margin of 7 presumably is the expected maximum number of
// images per cluster (3 faces + 3 edges + 1 corner) -- confirm
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
if((Nghost + 7) * jfac >= NmaxGhost) {
growPbc(atom);
}
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
/* 8 corners */
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
/* 12 edges */
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
}
// Room for the dummy cluster that follows the last ghost cluster
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
// Add dummy cluster at the end
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
// Nghost is the index of the last ghost cluster, so the count is Nghost + 1
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
// Update created ghost clusters positions
cpuUpdatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
}

View File

@ -1,20 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __PBC_H_
#define __PBC_H_
/* Resets the ghost bookkeeping of the given atom container.
 * The prototype now carries the parameter list of the definition so that
 * calls are type-checked (an empty list means "unspecified arguments" in C
 * and "no arguments" in C++/CUDA). */
extern void initPbc(Atom*);
/* Updates ghost cluster positions; the int flag requests the first-time setup
 * of bounding boxes and padding. */
extern void cpuUpdatePbc(Atom*, Parameter*, int);
/* Wraps local atoms that left the box back into it. */
extern void updateAtomsPbc(Atom*, Parameter*);
/* Creates the ghost clusters and their mapping to local clusters. */
extern void setupPbc(Atom*, Parameter*);
#ifdef CUDA_TARGET
/* CUDA counterpart of cpuUpdatePbc, only declared for CUDA builds. */
extern void cudaUpdatePbc(Atom*, Parameter*, int);
#endif
#endif

View File

@ -1,58 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <atom.h>
#include <parameter.h>
#include <stats.h>
#include <timers.h>
/* Reset all statistics counters to zero. */
void initStats(Stats *s) {
    /* force / neighbor counters */
    s->calculated_forces = s->num_neighs = s->force_iters = 0;
    /* cutoff hit/miss counters per atom and per cluster */
    s->atoms_within_cutoff = s->atoms_outside_cutoff = 0;
    s->clusters_within_cutoff = s->clusters_outside_cutoff = 0;
}
/*
 * Print the collected run statistics to stdout.
 *
 * Does nothing unless COMPUTE_STATS is defined. The cutoff efficiency lines
 * are only printed when USE_REFERENCE_VERSION is defined as well.
 *
 * atom   - provides Nlocal and Nclusters_local
 * param  - provides ntimes and proc_freq
 * stats  - counters accumulated during the run
 * timer  - timer array; the FORCE entry is used for the cycles estimate
 */
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
#ifdef COMPUTE_STATS
    const int MxN = CLUSTER_M * CLUSTER_N;
    /* Atoms times force evaluations. The product is formed in double precision:
     * multiplying the two integers first can overflow for large runs. */
    const double atom_steps = (double)(atom->Nlocal) * ((double)(param->ntimes) + 1.0);
    double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
    double force_useful_volume = 1e-9 * ( atom_steps * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
                                          (double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
    double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / atom_steps;
    double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
    double avg_simd = stats->force_iters / atom_steps;
#ifdef EXPLICIT_TYPES
    /* One additional type index is read per atom and per neighbor */
    force_useful_volume += 1e-9 * (atom_steps + (double)(stats->num_neighs)) * sizeof(int);
#endif
    printf("Statistics:\n");
    printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
    printf("\tAverage atoms per cluster: %.4f\n", avg_atoms_cluster);
    printf("\tAverage neighbors per atom: %.4f\n", avg_neigh_atom);
    printf("\tAverage neighbors per cluster: %.4f\n", avg_neigh_cluster);
    printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
    printf("\tTotal number of computed pair interactions: %lld\n", stats->num_neighs * MxN);
    printf("\tTotal number of SIMD iterations: %lld\n", stats->force_iters);
    printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
    printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->force_iters);
#ifdef USE_REFERENCE_VERSION
    const double atoms_eff = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
    printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, atoms_eff);
    const double clusters_eff = (double)stats->clusters_within_cutoff / (double)(stats->clusters_within_cutoff + stats->clusters_outside_cutoff) * 100.0;
    printf("\tClusters within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->clusters_within_cutoff, stats->clusters_outside_cutoff, clusters_eff);
#endif
#endif
}

View File

@ -1,35 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __STATS_H_
#define __STATS_H_
// Counters accumulated during a run and reported by displayStatistics()
typedef struct {
long long int calculated_forces; // divisor for the "average neighbors per cluster" figure
long long int num_neighs; // neighbor count; reported times CLUSTER_M * CLUSTER_N as pair interactions
long long int force_iters; // reported as total number of SIMD iterations
long long int atoms_within_cutoff; // reported as atoms within the cutoff radius
long long int atoms_outside_cutoff; // reported as atoms outside the cutoff radius
long long int clusters_within_cutoff; // reported as clusters within the cutoff radius
long long int clusters_outside_cutoff; // reported as clusters outside the cutoff radius
} Stats;
void initStats(Stats *s);
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
// Statistics helpers: they expand to real code only when COMPUTE_STATS is
// defined and to nothing otherwise.
#ifdef COMPUTE_STATS
// Adds `value` to `stat`; the expansion already ends with a semicolon
# define addStat(stat, value) stat += value;
// Declares a local `Si` holding the current time stamp; must precede endStatTimer in the same scope
# define beginStatTimer() double Si = getTimeStamp();
// Adds the time elapsed since beginStatTimer() to `stat`
# define endStatTimer(stat) stat += getTimeStamp() - Si;
#else
# define addStat(stat, value)
# define beginStatTimer()
# define endStatTimer(stat)
#endif
#endif

View File

@ -1,61 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <tracing.h>
/*
 * Emit memory-address and index traces for one timestep through the
 * MEM_TRACE / INDEX_TRACE macro families. For every local atom the position
 * (and type, with EXPLICIT_TYPES) is traced, followed by its neighbor list and
 * the data of every neighbor. Tracing of the force arrays is not performed.
 *
 * `param` and `timestep` are referenced by name inside the tracer macros.
 */
void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
    MEM_TRACER_INIT;
    INDEX_TRACER_INIT;
    const int Nlocal = atom->Nlocal;

    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);

    for(int i = 0; i < Nlocal; i++) {
        int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];

        /* Data of the atom itself */
        MEM_TRACE(atom_x(i), 'R');
        MEM_TRACE(atom_y(i), 'R');
        MEM_TRACE(atom_z(i), 'R');
        INDEX_TRACE_ATOM(i);
        #ifdef EXPLICIT_TYPES
        MEM_TRACE(atom->type[i], 'R');
        #endif

        /* Neighbor indices: sorted per vector chunk, then dumped with their distances */
        DIST_TRACE_SORT(neighs, numneighs);
        INDEX_TRACE(neighs, numneighs);
        DIST_TRACE(neighs, numneighs);

        /* Data of every neighbor */
        int k = 0;
        while(k < numneighs) {
            int j = neighs[k];
            MEM_TRACE(j, 'R');
            MEM_TRACE(atom_x(j), 'R');
            MEM_TRACE(atom_y(j), 'R');
            MEM_TRACE(atom_z(j), 'R');
            #ifdef EXPLICIT_TYPES
            MEM_TRACE(atom->type[j], 'R');
            #endif
            k++;
        }
    }

    INDEX_TRACER_END;
    MEM_TRACER_END;
}

View File

@ -1,102 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
#include <stdio.h>
#include <stdlib.h>
#endif
// Chunk size used by the index/distance tracers when none is configured
#ifndef VECTOR_WIDTH
# define VECTOR_WIDTH 8
#endif
// Default trace condition: trace only timesteps that are a multiple of param->every.
// The expansion site must have `timestep` and `param` in scope.
#ifndef TRACER_CONDITION
# define TRACER_CONDITION (!(timestep % param->every))
#endif
// Memory address tracer. With MEM_TRACER defined, one file
// "mem_tracer_<timestep>.out" is written per traced timestep; otherwise all
// macros expand to nothing. The expansion site must have `timestep` in scope
// (and whatever TRACER_CONDITION refers to).
#ifdef MEM_TRACER
// Declares the trace file handle and opens the file when tracing is active.
// Every line but the last needs a trailing backslash so that the closing
// brace of the if-block stays part of the macro.
#   define MEM_TRACER_INIT  FILE *mem_tracer_fp; \
                            if(TRACER_CONDITION) { \
                                char mem_tracer_fn[128]; \
                                snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
                                mem_tracer_fp = fopen(mem_tracer_fn, "w"); \
                            }
// Closes the trace file
#   define MEM_TRACER_END   if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
// Writes "<op>: <address of addr>"; `addr` must be an lvalue
#   define MEM_TRACE(addr, op)  if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
#else
#   define MEM_TRACER_INIT
#   define MEM_TRACER_END
#   define MEM_TRACE(addr, op)
#endif
// Index tracer. With INDEX_TRACER defined, one file
// "index_tracer_<timestep>.out" is written per traced timestep; otherwise all
// macros expand to nothing. The expansion site must have `timestep` in scope
// (and whatever TRACER_CONDITION refers to).
#ifdef INDEX_TRACER
// Declares the trace file handle and opens the file when tracing is active
# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
if(TRACER_CONDITION) { \
char index_tracer_fn[128]; \
snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
index_tracer_fp = fopen(index_tracer_fn, "w"); \
}
// Closes the trace file
# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
// Writes the header line "N: <nl> <ng> <mn>"
# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
// Writes "A: <a>"
# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
// Writes the first `e` entries of list `l` as "I: ..." lines,
// VECTOR_WIDTH entries per line
# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
fprintf(index_tracer_fp, "I: "); \
for(int __j = 0; __j < __e; ++__j) { \
fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
} \
fprintf(index_tracer_fp, "\n"); \
} \
}
// Sorts list `l` in place in ascending order, independently within each chunk
// of VECTOR_WIDTH entries (bubble sort). Note that this modifies the caller's list.
# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
if(__e > 1) { \
for(int __j = __i; __j < __i + __e - 1; ++__j) { \
for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
if(l[__k] > l[__k + 1]) { \
int __t = l[__k]; \
l[__k] = l[__k + 1]; \
l[__k + 1] = __t; \
} \
} \
} \
} \
} \
}
// Writes, per chunk of VECTOR_WIDTH entries, the absolute differences between
// consecutive entries of `l` as "D: ..." lines; chunks with one entry are skipped
# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
if(__e > 1) { \
fprintf(index_tracer_fp, "D: "); \
for(int __j = 0; __j < __e - 1; ++__j) { \
int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
fprintf(index_tracer_fp, "%d ", __dist); \
} \
fprintf(index_tracer_fp, "\n"); \
} \
} \
}
#else
# define INDEX_TRACER_INIT
# define INDEX_TRACER_END
# define INDEX_TRACE_NATOMS(nl, ng, mn)
# define INDEX_TRACE_ATOM(a)
# define INDEX_TRACE(l, e)
# define DIST_TRACE_SORT(l, e)
# define DIST_TRACE(l, e)
#endif
// Emits the memory/index traces of one timestep using the macros above
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);

View File

@ -1,190 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <atom.h>
#include <vtk.h>
/*
 * Write all four VTK outputs of one timestep: local atoms, ghost atoms,
 * local cluster edges and ghost cluster edges, in this order. The return
 * codes of the individual writers are discarded.
 */
void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
    int (*const writers[])(const char*, Atom*, int) = {
        write_local_atoms_to_vtk_file,
        write_ghost_atoms_to_vtk_file,
        write_local_cluster_edges_to_vtk_file,
        write_ghost_cluster_edges_to_vtk_file
    };
    const int nwriters = (int)(sizeof writers / sizeof writers[0]);

    for(int w = 0; w < nwriters; ++w) {
        writers[w](filename, atom, timestep);
    }
}
/*
 * Write the positions of all local atoms to "<filename>_local_<timestep>.vtk"
 * as an ASCII unstructured grid with one vertex cell per atom and a constant
 * mass of 1.0.
 *
 * Returns 0 on success and -1 when the file cannot be opened.
 */
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char path[128];
    snprintf(path, sizeof path, "%s_local_%d.vtk", filename, timestep);

    FILE* out = fopen(path, "wb");
    if(out == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    /* Header */
    fprintf(out, "# vtk DataFile Version 2.0\n"
                 "Particle data\n"
                 "ASCII\n"
                 "DATASET UNSTRUCTURED_GRID\n");

    /* Point coordinates, cluster by cluster */
    fprintf(out, "POINTS %d double\n", atom->Nlocal);
    for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + cii], pos[CL_Y_OFFSET + cii], pos[CL_Z_OFFSET + cii]);
        }
    }
    fprintf(out, "\n\n");

    /* One single-point cell per atom */
    fprintf(out, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
    for(int a = 0; a < atom->Nlocal; ++a) {
        fprintf(out, "1 %d\n", a);
    }
    fprintf(out, "\n\n");

    fprintf(out, "CELL_TYPES %d\n", atom->Nlocal);
    for(int a = 0; a < atom->Nlocal; ++a) {
        fprintf(out, "1\n");
    }
    fprintf(out, "\n\n");

    /* Constant per-point mass */
    fprintf(out, "POINT_DATA %d\n", atom->Nlocal);
    fprintf(out, "SCALARS mass double\n"
                 "LOOKUP_TABLE default\n");
    for(int a = 0; a < atom->Nlocal; ++a) {
        fprintf(out, "1.0\n");
    }
    fprintf(out, "\n\n");

    fclose(out);
    return 0;
}
/*
 * Write the positions of all ghost atoms to "<filename>_ghost_<timestep>.vtk"
 * as an ASCII unstructured grid with one vertex cell per atom and a constant
 * mass of 1.0.
 *
 * Ghost clusters are j-clusters: setupPbc()/cpuUpdatePbc() store them at
 * j-cluster index ncj + cg (ncj = number of local j-clusters), so they are
 * read from atom->jclusters and addressed with CJ_VECTOR_BASE_INDEX here.
 *
 * Returns 0 on success and -1 when the file cannot be opened.
 */
int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_%d.vtk", filename, timestep);
    FILE* fp = fopen(timestep_filename, "wb");

    if(fp == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    /* Same value as MAX(1, CLUSTER_N / CLUSTER_M) used by the PBC code */
    const int jfac = (CLUSTER_N > CLUSTER_M) ? (CLUSTER_N / CLUSTER_M) : 1;
    const int ncj = atom->Nclusters_local / jfac;

    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
    fprintf(fp, "POINTS %d double\n", atom->Nghost);
    for(int cg = 0; cg < atom->Nclusters_ghost; ++cg) {
        const int cj = ncj + cg;
        MD_FLOAT *cj_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; ++cjj) {
            fprintf(fp, "%.4f %.4f %.4f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
        }
    }
    fprintf(fp, "\n\n");

    /* One single-point cell per ghost atom */
    fprintf(fp, "CELLS %d %d\n", atom->Nghost, atom->Nghost * 2);
    for(int i = 0; i < atom->Nghost; ++i) {
        fprintf(fp, "1 %d\n", i);
    }
    fprintf(fp, "\n\n");

    fprintf(fp, "CELL_TYPES %d\n", atom->Nghost);
    for(int i = 0; i < atom->Nghost; ++i) {
        fprintf(fp, "1\n");
    }
    fprintf(fp, "\n\n");

    /* Constant per-point mass */
    fprintf(fp, "POINT_DATA %d\n", atom->Nghost);
    fprintf(fp, "SCALARS mass double\n");
    fprintf(fp, "LOOKUP_TABLE default\n");
    for(int i = 0; i < atom->Nghost; i++) {
        fprintf(fp, "1.0\n");
    }
    fprintf(fp, "\n\n");

    fclose(fp);
    return 0;
}
/*
 * Write the local clusters to "<filename>_local_edges_<timestep>.vtk" as
 * ASCII polydata: all atom positions as points and one polyline per cluster
 * connecting the atoms of that cluster.
 *
 * Returns 0 on success and -1 when the file cannot be opened.
 */
int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char path[128];
    snprintf(path, sizeof path, "%s_local_edges_%d.vtk", filename, timestep);

    FILE* out = fopen(path, "wb");
    if(out == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    const int nclusters = atom->Nclusters_local;
    int npoints = 0;

    fprintf(out, "# vtk DataFile Version 2.0\n"
                 "Particle data\n"
                 "ASCII\n"
                 "DATASET POLYDATA\n");

    /* Points, cluster by cluster; count them for the LINES header */
    fprintf(out, "POINTS %d double\n", atom->Nlocal);
    for(int ci = 0; ci < nclusters; ++ci) {
        MD_FLOAT *pos = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
            fprintf(out, "%.4f %.4f %.4f\n", pos[CL_X_OFFSET + cii], pos[CL_Y_OFFSET + cii], pos[CL_Z_OFFSET + cii]);
        }
        npoints += atom->iclusters[ci].natoms;
    }
    fprintf(out, "\n\n");

    /* One polyline per cluster; point ids are consecutive in output order */
    fprintf(out, "LINES %d %d\n", nclusters, nclusters + npoints);
    int point_id = 0;
    for(int ci = 0; ci < nclusters; ++ci) {
        fprintf(out, "%d ", atom->iclusters[ci].natoms);
        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
            fprintf(out, "%d ", point_id);
            point_id++;
        }
        fprintf(out, "\n");
    }
    fprintf(out, "\n\n");

    fclose(out);
    return 0;
}
/*
 * Write the ghost clusters to "<filename>_ghost_edges_<timestep>.vtk" as
 * ASCII polydata: all ghost atom positions as points and one polyline per
 * ghost cluster connecting the atoms of that cluster.
 *
 * Ghost clusters are j-clusters: setupPbc()/cpuUpdatePbc() store them at
 * j-cluster index ncj + cg (ncj = number of local j-clusters), so they are
 * read from atom->jclusters and addressed with CJ_VECTOR_BASE_INDEX here.
 *
 * Returns 0 on success and -1 when the file cannot be opened.
 */
int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_edges_%d.vtk", filename, timestep);
    FILE* fp = fopen(timestep_filename, "wb");
    int tot_lines = 0;
    int i = 0;

    if(fp == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    /* Same value as MAX(1, CLUSTER_N / CLUSTER_M) used by the PBC code */
    const int jfac = (CLUSTER_N > CLUSTER_M) ? (CLUSTER_N / CLUSTER_M) : 1;
    const int ncj = atom->Nclusters_local / jfac;

    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET POLYDATA\n");
    fprintf(fp, "POINTS %d double\n", atom->Nghost);
    for(int cg = 0; cg < atom->Nclusters_ghost; ++cg) {
        const int cj = ncj + cg;
        MD_FLOAT *cj_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; ++cjj) {
            fprintf(fp, "%.4f %.4f %.4f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
        }
        tot_lines += atom->jclusters[cj].natoms;
    }
    fprintf(fp, "\n\n");

    /* One polyline per ghost cluster; point ids are consecutive in output order */
    fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
    for(int cg = 0; cg < atom->Nclusters_ghost; ++cg) {
        const int cj = ncj + cg;
        fprintf(fp, "%d ", atom->jclusters[cj].natoms);
        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; ++cjj) {
            fprintf(fp, "%d ", i++);
        }
        fprintf(fp, "\n");
    }
    fprintf(fp, "\n\n");

    fclose(fp);
    return 0;
}

View File

@ -1,16 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#ifndef __VTK_H_
#define __VTK_H_
/* Calls the four writers below for one timestep; their return codes are discarded. */
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
/* Each writer creates one file named after `filename` and `timestep` and
 * returns 0 on success or -1 when the file cannot be opened. */
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
#endif

View File

@ -1,56 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
//---
#include <atom.h>
#include <allocate.h>
#include <xtc.h>
#ifdef XTC_OUTPUT
#include <gromacs/fileio/xtcio.h>
/* Handle of the XTC output file, opened by xtc_init() and closed by xtc_end() */
static struct t_fileio *xtc_file = NULL;
/* Staging buffer for atom positions, allocated by xtc_init() and filled by xtc_write() */
static rvec *x_buf = NULL;
/* Box vectors passed to write_xtc(); set to the unit vectors by xtc_init() */
static rvec basis[3];
/*
 * Open the XTC output file, set the box vectors to the unit vectors, allocate
 * the position staging buffer (atom->Nlocal + 1 entries) and write a first
 * frame for the given timestep.
 */
void xtc_init(const char *filename, Atom *atom, int timestep) {
    const int axes[3] = { XX, YY, ZZ };

    /* basis[r] is the unit vector along axes[r] */
    for(int r = 0; r < 3; r++) {
        for(int c = 0; c < 3; c++) {
            basis[r][axes[c]] = (r == c) ? 1.0 : 0.0;
        }
    }

    xtc_file = open_xtc(filename, "w");
    x_buf = (rvec *) allocate(ALIGNMENT, sizeof(rvec) * (atom->Nlocal + 1));
    xtc_write(atom, timestep, 1, 1);
}
/*
 * Append one frame with the positions of all local atoms to the XTC file.
 *
 * Positions are gathered cluster by cluster into x_buf and written with a
 * time of 0.0 and a precision argument of 1000. The per-cluster atom count is
 * taken from atom->iclusters, the member used for local clusters throughout
 * the rest of the code.
 *
 * write_pos and write_vel are currently not evaluated.
 */
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
    int i = 0;
    for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
        for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
            x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
            x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
            x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
            i++;
        }
    }
    write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000);
}
void xtc_end() {
free(x_buf);
close_xtc(xtc_file);
}
#endif

View File

@ -1,21 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#ifndef __XTC_H_
#define __XTC_H_
/* XTC trajectory output. Without XTC_OUTPUT the three entry points become
 * empty macros, so call sites need no conditional compilation. */
#ifdef XTC_OUTPUT
/* Arguments: output file name, atom container, timestep of the first frame */
void xtc_init(const char *, Atom*, int);
/* Arguments: atom container, timestep, write-positions flag, write-velocities flag */
void xtc_write(Atom*, int, int, int);
void xtc_end();
#else
#define xtc_init(a,b,c)
#define xtc_write(a,b,c,d)
#define xtc_end()
#endif
#endif

View File

@ -1,44 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <util.h>
/*
 * Allocate `bytesize` bytes aligned to `alignment` bytes with posix_memalign.
 *
 * The process is terminated with an error message when the allocation fails,
 * so the returned pointer is never NULL. Release it with free().
 */
void *allocate(int alignment, size_t bytesize) {
    /* Initialized so that the NULL check below is well defined even when
     * posix_memalign fails without touching the pointer. */
    void *ptr = NULL;
    const int errorCode = posix_memalign(&ptr, alignment, bytesize);

    if(errorCode == EINVAL) {
        fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
        exit(EXIT_FAILURE);
    }

    if(errorCode == ENOMEM) {
        fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
        exit(EXIT_FAILURE);
    }

    /* Any other failure code, or a NULL result */
    if(errorCode != 0 || ptr == NULL) {
        fprintf(stderr, "Error: posix_memalign failed!\n");
        exit(EXIT_FAILURE);
    }

    return ptr;
}
/*
 * Allocates a new aligned buffer of `new_bytesize` bytes, copies the contents
 * of `ptr` into it and frees `ptr`.
 *
 * At most min(old_bytesize, new_bytesize) bytes are copied, so shrinking a
 * buffer cannot write past the end of the new allocation.
 * If `ptr` is NULL this behaves like allocate(). Returns the new buffer.
 */
void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
    void *newarray = allocate(alignment, new_bytesize);

    if(ptr != NULL) {
        size_t copy_bytes = (old_bytesize < new_bytesize) ? old_bytesize : new_bytesize;
        memcpy(newarray, ptr, copy_bytes);
        free(ptr);
    }

    return newarray;
}

View File

@ -1,13 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_
extern void* allocate (int alignment, size_t bytesize);
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
#endif

View File

@ -1,68 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
//---
#include <device.h>
#ifdef CUDA_TARGET
#include <cuda_runtime.h>
/*
 * Aborts the program when a CUDA runtime call failed.
 * `label` identifies the call site in the printed message; `err` is the
 * status returned by the CUDA call. Returns normally only on cudaSuccess.
 */
void cuda_assert(const char *label, cudaError_t err) {
    if (err == cudaSuccess) {
        return;
    }

    printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
    exit(-1);
}
/*
 * Allocates `bytesize` bytes for GPU use and returns the pointer.
 * With CUDA_HOST_MEMORY defined the buffer comes from cudaMallocHost,
 * otherwise from cudaMalloc. Allocation failures abort via cuda_assert.
 */
void *allocateGPU(size_t bytesize) {
    void *buffer;

#ifndef CUDA_HOST_MEMORY
    cuda_assert("allocateGPU", cudaMalloc((void **) &buffer, bytesize));
#else
    cuda_assert("allocateGPU", cudaMallocHost((void **) &buffer, bytesize));
#endif

    return buffer;
}
/*
 * Replaces the allocation `ptr` with a fresh one of `new_bytesize` bytes.
 * Data is not preserved: the old buffer (if any) is released first and the
 * new buffer is returned uninitialized.
 * Errors reported while releasing the old buffer abort via cuda_assert
 * instead of being silently dropped.
 */
void *reallocateGPU(void *ptr, size_t new_bytesize) {
    if(ptr != NULL) {
#ifdef CUDA_HOST_MEMORY
        cuda_assert("reallocateGPU", cudaFreeHost(ptr));
#else
        cuda_assert("reallocateGPU", cudaFree(ptr));
#endif
    }

    return allocateGPU(new_bytesize);
}
// Copies bytesize bytes from host buffer h_ptr to device buffer d_ptr using a
// blocking cudaMemcpy; a failed copy aborts through cuda_assert.
// When CUDA_HOST_MEMORY is defined the body is compiled out and the call does nothing.
// NOTE(review): presumably because buffers then come from cudaMallocHost (see
// allocateGPU) and no explicit transfer is needed -- confirm with the callers.
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
#ifndef CUDA_HOST_MEMORY
cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
#endif
}
// Copies bytesize bytes from device buffer d_ptr back to host buffer h_ptr using
// a blocking cudaMemcpy; a failed copy aborts through cuda_assert.
// When CUDA_HOST_MEMORY is defined the body is compiled out and the call does nothing.
// NOTE(review): presumably because buffers then come from cudaMallocHost (see
// allocateGPU) and no explicit transfer is needed -- confirm with the callers.
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
#ifndef CUDA_HOST_MEMORY
cuda_assert("memcpyFromGPU", cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost));
#endif
}
// Fills bytesize bytes at d_ptr with `value` via cudaMemset; aborts through
// cuda_assert on failure. cudaMemset sets bytes, so `value` is a byte pattern.
void memsetGPU(void *d_ptr, int value, size_t bytesize) {
cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
}
#else
// Build without CUDA_TARGET: empty stand-ins so callers compile and link
// unchanged. Nothing is allocated or copied; the allocators return NULL.
void initDevice(Atom *atom, Neighbor *neighbor) {}
void *allocateGPU(size_t bytesize) { return NULL; }
void *reallocateGPU(void *ptr, size_t new_bytesize) { return NULL; }
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {}
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {}
void memsetGPU(void *d_ptr, int value, size_t bytesize) {}
#endif

View File

@ -1,26 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stddef.h>
//---
#include <atom.h>
#include <neighbor.h>
#ifndef __DEVICE_H_
#define __DEVICE_H_
#ifdef CUDA_TARGET
#include <cuda_runtime.h>
extern void cuda_assert(const char *msg, cudaError_t err);
#endif
extern void initDevice(Atom*, Neighbor*);
extern void *allocateGPU(size_t bytesize);
extern void *reallocateGPU(void *ptr, size_t new_bytesize);
extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
#endif

View File

@ -1,39 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <atom.h>
#include <parameter.h>
#ifndef __EAM_H_
#define __EAM_H_
// Raw contents of one EAM potential file (filled by read_eam_file).
// NOTE(review): field meanings below are inferred from names -- confirm against the reader.
typedef struct {
// presumably the number of tabulated points in density (rho) and distance (r)
int nrho, nr;
// presumably the grid spacings, the potential cutoff and the atom mass
MD_FLOAT drho, dr, cut, mass;
// tabulated arrays read from the file
MD_FLOAT *frho, *rhor, *zr;
} Funcfl;
// State of the EAM force field: tables, their spline forms and the raw file data.
// NOTE(review): field meanings below are inferred from names -- confirm against eam.c.
typedef struct {
// per-atom scratch array; nmax is presumably its allocated length
MD_FLOAT* fp;
int nmax;
// table sizes (per table and, presumably, summed over all tables)
int nrho, nr;
int nrho_tot, nr_tot;
// grid spacings; rdr / rdrho are presumably the reciprocals of dr / drho
MD_FLOAT dr, rdr, drho, rdrho;
// tabulated functions (see file2array)
MD_FLOAT *frho, *rhor, *z2r;
// spline coefficients derived from the tables (see array2spline)
MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
// data as read from the potential file
Funcfl file;
} Eam;
void initEam(Eam* eam, Parameter* param);
void coeff(Eam* eam, Parameter* param);
void init_style(Eam* eam, Parameter *param);
void read_eam_file(Funcfl* file, const char* filename);
void file2array(Eam* eam);
void array2spline(Eam* eam, Parameter* param);
void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
void grab(FILE* fptr, int n, MD_FLOAT* list);
#endif

View File

@ -1,186 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//---
#include <atom.h>
#include <parameter.h>
#include <util.h>
/*
 * Fills `param` with the built-in defaults.
 *
 * All file-name pointers (including param_file) start as NULL so callers can
 * test them before use. Derived values (cutneigh, dtforce) are computed from
 * the defaults set here.
 */
void initParameter(Parameter *param) {
    // File names: nothing selected by default
    param->param_file = NULL;
    param->input_file = NULL;
    param->vtk_file = NULL;
    param->xtc_file = NULL;
    param->eam_file = NULL;
    param->write_atom_file = NULL;

    // Force field and particle properties
    param->force_field = FF_LJ;
    param->epsilon = 1.0;
    param->sigma = 1.0;
    param->sigma6 = 1.0;
    param->rho = 0.8442;
    param->ntypes = 4;
    param->ntimes = 200;
    param->dt = 0.005;

    // Domain: unit cells and periodicity per dimension
    param->nx = 32;
    param->ny = 32;
    param->nz = 32;
    param->pbc_x = 1;
    param->pbc_y = 1;
    param->pbc_z = 1;

    // Cutoffs; the neighbor cutoff is the force cutoff plus the skin
    param->cutforce = 2.5;
    param->skin = 0.3;
    param->cutneigh = param->cutforce + param->skin;

    param->temp = 1.44;
    param->nstat = 100;
    param->mass = 1.0;
    param->dtforce = 0.5 * param->dt;

    // Intervals in timesteps
    param->reneigh_every = 20;
    param->prune_every = 1000;
    param->x_out_every = 20;
    param->v_out_every = 5;

    param->half_neigh = 0;
    param->proc_freq = 2.4;

    // DEM
    param->k_s = 1.0;
    param->k_dn = 1.0;
    param->gx = 0.0;
    param->gy = 0.0;
    param->gz = 0.0;
    param->reflect_x = 0.0;
    param->reflect_y = 0.0;
    param->reflect_z = 0.0;
}
/*
 * Reads "key value" pairs from the parameter file `filename` into `param`.
 *
 * Everything after a '#' on a line is ignored. Key and value may be separated
 * by spaces or tabs; a line terminator left in the buffer by readline() is
 * treated as a separator too, so string values (file names) do not end in
 * "\n". Lines without both a key and a value are skipped.
 *
 * After parsing, the derived values dtforce, cutneigh and sigma6 are
 * recomputed from the (possibly updated) dt, cutforce, skin and sigma.
 * Exits with status -1 if the file cannot be opened.
 *
 * NOTE(review): keys are matched by prefix (strncmp over the parameter name
 * length), so e.g. "sigma6 2.0" is accepted as "sigma" -- kept as is.
 */
void readParameter(Parameter *param, const char *filename) {
    static const char *delims = " \t\r\n";
    FILE *fp = fopen(filename, "r");
    char line[MAXLINE];
    int i;

    if(!fp) {
        fprintf(stderr, "Could not open parameter file: %s\n", filename);
        exit(-1);
    }

    #define PARSE_PARAM(p,f)    if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
    #define PARSE_STRING(p)     PARSE_PARAM(p, strdup)
    #define PARSE_INT(p)        PARSE_PARAM(p, atoi)
    #define PARSE_REAL(p)       PARSE_PARAM(p, atof)

    while(!feof(fp)) {
        // Start from an empty line so a failed read at EOF parses as "nothing"
        line[0] = '\0';
        readline(line, fp);

        // Cut the line at the first comment character
        for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
        line[i] = '\0';

        char *tok = strtok(line, delims);
        char *val = strtok(NULL, delims);

        if(tok != NULL && val != NULL) {
            PARSE_PARAM(force_field, str2ff);
            PARSE_STRING(input_file);
            PARSE_STRING(eam_file);
            PARSE_STRING(vtk_file);
            PARSE_STRING(xtc_file);
            PARSE_REAL(epsilon);
            PARSE_REAL(sigma);
            PARSE_REAL(k_s);
            PARSE_REAL(k_dn);
            PARSE_REAL(reflect_x);
            PARSE_REAL(reflect_y);
            PARSE_REAL(reflect_z);
            PARSE_REAL(gx);
            PARSE_REAL(gy);
            PARSE_REAL(gz);
            PARSE_REAL(rho);
            PARSE_REAL(dt);
            PARSE_REAL(cutforce);
            PARSE_REAL(skin);
            PARSE_REAL(temp);
            PARSE_REAL(mass);
            PARSE_REAL(proc_freq);
            PARSE_INT(ntypes);
            PARSE_INT(ntimes);
            PARSE_INT(nx);
            PARSE_INT(ny);
            PARSE_INT(nz);
            PARSE_INT(pbc_x);
            PARSE_INT(pbc_y);
            PARSE_INT(pbc_z);
            PARSE_INT(nstat);
            PARSE_INT(reneigh_every);
            PARSE_INT(prune_every);
            PARSE_INT(x_out_every);
            PARSE_INT(v_out_every);
            PARSE_INT(half_neigh);
        }
    }

    // The helper macros are only meaningful inside this function
    #undef PARSE_REAL
    #undef PARSE_INT
    #undef PARSE_STRING
    #undef PARSE_PARAM

    // Update dtforce
    param->dtforce = 0.5 * param->dt;

    // Update neighbor cutoff, which depends on cutforce and skin
    param->cutneigh = param->cutforce + param->skin;

    // Update sigma6 parameter
    MD_FLOAT s2 = param->sigma * param->sigma;
    param->sigma6 = s2 * s2 * s2;
    fclose(fp);
}
// Prints a human-readable summary of `param` and of the compile-time
// configuration (kernel name, data layout, precision, atom sorting) to stdout.
void printParameter(Parameter *param) {
printf("Parameters:\n");
// Optional file names are only listed when they have been set
if(param->input_file != NULL) {
printf("\tInput file: %s\n", param->input_file);
}
if(param->vtk_file != NULL) {
printf("\tVTK file: %s\n", param->vtk_file);
}
if(param->xtc_file != NULL) {
printf("\tXTC file: %s\n", param->xtc_file);
}
if(param->eam_file != NULL) {
printf("\tEAM file: %s\n", param->eam_file);
}
printf("\tForce field: %s\n", ff2str(param->force_field));
// Cluster dimensions and vector width are only reported when CLUSTER_M is defined
#ifdef CLUSTER_M
printf("\tKernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
#else
printf("\tKernel: %s\n", KERNEL_NAME);
#endif
printf("\tData layout: %s\n", POS_DATA_LAYOUT);
printf("\tFloating-point precision: %s\n", PRECISION_STRING);
printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
// xprd/yprd/zprd and lattice are not set by initParameter or readParameter in this file
printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
printf("\tLattice size: %e\n", param->lattice);
printf("\tEpsilon: %e\n", param->epsilon);
printf("\tSigma: %e\n", param->sigma);
printf("\tSpring constant: %e\n", param->k_s);
printf("\tDamping constant: %e\n", param->k_dn);
printf("\tTemperature: %e\n", param->temp);
printf("\tRHO: %e\n", param->rho);
printf("\tMass: %e\n", param->mass);
printf("\tNumber of types: %d\n", param->ntypes);
printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
#ifdef SORT_ATOMS
printf("\tSort atoms when reneighboring: yes\n");
#else
printf("\tSort atoms when reneighboring: no\n");
#endif
printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
printf("\tDelta time (dt): %e\n", param->dt);
printf("\tCutoff radius: %e\n", param->cutforce);
printf("\tSkin: %e\n", param->skin);
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
}

View File

@ -1,62 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __PARAMETER_H_
#define __PARAMETER_H_
#if PRECISION == 1
# define MD_FLOAT float
# define MD_UINT unsigned int
#else
# define MD_FLOAT double
# define MD_UINT unsigned long long int
#endif
// Simulation configuration: defaults come from initParameter, overrides from
// readParameter, and printParameter reports the values.
typedef struct {
// selected force field (FF_LJ by default)
int force_field;
// file names; NULL when not set
char* param_file;
char* input_file;
char* vtk_file;
char* xtc_file;
char* write_atom_file;
// Lennard-Jones parameters; sigma6 is sigma^6 (recomputed by readParameter)
MD_FLOAT epsilon;
MD_FLOAT sigma;
MD_FLOAT sigma6;
// temperature, density (RHO) and mass
MD_FLOAT temp;
MD_FLOAT rho;
MD_FLOAT mass;
// number of types and number of timesteps
int ntypes;
int ntimes;
// intervals in timesteps: stats report, reneighboring, pruning, position/velocity output
int nstat;
int reneigh_every;
int prune_every;
int x_out_every;
int v_out_every;
// non-zero to use half neighbor lists
int half_neigh;
// time step; dtforce is 0.5 * dt
MD_FLOAT dt;
MD_FLOAT dtforce;
// cutoff radius and skin; cutneigh is cutforce + skin
MD_FLOAT skin;
MD_FLOAT cutforce;
MD_FLOAT cutneigh;
// unit cells and periodicity flags per dimension
int nx, ny, nz;
int pbc_x, pbc_y, pbc_z;
// lattice size, domain bounds and domain box sizes
MD_FLOAT lattice;
MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
MD_FLOAT xprd, yprd, zprd;
// processor frequency in GHz
double proc_freq;
char* eam_file;
// DEM
// spring and damping constants
MD_FLOAT k_s;
MD_FLOAT k_dn;
// NOTE(review): presumably gravity components and wall reflection settings -- confirm
MD_FLOAT gx, gy, gz;
MD_FLOAT reflect_x, reflect_y, reflect_z;
} Parameter;
void initParameter(Parameter*);
void readParameter(Parameter*, const char*);
void printParameter(Parameter*);
#endif

View File

@ -1,68 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __SIMD_H__
#define __SIMD_H__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef NO_ZMM_INTRIN
# include <zmmintrin.h>
#endif
#ifndef CLUSTER_M
# define CLUSTER_M 1
#endif
#ifndef CLUSTER_N
# define CLUSTER_N 1
#endif
#if defined(__ISA_AVX512__)
# if PRECISION == 2
# include "simd/avx512_double.h"
# else
# include "simd/avx512_float.h"
# endif
#endif
#if defined(__ISA_AVX2__)
# if PRECISION == 2
# include "simd/avx2_double.h"
# else
# include "simd/avx2_float.h"
# endif
#endif
#if defined(__ISA_AVX__)
# if PRECISION == 2
# include "simd/avx_double.h"
# else
# include "simd/avx_float.h"
# endif
#endif
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
/*
 * Debug helper: prints the label `ref` followed by every lane of `a`.
 *
 * The lanes are copied into an MD_FLOAT array so they are interpreted with
 * the element type the vector actually holds. A fixed `double` buffer would
 * read past the end of the register and misinterpret the lanes in
 * single-precision builds.
 * NOTE(review): assumes VECTOR_WIDTH is the number of MD_FLOAT lanes in
 * MD_SIMD_FLOAT -- confirm for every supported configuration.
 */
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
    MD_FLOAT x[VECTOR_WIDTH];
    memcpy(x, &a, sizeof(x));

    fprintf(stdout, "%s: ", ref);
    for(int i = 0; i < VECTOR_WIDTH; i++) {
        // Explicit promotion: "%f" expects a double
        fprintf(stdout, "%f ", (double) x[i]);
    }

    fprintf(stdout, "\n");
}
// Debug helper: prints the label `ref` and the mask `a` as a hexadecimal bit pattern.
static inline void simd_print_mask(const char *ref, MD_SIMD_MASK a) {
    fprintf(stdout, "%s: %x\n", ref, simd_mask_to_u32(a));
}
#endif // __SIMD_H__

View File

@ -1,103 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#define MD_SIMD_FLOAT __m256d
#define MD_SIMD_INT __m128i
#define MD_SIMD_MASK __m256d
// Basic arithmetic and memory wrappers for 4 x double in a 256-bit register.
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
// _mm256_load_pd / _mm256_store_pd are the aligned variants: p must be 32-byte aligned
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
// Placeholder: the half-width duplicate load does not exist for this
// configuration. Reports the problem on stderr and terminates the program.
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
    MD_SIMD_FLOAT never_used;

    fputs("simd_load_h_duplicate(): Not implemented for AVX2 with double precision!", stderr);
    exit(-1);

    // Unreachable; only present to satisfy the return type
    return never_used;
}
// Placeholder: the half-width dual load does not exist for this
// configuration. Reports the problem on stderr and terminates the program.
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
    MD_SIMD_FLOAT never_used;

    fputs("simd_load_h_dual(): Not implemented for AVX2 with double precision!", stderr);
    exit(-1);

    // Unreachable; only present to satisfy the return type
    return never_used;
}
// Placeholder: the half-width dual reduction does not exist for this
// configuration. Reports the problem on stderr and terminates the program.
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
    fputs("simd_h_dual_incr_reduced_sum(): Not implemented for AVX2 with double precision!", stderr);
    exit(-1);

    // Unreachable; only present to satisfy the return type
    return 0.0;
}
// Reduces each of v0..v3 to its horizontal sum, adds the four sums to m[0..3]
// (aligned load/store, so m must be 32-byte aligned) and returns the total of
// the four sums.
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m256d t0, t1, t2;
__m128d a0, a1;
// Pairwise sums within each 128-bit half: t0 mixes v0/v1, t1 mixes v2/v3
t0 = _mm256_hadd_pd(v0, v1);
t1 = _mm256_hadd_pd(v2, v3);
// t2 = [upper half of t0, lower half of t1]
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
// Lower half of t0 now holds sum(v0), sum(v1); upper half of t1 holds sum(v2), sum(v3)
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
// Keep elements 0,1 from t0 and take elements 2,3 from t1
t0 = _mm256_blend_pd(t0, t1, 0xC);
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
// Accumulate the four sums into m
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
// Fold the four sums into a single scalar
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
// Read the low element (MD_FLOAT is double in this header)
return *((MD_FLOAT *) &a0);
}
// Masks are full-width vectors here and are applied with a bitwise AND, so a
// selected lane must have all 64 bits set and a cleared lane must be zero.
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
// Reciprocal through a round trip to single precision (_mm_rcp_ps), i.e. an approximation
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
//static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
// a * b + c
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
// a + b in lanes where m is set, a elsewhere
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
// Lane-wise a < b (ordered, non-signaling); true lanes are all ones
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
// Lane-wise a < b on four 32-bit integers, returned as a 4 x 64-bit mask.
// The 32-bit compare result (0 or -1) is sign-extended to 64 bits so that a
// true lane is all ones. This is the representation simd_mask_and,
// simd_masked_add and select_by_mask rely on (they apply masks with a bitwise
// AND). Converting -1 to the double value -1.0 would not give that bit pattern.
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) {
    return _mm256_castsi256_pd(_mm256_cvtepi32_epi64(_mm_cmplt_epi32(a, b)));
}

// Lane-wise logical AND of two masks
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }

// Expands the low four bits of `a` into a vector mask (bit i selects lane i)
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
    const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
    const unsigned long long int none = 0x0;
    return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
}

// Packs the mask into the low four bits of an integer (bit i = sign bit of
// lane i); inverse of simd_mask_from_u32. Used for debugging output.
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return _mm256_movemask_pd(a); }

// Horizontal sum of the four lanes of `a`
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
    __m128d a0, a1;
    // test with shuffle & add as an alternative to hadd later
    a = _mm256_hadd_pd(a, a);
    a0 = _mm256_castpd256_pd128(a);
    a1 = _mm256_extractf128_pd(a, 0x1);
    a0 = _mm_add_sd(a0, a1);
    // Extract the low element without pointer type punning
    return _mm_cvtsd_f64(a0);
}

// Placeholder: reports the missing implementation on stderr and terminates
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
    fprintf(stderr, "simd_h_decr3(): Not implemented for AVX2 with double precision!");
    exit(-1);
}

// Functions used in LAMMPS kernel
// Gathers four doubles from m at the 32-bit indices in vidx, scaled by s.
// Defined without a trailing semicolon so it can be used inside expressions.
#define simd_gather(vidx, m, s) (_mm256_i32gather_pd(m, vidx, s))
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
// Lane indices 0, 1, 2, 3 (lane 0 holds 0)
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
// Aligned load: m must be 16-byte aligned
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
// Lane-wise 32-bit product (low 32 bits of each result). _mm_mul_epi32 would
// instead multiply only lanes 0 and 2 into two 64-bit results.
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mullo_epi32(a, b); }

// Loads four ints from m, zeroing the lanes whose mask entry is not set.
// Only the sign bit of each 64-bit mask lane is consulted, so any mask whose
// true lanes have the sign bit set and whose false lanes are zero works.
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) {
    // Move the upper 32 bits of every 64-bit mask lane into the low 128 bits
    const __m256i upper_words = _mm256_setr_epi32(1, 3, 5, 7, 1, 3, 5, 7);
    const __m256i packed = _mm256_permutevar8x32_epi32(_mm256_castpd_si256(k), upper_words);
    return _mm_maskload_epi32(m, _mm256_castsi256_si128(packed));
}

View File

@ -1,84 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <immintrin.h>
#include <zmmintrin.h>
#define MD_SIMD_FLOAT __m256
#define MD_SIMD_MASK __mmask8
// Basic wrappers for 8 x float in a 256-bit register with an 8-bit mask register type.
// NOTE(review): the masked forms (_mm256_mask_*, _mm256_cmp_ps_mask, _mm256_rcp14_ps)
// are, as far as I can tell, AVX-512VL intrinsics rather than AVX2 -- confirm the build flags.
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
// Aligned load/store: p must be 32-byte aligned
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
// Lanes of a where m is set, zero elsewhere
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
// Approximate reciprocal
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
// a * b + c
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
// a + b in lanes where m is set, a elsewhere
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
// Lane-wise a < b (ordered, non-signaling)
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
// Conversions between the mask type and a plain bit pattern (bit i = lane i)
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
// Horizontal sum of all eight lanes of a.
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
// Fold the upper 128-bit half onto the lower one: four partial sums
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
// Add elements 2,3 onto elements 0,1
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
// Add element 1 onto element 0
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
// Read element 0 (requires MD_FLOAT to be float in this header)
return *((MD_FLOAT *) &t0);
}
// Reduces each of v0..v3 to its horizontal sum, adds the four sums to m[0..3]
// (aligned load/store, so m must be 16-byte aligned) and returns their total.
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m128 t0, t2;
// Two rounds of pairwise adds leave, per 128-bit half, the half-sums of v0, v1, v2, v3
v0 = _mm256_hadd_ps(v0, v1);
v2 = _mm256_hadd_ps(v2, v3);
v0 = _mm256_hadd_ps(v0, v2);
// Lower half + upper half = [sum(v0), sum(v1), sum(v2), sum(v3)]
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
// Accumulate the four sums into m
t2 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t2);
// Fold the four sums into element 0
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
// Read element 0 (requires MD_FLOAT to be float in this header)
return *((MD_FLOAT *) &t0);
}
// Loads four floats from m and places a copy in both 128-bit halves of the result.
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
return _mm256_broadcast_ps((const __m128 *)(m));
}
// Fills the lower 128-bit half with m[0] and the upper half with m[1].
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
__m128 t0, t1;
t0 = _mm_broadcast_ss(m);
t1 = _mm_broadcast_ss(m + 1);
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
}
// Treats v0 and v1 as two 4-float halves each: adds [sum(v0 low), sum(v0 high),
// sum(v1 low), sum(v1 high)] to m[0..3] (aligned load/store, so m must be
// 16-byte aligned) and returns the total of those four sums.
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m128 t0, t1;
// Pairwise sums within each 128-bit half
v0 = _mm256_hadd_ps(v0, v1);
t0 = _mm256_extractf128_ps(v0, 0x1);
// t0 = [v0 low, v1 low, v0 high, v1 high] half-sums
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
// Swap the two middle elements: [v0 low, v0 high, v1 low, v1 high]
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
// Accumulate into m
t1 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t1);
// Fold the four sums into element 0
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
// Read element 0 (requires MD_FLOAT to be float in this header)
return *((MD_FLOAT *) &t0);
}
// Adds the upper 128-bit half of `a` onto its lower half and subtracts the
// resulting four floats from m[0..3] (aligned load/store, so m must be
// 16-byte aligned).
// Declared `static inline` like every other helper in this header: a plain
// `inline` definition in a header provides no external definition in C99 and
// can fail to link when the call is not inlined.
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
    __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
    _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
}
// Applies simd_h_decr to three consecutive blocks of m that are CLUSTER_N
// elements apart, using a0, a1 and a2 respectively.
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}

View File

@ -1,114 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <immintrin.h>
#ifndef NO_ZMM_INTRIN
# include <zmmintrin.h>
#endif
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
#define MD_SIMD_INT __m256i
#define MD_SIMD_BITMASK MD_SIMD_INT
#define MD_SIMD_IBOOL __mmask16
// Basic wrappers for 8 x double in a 512-bit register with an 8-bit mask register type.
// Narrows a 16-bit integer-compare mask to the 8-bit mask type
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_pd(a, b); }
// a * b + c
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); }
// Approximate reciprocal
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); }
// a + b in lanes where m is set, a elsewhere
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
// Lane-wise a < b (ordered, non-signaling)
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
// Conversions between the mask type and a plain bit pattern (bit i = lane i)
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
// Aligned load/store: p must be 64-byte aligned
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_pd(p, a); }
// Lanes of a where m is set, zero elsewhere
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_pd(_mm512_setzero_pd(), m, a); }
// Horizontal sum of all eight lanes of a.
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
// Add the upper two 128-bit lanes onto the lower two
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
// Add 128-bit lane 1 onto lane 0
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
// Add element 1 onto element 0
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
// Read element 0 (MD_FLOAT is double in this header)
return *((MD_FLOAT *) &x);
}
// Reduces each of v0..v3 to its horizontal sum, adds the four sums to m[0..3]
// (aligned load/store, so m must be 32-byte aligned) and returns their total.
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m512d t0, t2;
__m256d t3, t4;
// Pairwise sums within each 128-bit lane (v0 into t0, v2 into t2) ...
t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
// ... with the odd elements replaced by the pairwise sums of v1 / v3
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
// Combine 128-bit lanes two apart; the upper four elements take the v2/v3 data
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
// Combine neighbouring 128-bit lanes: full sums of v0,v1 (lower half) and v2,v3 (upper half)
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
// Move the v2/v3 sums into elements 2,3 so the low 256 bits hold all four sums
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
// Accumulate the four sums into m
t3 = _mm512_castpd512_pd256(t0);
t4 = _mm256_load_pd(m);
t4 = _mm256_add_pd(t4, t3);
_mm256_store_pd(m, t4);
// Fold the four sums into element 0
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
// Loads four doubles from m (aligned load: m must be 32-byte aligned) and
// places a copy in both 256-bit halves.
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
}
// Fills the lower 256-bit half with m[0] and the upper half with m[1].
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
}
// Treats v0 and v1 as two 4-double halves each: adds [sum(v0 low), sum(v0 high),
// sum(v1 low), sum(v1 high)] to m[0..3] (aligned load/store, so m must be
// 32-byte aligned) and returns the total of those four sums.
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512d t0;
__m256d t2, t3;
// Within each 256-bit half add elements two apart; elements 2,3,6,7 take the v1 data
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
// Add neighbouring elements: every 128-bit lane now holds one half-sum, duplicated
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
// Pull the upper-half sums into the odd elements of the low 256 bits
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
// Accumulate the four sums into m
t2 = _mm512_castpd512_pd256(t0);
t3 = _mm256_load_pd(m);
t3 = _mm256_add_pd(t3, t2);
_mm256_store_pd(m, t3);
// Fold the four sums into element 0
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
// Adds the upper 256-bit half of a onto its lower half and subtracts the
// resulting four doubles from m[0..3] (aligned load/store, so m must be
// 32-byte aligned).
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m256d t;
a = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
t = _mm256_load_pd(m);
t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a));
_mm256_store_pd(m, t);
}
// Applies simd_h_decr to three consecutive blocks of m that are CLUSTER_N
// elements apart, using a0, a1 and a2 respectively.
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}
// Functions used in LAMMPS kernel
// Gather and 8 x 32-bit integer helpers.
//static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
// Kept as a macro; NOTE(review): presumably because the scale argument must be
// a compile-time constant -- confirm.
#define simd_gather(vidx,m,s) (_mm512_i32gather_pd(vidx, m, s))
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
// Lane indices 0..7 (lane 0 holds 0)
static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
// Aligned load: m must be 32-byte aligned
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_si256((const MD_SIMD_INT *) m); }
//static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
// Lane-wise product of eight 32-bit integers (low 32 bits of each result).
// _mm256_mullo_epi32 is the element-wise form; _mm256_mul_epi32 would multiply
// only the even lanes and return four 64-bit products, which does not match
// the 32-bit lane layout used by the other simd_int_* helpers.
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mullo_epi32(a, b); }
// Loads the ints of m whose mask bit in k is set; the other lanes are zero
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
// Lane-wise a < b on 32-bit integers, returned as a bit mask
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }

View File

@ -1,103 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef NO_ZMM_INTRIN
# include <zmmintrin.h>
#endif
#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16
#define MD_SIMD_INT __m256i
#define MD_SIMD_IBOOL __mmask16
#define MD_SIMD_INT32 __m512i
#define MD_SIMD_BITMASK MD_SIMD_INT32
// Loads sixteen 32-bit integers from m (aligned load: m must be 64-byte aligned).
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
return _mm512_load_si512(m);
}
// Replicates a into all sixteen 32-bit lanes.
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
return _mm512_set1_epi32(a);
}
// Returns a mask with bit i set when lane i of a has any bit set
// (the vector is tested against itself as raw 32-bit integers).
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
}
// Basic wrappers for 16 x float in a 512-bit register with a 16-bit mask register type.
// Integer-compare mask and float mask have the same width here, so this is the identity
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
// a * b + c
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
// Approximate reciprocal
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
// a + b in lanes where m is set, a elsewhere
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
// Lane-wise a < b (ordered, non-signaling)
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
// Conversions between the mask type and a plain bit pattern (bit i = lane i)
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
// Aligned load/store: p must be 64-byte aligned
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_ps(p, a); }
// Lanes of a where m is set, zero elsewhere
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); }
// Guard only: a full-width horizontal sum would imply an Mx16 cluster layout,
// which is not a valid GROMACS configuration. Reports the misuse on stderr
// and terminates the program.
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
    fputs("simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n", stderr);
    exit(-1);

    // Unreachable; only present to satisfy the return type
    return 0.0;
}
// Guard only: this reduction would only be called in a Mx16 configuration,
// which is not valid in GROMACS. Reports the misuse on stderr and terminates.
// The message names this function (it previously reported simd_h_reduce_sum,
// which pointed at the wrong call site).
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
    fprintf(stderr, "simd_incr_reduced_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
    exit(-1);
    return 0.0;
}
// Loads eight floats from m (as 256 bits through an aligned load, so m must be
// 32-byte aligned) and places a copy in both 256-bit halves.
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
}
// Fills the lower 256-bit half with m[0] and the upper half with m[1].
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
}
// Treats v0 and v1 as two 8-float halves each: adds [sum(v0 low), sum(v0 high),
// sum(v1 low), sum(v1 high)] to m[0..3] (aligned load/store, so m must be
// 16-byte aligned) and returns the total of those four sums.
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512 t0, t1;
__m128 t2, t3;
// Even 128-bit lanes of v0,v1 in t0, odd lanes in t1; their sum has one 128-bit lane per half
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
t0 = _mm512_add_ps(t0, t1);
// Reduce within every 128-bit lane; each element then holds its lane's sum
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
// Pack one element per 128-bit lane into the lowest four elements
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
// Accumulate the four sums into m
t3 = _mm512_castps512_ps128(t0);
t2 = _mm_load_ps(m);
t2 = _mm_add_ps(t2, t3);
_mm_store_ps(m, t2);
// Fold the four sums into element 0
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
return _mm_cvtss_f32(t3);
}
// Adds the upper 256-bit half of a onto its lower half and subtracts the
// resulting eight floats from m[0..7] (aligned load/store, so m must be
// 32-byte aligned).
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m256 t;
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
t = _mm256_load_ps(m);
t = _mm256_sub_ps(t, _mm512_castps512_ps256(a));
_mm256_store_ps(m, t);
}
// Applies simd_h_decr to three consecutive blocks of m that are CLUSTER_N
// elements apart, using a0, a1 and a2 respectively.
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}

View File

@ -1,103 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#define MD_SIMD_FLOAT __m256d
#define MD_SIMD_INT __m128i
#define MD_SIMD_MASK __m256d
// Thin wrappers mapping the generic SIMD interface onto 256-bit double-precision AVX intrinsics.
// simd_broadcast: all four lanes set to scalar.
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
// simd_zero: all four lanes set to 0.0.
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
// Lane-wise a + b, a - b and a * b.
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
// Aligned load/store of four doubles; the aligned intrinsics require p to be 32-byte aligned.
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
MD_SIMD_FLOAT ret;
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX with double precision!");
exit(-1);
return ret;
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
MD_SIMD_FLOAT ret;
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX with double precision!");
exit(-1);
return ret;
}
/*
 * Half-width dual reduce-and-accumulate; not available for AVX with double
 * precision. Does not return to the caller: prints a diagnostic to stderr and
 * terminates the process via exit(-1). All arguments are ignored.
 */
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
    (void) m;
    // Message ends with a newline, like the other "not valid" stubs.
    fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX with double precision!\n");
    exit(-1);
    return 0.0; // not reached
}
// Reduces each of v0..v3 to its horizontal sum, adds the four sums to m[0..3]
// (aligned 256-bit load/store) and returns the sum of all 16 input elements.
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m256d t0, t1, t2;
__m128d a0, a1;
// pairwise sums: t0 = {v0[0]+v0[1], v1[0]+v1[1], v0[2]+v0[3], v1[2]+v1[3]}, t1 likewise for v2/v3
t0 = _mm256_hadd_pd(v0, v1);
t1 = _mm256_hadd_pd(v2, v3);
// t2 = {upper half of t0, lower half of t1}
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
// lower half of t0 becomes {sum(v0), sum(v1)}, upper half of t1 becomes {sum(v2), sum(v3)}
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
// combine the two valid halves: t0 = {sum(v0), sum(v1), sum(v2), sum(v3)}
t0 = _mm256_blend_pd(t0, t1, 0b1100);
// accumulate the four sums into m[0..3]
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
// fold the four sums into one scalar for the return value
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
// reads the lowest element of a0 through an MD_FLOAT pointer
return *((MD_FLOAT *) &a0);
}
// select_by_mask: bitwise AND of a with the mask m (lanes whose mask bits are all zero become 0.0).
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
// simd_reciprocal: converts to single precision, applies the approximate _mm_rcp_ps and converts back,
// so the result carries only the accuracy of the single-precision estimate.
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
// simd_fma: a * b + c; uses the fused instruction when __ISA_AVX_FMA__ is defined,
// otherwise a separate multiply followed by an add.
#ifdef __ISA_AVX_FMA__
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
#else
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
#endif
// simd_masked_add: a + (b AND m), i.e. b only contributes in lanes selected by m.
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
// simd_mask_cond_lt: lane-wise ordered, non-signaling a < b comparison producing a mask.
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
/*
 * Lane-wise signed 32-bit comparison a < b, returned as a 4 x 64-bit mask with
 * all bits set in lanes where the comparison holds and all bits clear elsewhere.
 *
 * The 32-bit compare result (0 or -1) is widened by duplicating each element
 * into both halves of its 64-bit lane. A numeric int -> double conversion
 * would turn -1 into the value -1.0, whose bit pattern is not an all-ones
 * mask and therefore breaks the AND-based mask operations in this file.
 */
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) {
    const __m128i lt = _mm_cmplt_epi32(a, b);
    const __m128i lo = _mm_unpacklo_epi32(lt, lt); // {lt0, lt0, lt1, lt1}
    const __m128i hi = _mm_unpackhi_epi32(lt, lt); // {lt2, lt2, lt3, lt3}
    return _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 0x1));
}
// simd_mask_and: bitwise AND of two masks.
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
/*
 * Expands the four low bits of a into a 4 x 64-bit mask: lane i has all bits
 * set when bit i of a is set and all bits clear otherwise.
 */
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
    // Negating a 0/1 value yields 0 or -1, i.e. no bits or all 64 bits set.
    const long long lane0 = -(long long) (a & 0x1u);
    const long long lane1 = -(long long) ((a >> 1) & 0x1u);
    const long long lane2 = -(long long) ((a >> 2) & 0x1u);
    const long long lane3 = -(long long) ((a >> 3) & 0x1u);
    // _mm256_set_epi64x takes its arguments from the highest lane to the lowest
    return _mm256_castsi256_pd(_mm256_set_epi64x(lane3, lane2, lane1, lane0));
}
/*
 * Packs a 4-lane mask into the low four bits of an int (bit i = sign bit of
 * lane i). This is the inverse of simd_mask_from_u32 for all-ones/all-zeros
 * lanes. Only required for debugging.
 */
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return _mm256_movemask_pd(a); }
/*
 * Horizontal sum: returns the sum of the four double-precision lanes of a.
 */
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
    // swap the two elements inside each 128-bit half and add: every lane holds its half's sum
    const __m256d pairs = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
    const __m128d lower = _mm256_castpd256_pd128(pairs);
    const __m128d upper = _mm256_extractf128_pd(pairs, 0x1);
    __m128d total = _mm_add_sd(lower, upper);
    // reads the lowest element of total through an MD_FLOAT pointer
    return *((MD_FLOAT *) &total);
}
/*
 * Half-width three-component decrement; not available for AVX with double
 * precision. Does not return to the caller: prints a diagnostic to stderr and
 * terminates the process via exit(-1). All arguments are ignored.
 */
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
    (void) m;
    // Message ends with a newline, like the other "not valid" stubs.
    fprintf(stderr, "simd_h_decr3(): Not implemented for AVX with double precision!\n");
    exit(-1);
}
// Functions used in LAMMPS kernel
// simd_gather: loads four doubles from m at the 32-bit indices in vidx, each index scaled by s bytes.
// NOTE(review): the gather intrinsic expects the scale to be a compile-time constant - confirm that
// callers always pass a literal so this wrapper inlines cleanly.
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
// simd_int_broadcast: all four 32-bit lanes set to scalar.
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
// simd_int_zero: all lanes zero.
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
// simd_int_seq: lanes {0, 1, 2, 3} from lowest to highest.
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
// simd_int_load: aligned 128-bit load of four ints; m must be 16-byte aligned.
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
// simd_int_add: lane-wise 32-bit addition.
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
/*
 * Lane-wise 32-bit multiplication of the four ints in a and b, keeping the low
 * 32 bits of each product.
 *
 * _mm_mullo_epi32 is used because _mm_mul_epi32 only multiplies elements 0 and
 * 2 and writes two 64-bit products, which is not an element-wise multiply.
 */
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mullo_epi32(a, b); }
/*
 * Loads four ints from m (aligned 128-bit load) and zeroes the lanes that are
 * not selected by the 4 x 64-bit mask k.
 *
 * The mask is narrowed by taking the low 32 bits of every 64-bit lane. A
 * numeric double -> int conversion must not be used here: an all-ones lane is
 * a NaN and converts to 0x80000000 rather than to an all-ones integer.
 * Note that all 16 bytes at m are read regardless of the mask.
 */
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) {
    const __m256 bits = _mm256_castpd_ps(k);
    const __m128 lo = _mm256_castps256_ps128(bits);     // mask lanes 0 and 1
    const __m128 hi = _mm256_extractf128_ps(bits, 0x1); // mask lanes 2 and 3
    // pick 32-bit elements {lo[0], lo[2], hi[0], hi[2]} = low word of each 64-bit mask lane
    const __m128i lane_mask = _mm_castps_si128(_mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0)));
    return _mm_and_si128(_mm_load_si128((__m128i const *) m), lane_mask);
}

View File

@ -1,84 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <immintrin.h>
#include <zmmintrin.h>
#define MD_SIMD_FLOAT __m256
#define MD_SIMD_MASK __mmask8
// Thin wrappers mapping the generic SIMD interface onto 256-bit single-precision intrinsics
// with 8-bit mask registers (__mmask8).
// simd_broadcast / simd_zero: all eight lanes set to scalar / 0.0.
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
// Lane-wise a + b, a - b and a * b.
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
// Aligned load/store of eight floats; the aligned intrinsics require p to be 32-byte aligned.
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
// select_by_mask: lanes of a whose mask bit is set, 0.0 in all other lanes.
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
// simd_reciprocal: approximate reciprocal (rcp14 estimate), not a full-precision division.
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
// simd_fma: fused a * b + c.
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
// simd_masked_add: a + b in lanes whose mask bit is set, a unchanged elsewhere.
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
// simd_mask_cond_lt: lane-wise ordered, non-signaling a < b comparison producing a bit mask.
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
// simd_mask_and: bitwise AND of two masks.
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
// Conversions between an unsigned integer bit pattern and a mask register.
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
/*
 * Horizontal sum: returns the sum of the eight single-precision lanes of a.
 */
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
    // fold the upper 128 bits onto the lower 128 bits: four partial sums
    const __m128 halves = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
    // swap the 64-bit halves and add: two distinct partial sums remain
    const __m128 pairs = _mm_add_ps(halves, _mm_permute_ps(halves, _MM_SHUFFLE(1, 0, 3, 2)));
    // add element 1 onto element 0
    __m128 total = _mm_add_ss(pairs, _mm_permute_ps(pairs, _MM_SHUFFLE(0, 3, 2, 1)));
    // reads the lowest element of total through an MD_FLOAT pointer
    return *((MD_FLOAT *) &total);
}
// Reduces each of v0..v3 to its horizontal sum, adds the four sums to m[0..3]
// (aligned 128-bit load/store) and returns the sum of all 32 input elements.
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m128 t0, t2;
// two rounds of pairwise adds: afterwards each 128-bit half of v0 holds the
// per-half sums of the original {v0, v1, v2, v3}
v0 = _mm256_hadd_ps(v0, v1);
v2 = _mm256_hadd_ps(v2, v3);
v0 = _mm256_hadd_ps(v0, v2);
// add lower and upper half: t0 = {sum(v0), sum(v1), sum(v2), sum(v3)}
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
// accumulate the four sums into m[0..3]
t2 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t2);
// fold the four sums into one scalar for the return value
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
// reads the lowest element of t0 through an MD_FLOAT pointer
return *((MD_FLOAT *) &t0);
}
/*
 * Loads the four floats at m and replicates them into both 128-bit halves of
 * the 256-bit result.
 */
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
    const __m128 *quad = (const __m128 *)(m);
    return _mm256_broadcast_ps(quad);
}
/*
 * Builds a 256-bit vector whose lower 128 bits hold m[0] in every element and
 * whose upper 128 bits hold m[1] in every element.
 */
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
    const __m128 lower = _mm_broadcast_ss(m);
    const __m128 upper = _mm_broadcast_ss(m + 1);
    const __m256 widened = _mm256_castps128_ps256(lower);
    return _mm256_insertf128_ps(widened, upper, 0x1);
}
// Reduces each 128-bit half of v0 and v1 to one partial sum, adds the four
// partial sums to m[0..3] (aligned 128-bit load/store) and returns the sum of
// all elements of v0 and v1.
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m128 t0, t1;
// pairwise adds inside each 128-bit half of v0 and v1
v0 = _mm256_hadd_ps(v0, v1);
t0 = _mm256_extractf128_ps(v0, 0x1);
// t0 = {sum(v0 low), sum(v1 low), sum(v0 high), sum(v1 high)}
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
// reorder to {sum(v0 low), sum(v0 high), sum(v1 low), sum(v1 high)}
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
// accumulate the four partial sums into m[0..3]
t1 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t1);
// fold the four partial sums into one scalar for the return value
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
// reads the lowest element of t0 through an MD_FLOAT pointer
return *((MD_FLOAT *) &t0);
}
/*
 * Adds the upper 128-bit half of a onto its lower half and subtracts the
 * resulting 4 floats from m[0..3] (aligned 128-bit load/store).
 *
 * Declared static inline like every other helper in this header: a plain
 * C99 `inline` definition provides no external definition, so a call that the
 * compiler does not inline would fail at link time.
 */
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
    __m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
    _mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
}
// Applies simd_h_decr three times: a0 at m, a1 at m + CLUSTER_N and a2 at
// m + CLUSTER_N * 2, in that order.
// NOTE(review): presumably the three offsets are the x/y/z components of one cluster - confirm against callers.
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}

View File

@ -1,15 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#include <atom.h>
#ifndef __THERMO_H_
#define __THERMO_H_
extern void setupThermo(Parameter*, int);
extern void computeThermo(int, Parameter*, Atom*);
extern void adjustThermo(Parameter*, Atom*);
#endif

View File

@ -1,17 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __TIMERS_H_
#define __TIMERS_H_
// Identifiers of the measured timers, numbered consecutively from 0.
// NUMTIMER is not a timer itself: being the last enumerator, its value equals the number of timers.
// NOTE(review): presumably used to size and index a timer array - confirm against the callers.
typedef enum {
TOTAL = 0,
NEIGH,
FORCE,
NUMTIMER
} timertype;
#endif

View File

@ -1,21 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <time.h>
/*
 * Returns the current CLOCK_MONOTONIC reading in seconds as a double
 * (whole seconds plus the nanosecond part scaled by 1e-9).
 */
double getTimeStamp(void)
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole = (double)now.tv_sec;
    const double fraction = (double)now.tv_nsec * 1.e-9;
    return whole + fraction;
}
/*
 * Returns the resolution of CLOCK_MONOTONIC in seconds as a double
 * (whole seconds plus the nanosecond part scaled by 1e-9).
 */
double getTimeResolution(void)
{
    struct timespec res;
    clock_getres(CLOCK_MONOTONIC, &res);

    const double whole = (double)res.tv_sec;
    const double fraction = (double)res.tv_nsec * 1.e-9;
    return whole + fraction;
}

View File

@ -1,13 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __TIMING_H_
#define __TIMING_H_
extern double getTimeStamp(void);
extern double getTimeResolution(void);
#endif

View File

@ -1,120 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <util.h>
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
#define IA 16807
#define IM 2147483647
#define AM (1.0 / IM)
#define IQ 127773
#define IR 2836
#define MASK 123459876
/*
 * Advances *seed by one step of the generator and returns the new state
 * scaled by AM = 1/IM.
 *
 * The update IA * (seed mod IQ) - IR * (seed / IQ) is evaluated in int
 * arithmetic; a negative result is wrapped by adding IM.
 */
double myrandom(int* seed)
{
    const int quotient = (*seed) / IQ;
    const int remainder = (*seed) % IQ;
    int next = IA * remainder - IR * quotient;

    if (next < 0) {
        next += IM;
    }
    *seed = next;
    return AM * next;
}
/*
 * Derives a new seed from ibase and the three doubles at coord.
 *
 * The bytes of ibase followed by the bytes of coord[0..2] are mixed into an
 * unsigned hash, the hash is finalized with three more mixing steps and masked
 * into *seed. The generator is then advanced five times via myrandom().
 */
void random_reset(int* seed, int ibase, double* coord)
{
    const char* regions[2] = { (char*)&ibase, (char*)coord };
    const int lengths[2] = { sizeof(int), 3 * sizeof(double) };
    unsigned int hash = 0;

    for (int r = 0; r < 2; r++) {
        const char* bytes = regions[r];
        for (int b = 0; b < lengths[r]; b++) {
            hash += bytes[b];
            hash += (hash << 10);
            hash ^= (hash >> 6);
        }
    }

    hash += (hash << 3);
    hash ^= (hash >> 11);
    hash += (hash << 15);

    // the mask 0x7ffffff keeps the low 27 bits of the hash as the new seed
    // do not allow seed = 0, since will cause hang in gaussian()
    *seed = hash & 0x7ffffff;
    if (*seed == 0) {
        *seed = 1;
    }

    // warm up the RNG
    for (int warm = 0; warm < 5; warm++) {
        myrandom(seed);
    }
    // save = 0;
}
/*
 * Maps a force-field name to its FF_* identifier by prefix comparison
 * ("lj", "eam", "dem"). Returns -1 when no prefix matches.
 */
int str2ff(const char* string)
{
    static const struct {
        const char* prefix;
        size_t length;
        int id;
    } known[] = {
        { "lj", 2, FF_LJ },
        { "eam", 3, FF_EAM },
        { "dem", 3, FF_DEM },
    };

    for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
        if (strncmp(string, known[k].prefix, known[k].length) == 0) {
            return known[k].id;
        }
    }
    return -1;
}
/*
 * Maps an FF_* identifier to its name ("lj", "eam", "dem").
 * Any other value yields "invalid".
 */
const char* ff2str(int ff)
{
    switch (ff) {
    case FF_LJ:
        return "lj";
    case FF_EAM:
        return "eam";
    case FF_DEM:
        return "dem";
    default:
        return "invalid";
    }
}
/*
 * Returns the number of CUDA threads per block requested through the
 * NUM_THREADS environment variable.
 *
 * Falls back to 32 when the variable is unset, does not start with a decimal
 * number, or is not a positive value that fits an int, so callers never get a
 * zero or negative block size. Trailing non-digit characters after the number
 * are ignored, as with atoi().
 */
int get_cuda_num_threads(void)
{
    const char* num_threads_env = getenv("NUM_THREADS");

    if (num_threads_env != NULL) {
        char* end = NULL;
        const long requested = strtol(num_threads_env, &end, 10);
        // end == input means no digits were consumed at all
        if (end != num_threads_env && requested > 0 && requested <= 0x7fffffffL) {
            return (int)requested;
        }
    }
    return 32;
}
/*
 * Reads one line (at most MAXLINE - 1 characters) from fp into line.
 *
 * On a read error it reports the failure through perror() and terminates the
 * process via exit(-1). The stream's own error indicator is consulted rather
 * than errno, which may still hold a stale value from an unrelated earlier
 * call and would then turn a normal end-of-file into a fatal error.
 *
 * At end-of-file the function returns normally and line is left unchanged.
 */
void readline(char* line, FILE* fp)
{
    if (fgets(line, MAXLINE, fp) == NULL) {
        if (ferror(fp)) {
            perror("readline()");
            exit(-1);
        }
    }
}
/*
 * printf-style debug output to stdout.
 *
 * Only active when DEBUG is defined at compile time; otherwise the call does
 * nothing. A failing write is reported through perror().
 */
void debug_printf(const char* format, ...)
{
#ifdef DEBUG
    va_list arg;

    va_start(arg, format);
    if ((vfprintf(stdout, format, arg)) < 0) {
        perror("debug_printf()");
    }
    va_end(arg);
#else
    (void)format;
#endif
}

View File

@ -1,47 +0,0 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __UTIL_H_
#define __UTIL_H_
#include <stdio.h>
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define DEBUG_MESSAGE debug_printf
#ifndef MAXLINE
#define MAXLINE 4096
#endif
#define FF_LJ 0
#define FF_EAM 1
#define FF_DEM 2
#if PRECISION == 1
#define PRECISION_STRING "single"
#else
#define PRECISION_STRING "double"
#endif
extern double myrandom(int *);
extern void random_reset(int *seed, int ibase, double *coord);
extern int str2ff(const char *string);
extern const char *ff2str(int ff);
extern void readline(char *line, FILE *fp);
extern void debug_printf(const char *format, ...);
extern int get_cuda_num_threads(void);
#endif

View File

@ -1,8 +1,24 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <stdio.h>
#include <stdlib.h>
@ -27,7 +43,7 @@ void initEam(Eam* eam, Parameter* param) {
}
void coeff(Eam* eam, Parameter* param) {
read_eam_file(&eam->file, param->eam_file);
read_file(&eam->file, param->input_file);
param->mass = eam->file.mass;
param->cutforce = eam->file.cut;
param->cutneigh = param->cutforce + 1.0;
@ -43,7 +59,7 @@ void init_style(Eam* eam, Parameter* param) {
array2spline(eam, param);
}
void read_eam_file(Funcfl* file, const char* filename) {
void read_file(Funcfl* file, const char* filename) {
FILE* fptr;
char line[MAXLINE];
@ -54,10 +70,10 @@ void read_eam_file(Funcfl* file, const char* filename) {
}
int tmp;
readline(line, fptr);
readline(line, fptr);
fgets(line, MAXLINE, fptr);
fgets(line, MAXLINE, fptr);
sscanf(line, "%d %lg", &tmp, &(file->mass));
readline(line, fptr);
fgets(line, MAXLINE, fptr);
sscanf(line, "%d %lg %d %lg %lg", &file->nrho, &file->drho, &file->nr, &file->dr, &file->cut);
//printf("Read: %lf %i %lf %i %lf %lf\n",file->mass,file->nrho,file->drho,file->nr,file->dr,file->cut);
@ -261,9 +277,9 @@ void grab(FILE* fptr, int n, MD_FLOAT* list) {
int i = 0;
while(i < n) {
readline(line, fptr);
fgets(line, MAXLINE, fptr);
ptr = strtok(line, " \t\n\r\f");
list[i++] = atof(ptr);
while((ptr = strtok(NULL, " \t\n\r\f"))) list[i++] = atof(ptr);
while(ptr = strtok(NULL, " \t\n\r\f")) list[i++] = atof(ptr);
}
}

View File

@ -1,12 +1,32 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <likwid-marker.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
#include <stdio.h>
@ -99,4 +119,114 @@
# define DIST_TRACE(l, e)
#endif
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);
/*
 * Lennard-Jones force computation with optional memory/index tracing.
 *
 * Zeroes the forces of the Nlocal local atoms, then for every local atom sums
 * the pair forces over its neighbor list (pairs with rsq < cutforcesq only)
 * and adds the result to fx/fy/fz. The atom loop is repeated ATOMS_LOOP_RUNS
 * times unless first_exec is set, in which case it runs once.
 *
 * Returns the wall-clock time in seconds spent between the two
 * getTimeStamp() calls around the force loops.
 *
 * NOTE(review): addStat() is invoked from inside the OpenMP parallel loop; its
 * definition is not visible here - confirm it is safe to call concurrently.
 */
double computeForceTracing(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
    MEM_TRACER_INIT;
    INDEX_TRACER_INIT;
    int Nlocal = atom->Nlocal;
    MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
    #ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
    #endif

    for(int i = 0; i < Nlocal; i++) {
        fx[i] = 0.0;
        fy[i] = 0.0;
        fz[i] = 0.0;
    }

    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

    for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
        #pragma omp parallel for
        for(int i = 0; i < Nlocal; i++) {
            // Declared inside the loop body so every thread has its own pointer;
            // a function-scope variable would be shared between the threads.
            int* neighs = &neighbor->neighbors[i * neighbor->maxneighs];
            int numneighs = neighbor->numneigh[i];
            MD_FLOAT xtmp = atom_x(i);
            MD_FLOAT ytmp = atom_y(i);
            MD_FLOAT ztmp = atom_z(i);
            MD_FLOAT fix = 0;
            MD_FLOAT fiy = 0;
            MD_FLOAT fiz = 0;

            MEM_TRACE(atom_x(i), 'R');
            MEM_TRACE(atom_y(i), 'R');
            MEM_TRACE(atom_z(i), 'R');
            INDEX_TRACE_ATOM(i);

            #ifdef EXPLICIT_TYPES
            const int type_i = atom->type[i];
            // atom->type is an array, so it is indexed rather than called
            MEM_TRACE(atom->type[i], 'R');
            #endif

            #if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
            #define REPEAT_NEIGHBORS_LOOP
            for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
            #endif

            //DIST_TRACE_SORT(neighs, numneighs);
            INDEX_TRACE(neighs, numneighs);
            //DIST_TRACE(neighs, numneighs);

            for(int k = 0; k < numneighs; k++) {
                int j = neighs[k];
                MD_FLOAT delx = xtmp - atom_x(j);
                MD_FLOAT dely = ytmp - atom_y(j);
                MD_FLOAT delz = ztmp - atom_z(j);
                MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

                MEM_TRACE(neighs[k], 'R');
                MEM_TRACE(atom_x(j), 'R');
                MEM_TRACE(atom_y(j), 'R');
                MEM_TRACE(atom_z(j), 'R');

                #ifdef EXPLICIT_TYPES
                const int type_j = atom->type[j];
                const int type_ij = type_i * atom->ntypes + type_j;
                const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
                const MD_FLOAT sigma6 = atom->sigma6[type_ij];
                const MD_FLOAT epsilon = atom->epsilon[type_ij];
                MEM_TRACE(atom->type[j], 'R');
                #endif

                if(rsq < cutforcesq) {
                    MD_FLOAT sr2 = 1.0 / rsq;
                    MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                    MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
                    fix += delx * force;
                    fiy += dely * force;
                    fiz += delz * force;
                }
            }

            #ifdef REPEAT_NEIGHBORS_LOOP
            }
            #endif

            fx[i] += fix;
            fy[i] += fiy;
            fz[i] += fiz;

            addStat(stats->total_force_neighs, numneighs);
            addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
            MEM_TRACE(fx[i], 'R');
            MEM_TRACE(fx[i], 'W');
            MEM_TRACE(fy[i], 'R');
            MEM_TRACE(fy[i], 'W');
            MEM_TRACE(fz[i], 'R');
            MEM_TRACE(fz[i], 'W');
        }
    }

    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    INDEX_TRACER_END;
    MEM_TRACER_END;
    return E-S;
}

274
src/force.cu Normal file
View File

@ -0,0 +1,274 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
extern "C" {
#include <likwid-marker.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <allocate.h>
}
// cuda kernel
/*
 * Lennard-Jones force kernel: one thread per local atom.
 *
 * Launch layout: 1D grid of 1D blocks with at least Nlocal threads in total;
 * threads with a global index >= Nlocal return immediately. No shared memory.
 *
 * a               - Atom descriptor passed by value; its pointer members are dereferenced on the device
 * cutforcesq,
 * sigma6, epsilon - global pair parameters (shadowed per pair when EXPLICIT_TYPES is defined)
 * Nlocal          - number of local atoms
 * neigh_maxneighs - not used by the kernel
 * neigh_neighbors - neighbor indices; the k-th neighbor of atom i is read from
 *                   index a.Nlocal * k + i, so the k-th neighbors of consecutive atoms are adjacent
 * neigh_numneigh  - number of neighbors per local atom
 *
 * The force of atom i is overwritten (not accumulated) with the sum over its neighbors.
 */
__global__ void calc_force(
    Atom a,
    MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon,
    int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if( i >= Nlocal ) {
        return;
    }

    // the atom_*() accessor macros expect a pointer named `atom` in scope
    Atom *atom = &a;
    const int numneighs = neigh_numneigh[i];

    MD_FLOAT xtmp = atom_x(i);
    MD_FLOAT ytmp = atom_y(i);
    MD_FLOAT ztmp = atom_z(i);
    MD_FLOAT fix = 0;
    MD_FLOAT fiy = 0;
    MD_FLOAT fiz = 0;

    #ifdef EXPLICIT_TYPES
    // type of the atom handled by this thread; needed for the per-pair parameter lookup below
    const int type_i = atom->type[i];
    #endif

    for(int k = 0; k < numneighs; k++) {
        int j = neigh_neighbors[atom->Nlocal * k + i];
        MD_FLOAT delx = xtmp - atom_x(j);
        MD_FLOAT dely = ytmp - atom_y(j);
        MD_FLOAT delz = ztmp - atom_z(j);
        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

        #ifdef EXPLICIT_TYPES
        const int type_j = atom->type[j];
        const int type_ij = type_i * atom->ntypes + type_j;
        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
        const MD_FLOAT epsilon = atom->epsilon[type_ij];
        #endif

        if(rsq < cutforcesq) {
            MD_FLOAT sr2 = 1.0 / rsq;
            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
            fix += delx * force;
            fiy += dely * force;
            fiz += delz * force;
        }
    }

    atom_fx(i) = fix;
    atom_fy(i) = fiy;
    atom_fz(i) = fiz;
}
/*
 * First integration step: one thread per local atom.
 *
 * Launch layout: 1D grid of 1D blocks with at least Nlocal threads in total;
 * surplus threads do nothing. Each thread adds dtforce * force to the velocity
 * of its atom and then adds dt * (updated velocity) to the position.
 */
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    if( i < Nlocal ) {
        // the atom_*() accessor macros expect a pointer named `atom` in scope
        Atom *atom = &a;

        // velocity update from the current forces
        atom_vx(i) += dtforce * atom_fx(i);
        atom_vy(i) += dtforce * atom_fy(i);
        atom_vz(i) += dtforce * atom_fz(i);

        // position update using the velocities computed above
        atom_x(i) = atom_x(i) + dt * atom_vx(i);
        atom_y(i) = atom_y(i) + dt * atom_vy(i);
        atom_z(i) = atom_z(i) + dt * atom_vz(i);
    }
}
/*
 * Second integration step: one thread per local atom.
 *
 * Launch layout: 1D grid of 1D blocks with at least Nlocal threads in total;
 * surplus threads do nothing. Each thread adds dtforce * force to the velocity
 * of its atom.
 */
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    if( i < Nlocal ) {
        // the atom_*() accessor macros expect a pointer named `atom` in scope
        Atom *atom = &a;

        atom_vx(i) += dtforce * atom_fx(i);
        atom_vy(i) += dtforce * atom_fy(i);
        atom_vz(i) += dtforce * atom_fz(i);
    }
}
extern "C" {
static Atom c_atom;
int *c_neighs;
int *c_neigh_numneigh;
/*
 * Returns the number of CUDA threads per block requested through the
 * NUM_THREADS environment variable.
 *
 * Falls back to 32 when the variable is unset, does not start with a decimal
 * number, or is not a positive value that fits an int, so kernel launches
 * never receive a zero or negative block size. Trailing non-digit characters
 * after the number are ignored, as with atoi().
 */
int get_num_threads() {
    const char *num_threads_env = getenv("NUM_THREADS");

    if(num_threads_env != nullptr) {
        char *end = nullptr;
        const long requested = strtol(num_threads_env, &end, 10);
        // end == input means no digits were consumed at all
        if(end != num_threads_env && requested > 0 && requested <= 0x7fffffffL) {
            return (int) requested;
        }
    }
    return 32;
}
/*
 * Runs the final velocity integration on the device for all local atoms.
 *
 * doReneighbour - when true, the device velocities are copied back to atom->vx afterwards
 * param         - supplies dtforce
 * atom          - host atom data; only Nlocal and the vx destination are used here
 *
 * The launch is checked and synchronized before the optional copy. Nothing is
 * done when there are no local atoms.
 */
void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
    if(Nlocal <= 0) {
        // a grid with zero blocks is an invalid launch configuration
        return;
    }

    const int num_threads = get_num_threads();
    // this should be multiple of 32 as operations are performed at the level of warps;
    // a non-positive request falls back to one warp
    const int num_threads_per_block = (num_threads > 0) ? num_threads : 32;
    // integer ceiling division: exact for every Nlocal, unlike a float division
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;

    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, c_atom);

    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );

    if(doReneighbour) {
        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
    }
}
/*
 * Runs the initial integration (velocities, then positions) on the device for
 * all local atoms and copies the updated positions back to atom->x.
 *
 * doReneighbour - when true, the device velocities are copied back to atom->vx as well
 * param         - supplies dtforce and dt
 * atom          - host atom data; only Nlocal and the x/vx destinations are used here
 *
 * The launch is checked and synchronized before the copies. Nothing is done
 * when there are no local atoms.
 */
void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom) {
    const int Nlocal = atom->Nlocal;
    if(Nlocal <= 0) {
        // a grid with zero blocks is an invalid launch configuration
        return;
    }

    const int num_threads = get_num_threads();
    // this should be multiple of 32 as operations are performed at the level of warps;
    // a non-positive request falls back to one warp
    const int num_threads_per_block = (num_threads > 0) ? num_threads : 32;
    // integer ceiling division: exact for every Nlocal, unlike a float division
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;

    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, c_atom);

    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );

    if(doReneighbour) {
        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
    }
    checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
}
/*
 * Allocates the device-side buffers of c_atom and the neighbor lists, and
 * uploads positions, velocities, atom types and the per-type-pair parameter
 * tables from the host.
 *
 * Positions and types are sized by atom->Nmax, forces and velocities by
 * atom->Nlocal, the neighbor list by Nlocal * neighbor->maxneighs. The force
 * and neighbor buffers are allocated but not filled here.
 *
 * NOTE(review): buffer sizes are fixed at the values seen in this call -
 * confirm that Nmax, Nlocal and maxneighs cannot grow afterwards.
 */
void initCudaAtom(Atom *atom, Neighbor *neighbor) {
    const int Nlocal = atom->Nlocal;

    // byte sizes of the individual device buffers
    const size_t positionBytes = sizeof(MD_FLOAT) * atom->Nmax * 3;
    const size_t localVectorBytes = sizeof(MD_FLOAT) * Nlocal * 3;
    const size_t typeBytes = sizeof(int) * atom->Nmax;
    const size_t pairTableBytes = sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes;
    const size_t neighborBytes = sizeof(int) * Nlocal * neighbor->maxneighs;
    const size_t numneighBytes = sizeof(int) * Nlocal;

    checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), positionBytes) );
    checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, positionBytes, cudaMemcpyHostToDevice) );

    checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), localVectorBytes) );

    checkCUDAError( "c_atom.vx malloc", cudaMalloc((void**)&(c_atom.vx), localVectorBytes) );
    checkCUDAError( "c_atom.vx memcpy", cudaMemcpy(c_atom.vx, atom->vx, localVectorBytes, cudaMemcpyHostToDevice) );

    checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), typeBytes) );
    checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), pairTableBytes) );
    checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), pairTableBytes) );
    checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), pairTableBytes) );

    checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, neighborBytes) );
    checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, numneighBytes) );

    checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, typeBytes, cudaMemcpyHostToDevice) );
    checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, pairTableBytes, cudaMemcpyHostToDevice) );
    checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, pairTableBytes, cudaMemcpyHostToDevice) );
    checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, pairTableBytes, cudaMemcpyHostToDevice) );
}
/*
 * Computes the Lennard-Jones forces of all local atoms on the device.
 *
 * reneighbourHappenend - when true, the host neighbor lists are uploaded before the kernel runs
 * param                - supplies cutforce, sigma6 and epsilon
 * atom                 - host atom data; positions are uploaded on every call
 * neighbor             - host neighbor lists (numneigh, neighbors, maxneighs)
 *
 * Returns the wall-clock time in seconds spent in the kernel launch and the
 * following device synchronization; the uploads are not part of the timing.
 * The forces stay on the device, nothing is copied back here.
 *
 * NOTE(review): the uploads use the current Nmax / Nlocal / maxneighs while the
 * device buffers were sized in initCudaAtom - confirm these cannot grow in between.
 */
double computeForce(
        bool reneighbourHappenend,
        Parameter *param,
        Atom *atom,
        Neighbor *neighbor
        )
{
    int Nlocal = atom->Nlocal;

    // Always defined so that the kernel launch below also compiles with
    // EXPLICIT_TYPES; in that configuration the kernel looks the values up
    // per type pair and ignores these arguments.
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    const int num_threads = get_num_threads();

    c_atom.Natoms = atom->Natoms;
    c_atom.Nlocal = atom->Nlocal;
    c_atom.Nghost = atom->Nghost;
    c_atom.Nmax = atom->Nmax;
    c_atom.ntypes = atom->ntypes;

    /*
    int nDevices;
    cudaGetDeviceCount(&nDevices);
    size_t free, total;
    for(int i = 0; i < nDevices; ++i) {
        cudaMemGetInfo( &free, &total );
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
    }
    */

    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
    // checkCUDAError( "c_atom.fx memset", cudaMemset(c_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );

    cudaProfilerStart();

    checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );

    if(reneighbourHappenend) {
        checkCUDAError( "c_neigh_numneigh memcpy", cudaMemcpy(c_neigh_numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
        checkCUDAError( "c_neighs memcpy", cudaMemcpy(c_neighs, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
    }

    // this should be multiple of 32 as operations are performed at the level of warps;
    // a non-positive request falls back to one warp
    const int num_threads_per_block = (num_threads > 0) ? num_threads : 32;
    // integer ceiling division: exact for every Nlocal, unlike a float division
    const int num_blocks = (Nlocal + num_threads_per_block - 1) / num_threads_per_block;

    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

    // a grid with zero blocks is an invalid launch configuration, so skip the launch without atoms
    if(num_blocks > 0) {
        calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
    }

    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );

    cudaProfilerStop();

    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    return E-S;
}
}

View File

@ -1,8 +1,24 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <likwid-marker.h>
#include <math.h>
@ -16,8 +32,7 @@
#include <eam.h>
#include <util.h>
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
/*
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
if(eam->nmax < atom->Nmax) {
eam->nmax = atom->Nmax;
if(eam->fp != NULL) { free(eam->fp); }
@ -28,13 +43,11 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
int* neighs;
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
int rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; int rdrho = eam->rdrho;
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
*/
double S = getTimeStamp();
LIKWID_MARKER_START("force_eam_fp");
/*
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
@ -194,7 +207,6 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
*/
LIKWID_MARKER_STOP("force_eam");
double E = getTimeStamp();
return E-S;

33
src/includes/allocate.h Normal file
View File

@ -0,0 +1,33 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <stdlib.h>
#include <cuda_runtime.h>
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_
/* Allocate bytesize bytes; alignment is the requested alignment (callers pass ALIGNMENT). */
extern void* allocate (int alignment, size_t bytesize);
/* Resize a buffer from oldBytesize to newBytesize bytes.
 * NOTE(review): presumably preserves the old contents -- confirm in allocate.c. */
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
/* Check a CUDA runtime status; msg labels the call site (e.g. "DeviceSync ComputeForce").
 * NOTE(review): whether a failure aborts the program is not visible from this header. */
extern void checkCUDAError(const char *msg, cudaError_t err);
#endif

78
src/includes/atom.h Normal file
View File

@ -0,0 +1,78 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <parameter.h>
#ifndef __ATOM_H_
#define __ATOM_H_
/*
 * Per-atom state plus the per-type-pair force-field tables.
 * Always access positions, forces and velocities through the atom_*() macros
 * below so the code works for both the AoS and the SoA layout.
 */
typedef struct {
/* Nlocal: atoms iterated by the integrators and force loop; Nghost: ghost atoms;
 * Natoms: atom count reported in the final summary; Nmax: current capacity
 * (growAtom() is called when Nlocal gets close to it). */
int Natoms, Nlocal, Nghost, Nmax;
/* Positions. With AOS defined only x is indexed (interleaved x,y,z triples). */
MD_FLOAT *x, *y, *z;
/* Velocities. With AOS defined only vx is indexed (interleaved triples). */
MD_FLOAT *vx, *vy, *vz;
/* Forces. With AOS defined only fx is indexed (interleaved triples). */
MD_FLOAT *fx, *fy, *fz;
/* NOTE(review): presumably maps ghost atoms to their source atoms -- confirm in pbc.c. */
int *border_map;
/* Per-atom type index in [0, ntypes). */
int *type;
/* Number of atom types. */
int ntypes;
/* Per-type-pair tables; the stub driver allocates ntypes * ntypes entries each. */
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
} Atom;
/* Initialise an Atom container before any atoms are added. */
extern void initAtom(Atom*);
/* Populate the container from the simulation parameters.
 * NOTE(review): lattice details live in atom.c and are not visible here. */
extern void createAtom(Atom*, Parameter*);
/* Enlarge the per-atom storage; callers loop on it until Nmax leaves enough room. */
extern void growAtom(Atom*);
/*
 * Layout-independent accessors. All of them expect a variable named `atom`
 * (an Atom*) to be in scope at the point of use.
 */
#ifdef AOS
/* Array-of-structures: the three components of atom i are stored contiguously
 * at indices 3*i, 3*i+1, 3*i+2 of the x / fx / vx arrays. */
#define POS_DATA_LAYOUT "AoS"
#define atom_x(i) atom->x[(i) * 3 + 0]
#define atom_y(i) atom->x[(i) * 3 + 1]
#define atom_z(i) atom->x[(i) * 3 + 2]
#define atom_fx(i) atom->fx[(i) * 3 + 0]
#define atom_fy(i) atom->fx[(i) * 3 + 1]
#define atom_fz(i) atom->fx[(i) * 3 + 2]
#define atom_vx(i) atom->vx[(i) * 3 + 0]
#define atom_vy(i) atom->vx[(i) * 3 + 1]
#define atom_vz(i) atom->vx[(i) * 3 + 2]
#else
/* Structure-of-arrays: every component lives in its own array indexed by i. */
#define POS_DATA_LAYOUT "SoA"
#define atom_x(i) atom->x[i]
#define atom_y(i) atom->y[i]
#define atom_z(i) atom->z[i]
#define atom_fx(i) atom->fx[i]
#define atom_fy(i) atom->fy[i]
#define atom_fz(i) atom->fz[i]
#define atom_vx(i) atom->vx[i]
#define atom_vy(i) atom->vy[i]
#define atom_vz(i) atom->vz[i]
#endif
#endif

55
src/includes/eam.h Normal file
View File

@ -0,0 +1,55 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <stdio.h>
#include <atom.h>
#include <parameter.h>
#ifndef __EAM_H_
#define __EAM_H_
/*
 * Raw tabulated potential data; read_file() fills one of these.
 * NOTE(review): field meanings below are inferred from names -- confirm in eam.c.
 */
typedef struct {
/* Presumably the number of tabulated density (nrho) and distance (nr) samples. */
int nrho, nr;
/* Presumably the sample spacings, the cutoff radius and the atomic mass. */
MD_FLOAT drho, dr, cut, mass;
/* Tabulated function values. */
MD_FLOAT *frho, *rhor, *zr;
} Funcfl;
/*
 * State of the EAM force field.
 * NOTE(review): field meanings below are partly inferred from names -- confirm in eam.c.
 */
typedef struct {
/* Per-atom scratch array; computeForceEam() re-creates it when nmax < atom->Nmax. */
MD_FLOAT* fp;
/* Capacity that fp was last sized for. */
int nmax;
int nrho, nr;
int nrho_tot, nr_tot;
/* Presumably sample spacings (dr, drho) and their reciprocals (rdr, rdrho). */
MD_FLOAT dr, rdr, drho, rdrho;
MD_FLOAT *frho, *rhor, *z2r;
/* Spline coefficient tables read by the force kernel. */
MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
/* Raw potential data as read from the input file. */
Funcfl file;
} Eam;
/* Entry point: called by setup() when the force field is FF_EAM. */
void initEam(Eam* eam, Parameter* param);
/* NOTE(review): the helpers below are documented from their signatures only;
 * their exact semantics are defined in eam.c. */
void coeff(Eam* eam, Parameter* param);
void init_style(Eam* eam, Parameter *param);
/* Read a potential file into a Funcfl record. */
void read_file(Funcfl* file, const char* filename);
void file2array(Eam* eam);
void array2spline(Eam* eam, Parameter* param);
/* Presumably builds spline coefficients for n samples of f spaced by delta. */
void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
/* Presumably reads n values from fptr into list. */
void grab(FILE* fptr, int n, MD_FLOAT* list);
#endif

View File

@ -1,8 +1,32 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
* =======================================================================================
*
* Filename: likwid-marker.h
*
* Description: Header File of likwid Marker API
*
* Version: <VERSION>
* Released: <DATE>
*
* Authors: Thomas Gruber (tg), thomas.roehl@googlemail.com
*
* Project: likwid
*
* Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
* =======================================================================================
*/
#ifndef LIKWID_MARKER_H
#define LIKWID_MARKER_H

41
src/includes/neighbor.h Normal file
View File

@ -0,0 +1,41 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <atom.h>
#include <parameter.h>
#ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_
/*
 * Neighbour-list storage.
 */
typedef struct {
/* NOTE(review): presumably the rebuild interval in timesteps -- confirm in neighbor.c. */
int every;
/* NOTE(review): presumably counts neighbour-list builds -- confirm in neighbor.c. */
int ncalls;
/* Flat list; the entries of atom i start at neighbors[i * maxneighs]. */
int* neighbors;
/* Row stride of the neighbors array. */
int maxneighs;
/* NOTE(review): presumably the number of valid entries per atom. */
int* numneigh;
} Neighbor;
/* Initialise the neighbour-list state from the simulation parameters. */
extern void initNeighbor(Neighbor*, Parameter*);
/* Called once after initNeighbor() and before the first buildNeighbor(). */
extern void setupNeighbor();
/* NOTE(review): presumably sorts atoms into spatial bins -- confirm in neighbor.c. */
extern void binatoms(Atom*);
/* (Re)build the neighbour lists for the current atom positions. */
extern void buildNeighbor(Atom*, Neighbor*);
/* Currently not called from the main loop (the call in reneighbour() is commented out). */
extern void sortAtom(Atom*);
#endif

57
src/includes/parameter.h Normal file
View File

@ -0,0 +1,57 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#ifndef __PARAMETER_H_
#define __PARAMETER_H_
/* Force-field selectors stored in Parameter.force_field. */
#define FF_LJ 0
#define FF_EAM 1
/* Floating-point type of all simulation data: float when PRECISION == 1,
 * double otherwise (including when PRECISION is undefined). */
#if PRECISION == 1
#define MD_FLOAT float
#else
#define MD_FLOAT double
#endif
/*
 * Run-time configuration of a simulation; defaults are set by init().
 */
typedef struct {
/* FF_LJ or FF_EAM. */
int force_field;
/* Input file for EAM (command-line option -i); NULL when unset. */
char* input_file;
/* VTK output file (command-line option --vtk); NULL disables VTK output. */
char* vtk_file;
/* Force-field constants copied into the per-type-pair tables. */
MD_FLOAT epsilon;
MD_FLOAT sigma6;
MD_FLOAT temp;
/* Density; setup() derives the lattice constant as pow(4.0 / rho, 1.0 / 3.0). */
MD_FLOAT rho;
MD_FLOAT mass;
/* Number of atom types. */
int ntypes;
/* Number of timesteps to simulate. */
int ntimes;
/* computeThermo() is invoked every nstat timesteps. */
int nstat;
/* Neighbour lists are rebuilt every `every` timesteps. */
int every;
/* Timestep, and the factor applied to forces in the velocity update (0.5 * dt). */
MD_FLOAT dt;
MD_FLOAT dtforce;
/* Cutoff radii for the force computation and for the neighbour-list build. */
MD_FLOAT cutforce;
MD_FLOAT cutneigh;
/* Number of unit cells per dimension. */
int nx, ny, nz;
/* Unit-cell edge length. */
MD_FLOAT lattice;
/* Box extents: n{x,y,z} times the unit-cell edge length. */
MD_FLOAT xprd, yprd, zprd;
/* Processor frequency in GHz, used only for cycle estimates. */
double proc_freq;
} Parameter;
#endif

32
src/includes/pbc.h Normal file
View File

@ -0,0 +1,32 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <atom.h>
#include <parameter.h>
#ifndef __PBC_H_
#define __PBC_H_
/* Periodic-boundary handling.
 * NOTE(review): main.c calls initPbc(atom) although no parameter is declared here;
 * this only compiles as C because of the empty (unspecified) parameter list. */
extern void initPbc();
/* Called every timestep on which no reneighbouring happens, and after setupPbc(). */
extern void updatePbc(Atom*, Parameter*);
/* Called first in reneighbour(); presumably wraps local atoms back into the box -- confirm in pbc.c. */
extern void updateAtomsPbc(Atom*, Parameter*);
/* Called before updatePbc() whenever the neighbour lists are rebuilt. */
extern void setupPbc(Atom*, Parameter*);
#endif

46
src/includes/stats.h Normal file
View File

@ -0,0 +1,46 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <atom.h>
#include <parameter.h>
#ifndef __STATS_H_
#define __STATS_H_
/*
 * Counters accumulated by the force kernels when COMPUTE_STATS is defined.
 */
typedef struct {
/* NOTE(review): presumably the total number of neighbours visited -- confirm in the force kernel. */
long long int total_force_neighs;
/* Accumulates ceil(numneighs / VECTOR_WIDTH) per atom in the EAM kernel. */
long long int total_force_iters;
} Stats;
/* Initialise the counters of s before the first force computation. */
void initStats(Stats *s);
/* Report the collected statistics; timer is indexed by the timertype enum. */
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
/*
 * Statistics helpers; they expand to nothing unless COMPUTE_STATS is defined.
 * Caveats: the expansions already end in ';' and are not wrapped in a block,
 * so do not use them as the sole body of an unbraced if/else. beginStatTimer()
 * declares a local named Si that endStatTimer() reads, so both must be used in
 * the same scope and at most once per scope.
 */
#ifdef COMPUTE_STATS
# define addStat(stat, value) stat += value;
# define beginStatTimer() double Si = getTimeStamp();
# define endStatTimer(stat) stat += getTimeStamp() - Si;
#else
# define addStat(stat, value)
# define beginStatTimer()
# define endStatTimer(stat)
#endif
#endif

31
src/includes/thermo.h Normal file
View File

@ -0,0 +1,31 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <parameter.h>
#include <atom.h>
#ifndef __THERMO_H_
#define __THERMO_H_
/* Initialise the thermo module; the int is the atom count (setup() passes atom->Natoms). */
extern void setupThermo(Parameter*, int);
/* Compute thermodynamic output; the int is the timestep (0 initially, -1 for the final call). */
extern void computeThermo(int, Parameter*, Atom*);
/* Called once in setup() after the atoms are created.
 * NOTE(review): presumably rescales velocities to the target temperature -- confirm in thermo.c. */
extern void adjustThermo(Parameter*, Atom*);
#endif

11
src/includes/timers.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef __TIMERS_H_
#define __TIMERS_H_
/* Indices into the timer array kept by main(). */
typedef enum {
/* Wall time of the whole timestep loop. */
TOTAL = 0,
/* Time accumulated in reneighbour(). */
NEIGH,
/* Time accumulated in the force computation. */
FORCE,
/* Number of timers; used as the array length. */
NUMTIMER
} timertype;
#endif

30
src/includes/timing.h Normal file
View File

@ -0,0 +1,30 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#ifndef __TIMING_H_
#define __TIMING_H_
/* Current time stamp; differences of two stamps are printed as seconds by the drivers. */
extern double getTimeStamp();
/* NOTE(review): presumably the resolution of the underlying clock -- confirm in timing.c. */
extern double getTimeResolution();
/* NOTE(review): presumably an alternate-linkage alias of getTimeStamp() -- confirm in timing.c. */
extern double getTimeStamp_();
#endif

37
src/includes/util.h Normal file
View File

@ -0,0 +1,37 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#ifndef __UTIL_H_
#define __UTIL_H_
/* Generic helpers. Each macro evaluates its arguments more than once, so do
 * not pass expressions with side effects (e.g. MIN(i++, j)). */
#ifndef MIN
#define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
/* NOTE(review): presumably a seeded pseudo-random generator that updates the seed
 * through the pointer -- confirm in util.c. */
extern double myrandom(int*);
#endif

28
src/includes/vtk.h Normal file
View File

@ -0,0 +1,28 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <atom.h>
#ifndef __VTK_H_
#define __VTK_H_
/* Write the atoms for the given timestep to a VTK file derived from filename.
 * NOTE(review): the meaning of the int return value is not visible here; callers ignore it. */
extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
#endif

253
src/main-stub.c Normal file
View File

@ -0,0 +1,253 @@
#include <stdio.h>
#include <string.h>
//---
#include <likwid-marker.h>
//---
#include <timing.h>
#include <allocate.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <thermo.h>
#include <pbc.h>
#include <timers.h>
/* Separator line for console output. */
#define HLINE "----------------------------------------------------------------------------\n"
/* Edge length of one unit cell of the synthetic lattice built by the stub. */
#define LATTICE_DISTANCE 10.0
/* Scale applied to the in-cell offsets of the atoms of one unit cell. */
#define NEIGH_DISTANCE 1.0
/* Force kernel under test. NOTE(review): the trailing ints are presumably
 * (first_exec, timestep), matching how main() passes them -- confirm in force.c. */
extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int);
/*
 * Fill `param` with the defaults of the stub benchmark.
 * Fields not listed here (input_file, vtk_file, force_field, xprd/yprd/zprd)
 * are left untouched.
 */
void init(Parameter *param) {
    /* Force-field constants */
    param->epsilon = 1.0;
    param->sigma6 = 1.0;
    param->rho = 0.8442;
    param->mass = 1.0;
    param->ntypes = 4;

    /* Problem size */
    param->ntimes = 200;
    param->nx = 4;
    param->ny = 4;
    param->nz = 2;
    param->lattice = LATTICE_DISTANCE;

    /* Cutoffs: the neighbour cutoff equals the force cutoff in the stub */
    param->cutforce = 5.0;
    param->cutneigh = param->cutforce;

    /* Unused by the stub itself, set for completeness */
    param->dt = 0.005;
    param->dtforce = 0.5 * param->dt;
    param->nstat = 100;
    param->temp = 1.44;
    param->every = 20;
    param->proc_freq = 0.0;
}
// Show debug messages
// (msg is handed to printf as the format string, so it must not contain stray '%')
//#define DEBUG(msg) printf(msg)
// Do not show debug messages
#define DEBUG(msg)
/*
 * Append one atom to `atom` at index atom->Nlocal and increment Nlocal.
 *
 * x, y, z    : offsets inside the current unit cell, scaled by NEIGH_DISTANCE
 *              and added to base_x / base_y / base_z (which must be in scope).
 * vx, vy, vz : initial velocity components.
 *
 * Positions and velocities are written through the atom_*() accessors so the
 * macro is correct for both the AoS and the SoA layout. (The former version
 * indexed atom->vx/vy/vz directly and assigned vy to the x component.)
 * The caller must make sure there is room for one more atom (see growAtom()).
 */
#define ADD_ATOM(x, y, z, vx, vy, vz) do { \
    atom_x(atom->Nlocal) = base_x + (x) * NEIGH_DISTANCE; \
    atom_y(atom->Nlocal) = base_y + (y) * NEIGH_DISTANCE; \
    atom_z(atom->Nlocal) = base_z + (z) * NEIGH_DISTANCE; \
    atom_vx(atom->Nlocal) = (vx); \
    atom_vy(atom->Nlocal) = (vy); \
    atom_vz(atom->Nlocal) = (vz); \
    atom->Nlocal++; \
} while(0)
/*
 * Return the value that follows the option at argv[*idx] and advance *idx onto it.
 * Prints an error and terminates if the option is the last argument, instead of
 * handing argv[argc] (a null pointer) to atoi()/atof().
 */
static const char* nextOptionValue(int argc, const char *argv[], int *idx) {
    if(*idx + 1 >= argc) {
        fprintf(stderr, "Missing value for option %s\n", argv[*idx]);
        exit(EXIT_FAILURE);
    }
    return argv[++(*idx)];
}

/*
 * Stub driver: builds a synthetic lattice of nx*ny*nz unit cells with a fixed
 * number of atoms per cell, builds the neighbour lists once and then times
 * param.ntimes invocations of computeForce() on that fixed configuration.
 *
 * Returns EXIT_SUCCESS; exits early for -h/--help or a malformed command line.
 */
int main(int argc, const char *argv[]) {
    Atom atom_data;
    /* The atom_*() accessor macros and ADD_ATOM expect a pointer named `atom`. */
    Atom *atom = (Atom *)(&atom_data);
    Neighbor neighbor;
    Stats stats;
    Parameter param;
    int atoms_per_unit_cell = 8;
    int csv = 0;

    LIKWID_MARKER_INIT;
    LIKWID_MARKER_REGISTER("force");
    DEBUG("Initializing parameters...\n");
    init(&param);

    /* argv[0] is the program name, so option parsing starts at index 1. */
    for(int i = 1; i < argc; i++)
    {
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0))
        {
            param.ntimes = atoi(nextOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nx") == 0))
        {
            param.nx = atoi(nextOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-ny") == 0))
        {
            param.ny = atoi(nextOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nz") == 0))
        {
            param.nz = atoi(nextOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-na") == 0))
        {
            atoms_per_unit_cell = atoi(nextOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-f") == 0))
        {
            param.proc_freq = atof(nextOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-csv") == 0))
        {
            csv = 1;
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0))
        {
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
            printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
            printf("-na <int>: set number of atoms per unit cell\n");
            printf("-f <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
            printf("-csv: set output as CSV style\n");
            printf(HLINE);
            exit(EXIT_SUCCESS);
        }
    }

    param.xprd = param.nx * LATTICE_DISTANCE;
    param.yprd = param.ny * LATTICE_DISTANCE;
    param.zprd = param.nz * LATTICE_DISTANCE;

    DEBUG("Initializing atoms...\n");
    initAtom(atom);
    initStats(&stats);

    /* Every type pair gets the same force-field constants. */
    atom->ntypes = param.ntypes;
    atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
        atom->epsilon[i] = param.epsilon;
        atom->sigma6[i] = param.sigma6;
        atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
        atom->cutforcesq[i] = param.cutforce * param.cutforce;
    }

    DEBUG("Creating atoms...\n");
    for(int i = 0; i < param.nx; ++i) {
        for(int j = 0; j < param.ny; ++j) {
            for(int k = 0; k < param.nz; ++k) {
                int added_atoms = 0;
                int fac_x = 1;
                int fac_y = 1;
                int fac_z = 1;
                int fmod = 0;
                MD_FLOAT base_x = i * LATTICE_DISTANCE;
                MD_FLOAT base_y = j * LATTICE_DISTANCE;
                MD_FLOAT base_z = k * LATTICE_DISTANCE;
                MD_FLOAT vx = 0.0;
                MD_FLOAT vy = 0.0;
                MD_FLOAT vz = 0.0;

                /* Make room for one complete unit cell before adding atoms. */
                while(atom->Nlocal > atom->Nmax - atoms_per_unit_cell) {
                    growAtom(atom);
                }

                /* Double the grid dimensions round-robin (x, y, z, x, ...) until
                 * the fac_x * fac_y * fac_z grid can hold all atoms of the cell. */
                while(fac_x * fac_y * fac_z < atoms_per_unit_cell) {
                    if(fmod == 0) { fac_x *= 2; }
                    if(fmod == 1) { fac_y *= 2; }
                    if(fmod == 2) { fac_z *= 2; }
                    fmod = (fmod + 1) % 3;
                }

                MD_FLOAT offset_x = (fac_x > 1) ? 1.0 / (fac_x - 1) : (int)fac_x;
                MD_FLOAT offset_y = (fac_y > 1) ? 1.0 / (fac_y - 1) : (int)fac_y;
                MD_FLOAT offset_z = (fac_z > 1) ? 1.0 / (fac_z - 1) : (int)fac_z;

                for(int ii = 0; ii < fac_x; ++ii) {
                    for(int jj = 0; jj < fac_y; ++jj) {
                        for(int kk = 0; kk < fac_z; ++kk) {
                            if(added_atoms < atoms_per_unit_cell) {
                                atom->type[atom->Nlocal] = rand() % atom->ntypes;
                                ADD_ATOM(ii * offset_x, jj * offset_y, kk * offset_z, vx, vy, vz);
                                added_atoms++;
                            }
                        }
                    }
                }
            }
        }
    }

    const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
    const double estim_neighbors_volume = (double)(atom->Nlocal * (atoms_per_unit_cell - 1 + 2) * sizeof(int));
    const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);

    if(!csv) {
        printf("Number of timesteps: %d\n", param.ntimes);
        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
        printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
        printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
        printf("Total number of atoms: %d\n", atom->Nlocal);
        printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
        printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
        printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
    }

    DEBUG("Initializing neighbor lists...\n");
    initNeighbor(&neighbor, &param);
    DEBUG("Setting up neighbor lists...\n");
    setupNeighbor();
    DEBUG("Building neighbor lists...\n");
    buildNeighbor(atom, &neighbor);

    /* One untimed invocation before the measured loop. */
    DEBUG("Computing forces...\n");
    computeForce(&param, atom, &neighbor, &stats, 1, 1);

    double S, E;
    S = getTimeStamp();
    for(int i = 0; i < param.ntimes; i++) {
        computeForce(&param, atom, &neighbor, &stats, 0, i + 1);
    }
    E = getTimeStamp();

    double T_accum = E-S;
    double freq_hz = param.proc_freq * 1.e9;
    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
    const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);

    if(!csv) {
        printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
        if(param.proc_freq > 0.0) {
            printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
        }
    } else {
        printf("steps,unit cells,atoms/unit cell,total atoms,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
        if(param.proc_freq > 0.0) {
            printf(",cy/atom,cy/neigh");
        }
        printf("\n");

        printf("%d,%dx%dx%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
            param.ntimes, param.nx, param.ny, param.nz, atoms_per_unit_cell, atom->Nlocal,
            estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
        if(param.proc_freq > 0.0) {
            printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
        }
        printf("\n");
    }

    /* Zero all timers so displayStatistics() never reads indeterminate values;
     * only the force timer is measured by this driver. */
    double timer[NUMTIMER] = { 0.0 };
    timer[FORCE] = T_accum;
    displayStatistics(atom, &param, &stats, timer);
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

335
src/main.c Normal file
View File

@ -0,0 +1,335 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <unistd.h>
#include <limits.h>
#include <math.h>
#include <float.h>
#include <likwid-marker.h>
#include <timing.h>
#include <allocate.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <thermo.h>
#include <pbc.h>
#include <timers.h>
#include <eam.h>
#include <vtk.h>
/* Separator line for console output. */
#define HLINE "----------------------------------------------------------------------------\n"
/* CUDA-side entry points implemented outside this file. */
/* Called once at the end of setup(), after the first neighbour-list build. */
extern void initCudaAtom(Atom *atom, Neighbor *neighbor);
/* Integration steps run around the force computation; doReneighbour tells them
 * whether the neighbour lists are rebuilt in the current timestep. */
extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom);
extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom);
/* Force kernels; each returns the elapsed time that main() adds to timer[FORCE].
 * The bool passed to computeForce() is true on the first call and on reneighbouring steps. */
extern double computeForce(bool, Parameter*, Atom*, Neighbor*);
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
/*
 * Fill `param` with the default simulation settings (Lennard-Jones force
 * field, 32x32x32 unit cells, 200 timesteps). Command-line options
 * parsed in main() override individual fields afterwards.
 */
void init(Parameter *param)
{
    /* Files and force-field selection */
    param->input_file = NULL;
    param->vtk_file = NULL;
    param->force_field = FF_LJ;

    /* Force-field and material constants */
    param->epsilon = 1.0;
    param->sigma6 = 1.0;
    param->rho = 0.8442;
    param->mass = 1.0;
    param->temp = 1.44;
    param->ntypes = 4;

    /* Problem size */
    param->nx = 32;
    param->ny = 32;
    param->nz = 32;

    /* Time stepping */
    param->ntimes = 200;
    param->dt = 0.005;
    param->dtforce = 0.5 * param->dt;
    param->nstat = 100;
    param->every = 20;

    /* Cutoffs: the neighbour cutoff is the force cutoff plus 0.30 */
    param->cutforce = 2.5;
    param->cutneigh = param->cutforce + 0.30;

    /* Processor frequency in GHz */
    param->proc_freq = 2.4;
}
/*
 * Build the initial system: derive the box geometry from the parameters,
 * create the atoms, set up thermo output and periodic boundaries, build the
 * first neighbour lists and finally initialise the CUDA-side atom data.
 *
 * Returns the time spent between initAtom() and the first buildNeighbor();
 * the EAM initialisation and initCudaAtom() are not included.
 */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats)
{
    if(param->force_field == FF_EAM) {
        initEam(eam, param);
    }

    /* Box geometry */
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    const double t_begin = getTimeStamp();

    /* Module initialisation */
    initAtom(atom);
    initNeighbor(neighbor, param);
    initPbc(atom);
    initStats(stats);
    setupNeighbor();

    /* Atoms, thermo state and periodic boundaries */
    createAtom(atom, param);
    setupThermo(param, atom->Natoms);
    adjustThermo(param, atom);
    setupPbc(atom, param);
    updatePbc(atom, param);

    buildNeighbor(atom, neighbor);

    const double t_end = getTimeStamp();

    /* Deliberately outside the timed region, as before. */
    initCudaAtom(atom, neighbor);

    return t_end - t_begin;
}
/*
 * Rebuild the neighbour lists: update the atoms for the periodic boundaries,
 * set up and refresh the periodic-boundary state, then build the lists.
 * The work is enclosed in the LIKWID region "reneighbour".
 *
 * Returns the elapsed time of the whole step.
 */
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor)
{
    const double t_begin = getTimeStamp();

    LIKWID_MARKER_START("reneighbour");
    updateAtomsPbc(atom, param);
    setupPbc(atom, param);
    updatePbc(atom, param);
    /* sortAtom(atom) is currently disabled. */
    buildNeighbor(atom, neighbor);
    LIKWID_MARKER_STOP("reneighbour");

    const double t_end = getTimeStamp();
    return t_end - t_begin;
}
/*
 * Host-side first integration step: for every local atom, advance the velocity
 * by dtforce * force and then the position by dt * (updated) velocity.
 */
void initialIntegrate(Parameter *param, Atom *atom)
{
    const int nlocal = atom->Nlocal;
    const MD_FLOAT dtforce = param->dtforce;
    const MD_FLOAT dt = param->dt;

    for(int idx = 0; idx < nlocal; ++idx) {
        /* Velocity update */
        atom_vx(idx) += dtforce * atom_fx(idx);
        atom_vy(idx) += dtforce * atom_fy(idx);
        atom_vz(idx) += dtforce * atom_fz(idx);

        /* Position update with the new velocity */
        atom_x(idx) += dt * atom_vx(idx);
        atom_y(idx) += dt * atom_vy(idx);
        atom_z(idx) += dt * atom_vz(idx);
    }
}
/*
 * Host-side second integration step: for every local atom, advance the
 * velocity by dtforce * force. Positions are not touched.
 */
void finalIntegrate(Parameter *param, Atom *atom)
{
    const int nlocal = atom->Nlocal;
    const MD_FLOAT dtforce = param->dtforce;

    for(int idx = 0; idx < nlocal; ++idx) {
        atom_vx(idx) += dtforce * atom_fx(idx);
        atom_vy(idx) += dtforce * atom_fy(idx);
        atom_vz(idx) += dtforce * atom_fz(idx);
    }
}
/*
 * Debug helper: print the atom counters of `atom` to stdout.
 * A per-atom position dump over all Nlocal + Nghost atoms used to follow the
 * counters; it is disabled.
 */
void printAtomState(Atom *atom)
{
    const int natoms = atom->Natoms;
    const int nlocal = atom->Nlocal;
    const int nghost = atom->Nghost;
    const int nmax = atom->Nmax;

    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
        natoms, nlocal, nghost, nmax);
}
/*
 * Translate a force-field name into its FF_* constant.
 *
 * string : name given on the command line; matched by prefix ("lj" or "eam").
 *          May be NULL, which happens when "-f" is the last argument and the
 *          caller passes argv[argc].
 *
 * Returns FF_LJ or FF_EAM, or -1 if the name is NULL or not recognised.
 */
int str2ff(const char *string)
{
    /* Guard first: strncmp() on a null pointer is undefined behaviour. */
    if(string == NULL) { return -1; }
    if(strncmp(string, "lj", 2) == 0) { return FF_LJ; }
    if(strncmp(string, "eam", 3) == 0) { return FF_EAM; }
    return -1;
}
/*
 * Translate an FF_* constant into its printable name.
 * Returns "invalid" for any value that is neither FF_LJ nor FF_EAM.
 */
const char* ff2str(int ff)
{
    switch(ff) {
        case FF_LJ:
            return "lj";
        case FF_EAM:
            return "eam";
        default:
            return "invalid";
    }
}
/*
 * Fetch the value that belongs to the option currently at argv[*i].
 *
 * On success *i is advanced onto the value and the value is returned.
 * If the option is the last command-line token (no value follows), an error
 * is printed to stderr and the program terminates with exit(-1); this avoids
 * handing the terminating NULL entry argv[argc] to strcmp/atoi/strdup.
 */
static char* nextArgValue(int argc, char** argv, int* i)
{
    if(*i + 1 >= argc) {
        fprintf(stderr, "Missing value for option %s!\n", argv[*i]);
        exit(-1);
    }
    return argv[++(*i)];
}

/*
 * Program entry point.
 *
 * Parses the command line into a Parameter struct, calls setup(), performs an
 * initial thermo output and force computation, then runs param.ntimes time
 * steps (integrate -> reneighbour or PBC update -> force -> integrate) and
 * finally prints timing and performance figures.
 *
 * Recognised options are listed by -h / --help. Unknown options are ignored.
 * Returns EXIT_SUCCESS; exits with -1 on an invalid force field or on an
 * option that is missing its value.
 */
int main(int argc, char** argv)
{
    double timer[NUMTIMER];
    Eam eam;
    Atom atom;
    Neighbor neighbor;
    Stats stats;
    Parameter param;

    LIKWID_MARKER_INIT;
#pragma omp parallel
    {
        LIKWID_MARKER_REGISTER("force");
        //LIKWID_MARKER_REGISTER("reneighbour");
        //LIKWID_MARKER_REGISTER("pbc");
    }

    init(&param);

    /* argv[0] is the program name, so option scanning starts at index 1. */
    for(int i = 1; i < argc; i++)
    {
        if((strcmp(argv[i], "-f") == 0))
        {
            if((param.force_field = str2ff(nextArgValue(argc, argv, &i))) < 0) {
                fprintf(stderr, "Invalid force field!\n");
                exit(-1);
            }
            continue;
        }
        if((strcmp(argv[i], "-i") == 0))
        {
            param.input_file = strdup(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0))
        {
            param.ntimes = atoi(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nx") == 0))
        {
            param.nx = atoi(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-ny") == 0))
        {
            param.ny = atoi(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nz") == 0))
        {
            param.nz = atoi(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--freq") == 0))
        {
            param.proc_freq = atof(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--vtk") == 0))
        {
            param.vtk_file = strdup(nextArgValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0))
        {
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-f <string>: force field (lj or eam), default lj\n");
            printf("-i <string>: input file for EAM\n");
            printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
            printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
            printf("--freq <real>: processor frequency (GHz)\n");
            printf("--vtk <string>: VTK file for visualization\n");
            printf(HLINE);
            exit(EXIT_SUCCESS);
        }
    }

    setup(&param, &eam, &atom, &neighbor, &stats);
    computeThermo(0, &param, &atom);

    /* Initial force computation before the time loop; its return value is not timed. */
    if(param.force_field == FF_EAM) {
        computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
    } else {
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
        computeForceTracing(&param, &atom, &neighbor, &stats, 1, 0);
#else
        computeForce(true, &param, &atom, &neighbor);
#endif
    }

    timer[FORCE] = 0.0;
    timer[NEIGH] = 0.0;
    timer[TOTAL] = getTimeStamp();

    if(param.vtk_file != NULL) {
        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
    }

    for(int n = 0; n < param.ntimes; n++) {
        /* True on every param.every-th step (1-based step count). */
        const bool doReneighbour = (n + 1) % param.every == 0;

        cuda_initial_integrate(doReneighbour, &param, &atom);

        if(doReneighbour) {
            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
        } else {
            updatePbc(&atom, &param);
        }

        /* The force routines return a value that is accumulated as force time. */
        if(param.force_field == FF_EAM) {
            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats, 0, n + 1);
        } else {
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
            timer[FORCE] += computeForceTracing(&param, &atom, &neighbor, &stats, 0, n + 1);
#else
            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor);
#endif
        }

        cuda_final_integrate(doReneighbour, &param, &atom);

        /* Thermo output every param.nstat steps, except on the very last step
           (the final state is reported once after the loop). */
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
            computeThermo(n + 1, &param, &atom);
        }

        if(param.vtk_file != NULL) {
            write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
        }
    }

    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    computeThermo(-1, &param, &atom);

    printf(HLINE);
    printf("Force field: %s\n", ff2str(param.force_field));
    printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
#if PRECISION == 1
    printf("Using single precision floating point.\n");
#else
    printf("Using double precision floating point.\n");
#endif
    printf(HLINE);
    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
    /* REST is whatever part of the total time is neither force nor neighbour time. */
    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
    printf(HLINE);
    printf("Performance: %.2f million atom updates per second\n",
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
#endif
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

Some files were not shown because too many files have changed in this diff Show More