Compare commits
3 Commits
main
...
gromacs_gp
Author | SHA1 | Date | |
---|---|---|---|
|
055a009dbd | ||
|
182c065fe2 | ||
|
ee3f6de050 |
176
.clang-format
176
.clang-format
@ -1,176 +0,0 @@
|
||||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: WebKit
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: DontAlign
|
||||
AlignArrayOfStructures: None
|
||||
AlignConsecutiveAssignments: Consecutive
|
||||
AlignConsecutiveBitFields: None
|
||||
AlignConsecutiveDeclarations: None
|
||||
AlignConsecutiveMacros: Consecutive
|
||||
AlignEscapedNewlines: Right
|
||||
AlignOperands: Align
|
||||
AlignTrailingComments: true
|
||||
AllowAllArgumentsOnNextLine: false
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortEnumsOnASingleLine: true
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: OnlyFirstIf
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: MultiLine
|
||||
AttributeMacros:
|
||||
- __capability
|
||||
BinPackArguments: false
|
||||
BinPackParameters: false
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: false
|
||||
AfterControlStatement: Never
|
||||
AfterEnum: false
|
||||
AfterFunction: true
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
BeforeLambdaBody: false
|
||||
BeforeWhile: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: WebKit
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakInheritanceList: BeforeColon
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeComma
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 90
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: false
|
||||
DeriveLineEnding: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
EmptyLineAfterAccessModifier: Never
|
||||
EmptyLineBeforeAccessModifier: LogicalBlock
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
BasedOnStyle: ''
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
AllowAllConstructorInitializersOnNextLine: true
|
||||
FixNamespaceComments: false
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IfMacros:
|
||||
- KJ_IF_MAYBE
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
CaseSensitive: false
|
||||
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
CaseSensitive: false
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
CaseSensitive: false
|
||||
IncludeIsMainRegex: '(Test)?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentAccessModifiers: false
|
||||
IndentCaseLabels: false
|
||||
IndentCaseBlocks: false
|
||||
IndentGotoLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentExternBlock: AfterExternBlock
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
InsertTrailingCommas: None
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: true
|
||||
LambdaBodyIndentation: Signature
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: Inner
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 4
|
||||
ObjCBreakBeforeNestedBlockParam: true
|
||||
ObjCSpaceAfterProperty: true
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakAssignment: 200
|
||||
PenaltyBreakBeforeFirstCallParameter: 19
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 60
|
||||
PenaltyIndentedWhitespace: 0
|
||||
PointerAlignment: Left
|
||||
PPIndentWidth: -1
|
||||
ReferenceAlignment: Pointer
|
||||
ReflowComments: true
|
||||
ShortNamespaceLines: 1
|
||||
SortIncludes: CaseSensitive
|
||||
SortJavaStaticImport: Before
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCaseColon: false
|
||||
SpaceBeforeCpp11BracedList: true
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceAroundPointerQualifiers: Default
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyBlock: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: Never
|
||||
SpacesInConditionalStatement: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInLineCommentPrefix:
|
||||
Minimum: 1
|
||||
Maximum: -1
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpaceBeforeSquareBrackets: false
|
||||
BitFieldColonSpacing: Both
|
||||
Standard: Latest
|
||||
StatementAttributeLikeMacros:
|
||||
- Q_EMIT
|
||||
StatementMacros:
|
||||
- Q_UNUSED
|
||||
- QT_REQUIRE_VERSION
|
||||
TabWidth: 8
|
||||
UseCRLF: false
|
||||
UseTab: Never
|
||||
WhitespaceSensitiveMacros:
|
||||
- STRINGIZE
|
||||
- PP_STRINGIZE
|
||||
- BOOST_PP_STRINGIZE
|
||||
- NS_SWIFT_NAME
|
||||
- CF_SWIFT_NAME
|
||||
...
|
14
.clang-tidy
14
.clang-tidy
@ -1,14 +0,0 @@
|
||||
---
|
||||
Checks: 'clang-diagnostic-*,clang-analyzer-*,clang-bugprone-*,readability-identifier-naming'
|
||||
WarningsAsErrors: true
|
||||
HeaderFilterRegex: '.*'
|
||||
AnalyzeTemporaryDtors: false
|
||||
CheckOptions:
|
||||
- key: readability-identifier-naming.StructCase
|
||||
value: 'CamelCase'
|
||||
- key: readability-identifier-naming.FunctionCase
|
||||
value: 'camelBack'
|
||||
- key: readability-identifier-naming.VariableCase
|
||||
value: 'camelBack'
|
||||
- key: readability-identifier-naming.GlobalConstantCase
|
||||
value: 'UPPER_CASE'
|
3
.clangd
3
.clangd
@ -1,3 +0,0 @@
|
||||
CompileFlags:
|
||||
Add: [-I/Users/jan/prg/MD-Bench/src/verletlist/, -I/Users/jan/prg/MD-Bench/src/common/, -DALIGNMENT=64]
|
||||
Compiler: clang
|
23
.gitignore
vendored
23
.gitignore
vendored
@ -51,17 +51,14 @@ Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# TODO list
|
||||
todo.txt
|
||||
|
||||
# Build directories and executables
|
||||
#GCC-*/
|
||||
#ICC-*/
|
||||
#ICX-*/
|
||||
#CLANG-*/
|
||||
#NVCC-*/
|
||||
build-*/
|
||||
MDBench-*
|
||||
GCC/
|
||||
ICC/
|
||||
ICX/
|
||||
CLANG/
|
||||
NVCC/
|
||||
MDBench-GCC*
|
||||
MDBench-ICC*
|
||||
MDBench-ICX*
|
||||
MDBench-CLANG*
|
||||
MDBench-NVCC*
|
||||
|
133
Makefile
133
Makefile
@ -1,32 +1,119 @@
|
||||
#CONFIGURE BUILD SYSTEM
|
||||
TAG = $(OPT_TAG)-$(TOOLCHAIN)-$(DATA_TYPE)
|
||||
TARGET = MDBench-$(TAG)
|
||||
BUILD_DIR = ./build/build-$(TAG)
|
||||
SRC_ROOT = ./src
|
||||
SRC_DIR = $(SRC_ROOT)/$(OPT_SCHEME)
|
||||
COMMON_DIR = $(SRC_ROOT)/common
|
||||
CUDA_DIR = $(SRC_DIR)/cuda
|
||||
MAKE_DIR = ./make
|
||||
TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
|
||||
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
|
||||
SRC_DIR = ./$(OPT_SCHEME)
|
||||
ASM_DIR = ./asm
|
||||
COMMON_DIR = ./common
|
||||
CUDA_DIR = ./$(SRC_DIR)/cuda
|
||||
MAKE_DIR = ./
|
||||
Q ?= @
|
||||
|
||||
#DO NOT EDIT BELOW
|
||||
include config.mk
|
||||
include $(MAKE_DIR)/include_$(TOOLCHAIN).mk
|
||||
include $(MAKE_DIR)/config.mk
|
||||
include $(MAKE_DIR)/include_$(TAG).mk
|
||||
include $(MAKE_DIR)/include_LIKWID.mk
|
||||
ifneq ($(strip $(ISA)),NONE)
|
||||
include $(MAKE_DIR)/include_ISA.mk
|
||||
endif
|
||||
INCLUDES += -I./$(SRC_DIR) -I./$(COMMON_DIR)
|
||||
include $(MAKE_DIR)/include_GROMACS.mk
|
||||
INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes
|
||||
|
||||
VPATH = $(SRC_DIR) $(COMMON_DIR) $(CUDA_DIR)
|
||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
||||
DEFINES += -DAOS
|
||||
endif
|
||||
ifeq ($(strip $(DATA_TYPE)),SP)
|
||||
DEFINES += -DPRECISION=1
|
||||
else
|
||||
DEFINES += -DPRECISION=2
|
||||
endif
|
||||
|
||||
ifneq ($(ASM_SYNTAX), ATT)
|
||||
ASFLAGS += -masm=intel
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(EXPLICIT_TYPES)),true)
|
||||
DEFINES += -DEXPLICIT_TYPES
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(MEM_TRACER)),true)
|
||||
DEFINES += -DMEM_TRACER
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(INDEX_TRACER)),true)
|
||||
DEFINES += -DINDEX_TRACER
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(COMPUTE_STATS)),true)
|
||||
DEFINES += -DCOMPUTE_STATS
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(XTC_OUTPUT)),true)
|
||||
DEFINES += -DXTC_OUTPUT
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
|
||||
DEFINES += -DUSE_REFERENCE_VERSION
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
|
||||
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(DEBUG)),true)
|
||||
DEFINES += -DDEBUG
|
||||
endif
|
||||
|
||||
ifneq ($(VECTOR_WIDTH),)
|
||||
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__SIMD_KERNEL__)),true)
|
||||
DEFINES += -D__SIMD_KERNEL__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__SSE__)),true)
|
||||
DEFINES += -D__ISA_SSE__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX__)),true)
|
||||
DEFINES += -D__ISA_AVX__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
|
||||
DEFINES += -D__ISA_AVX_FMA__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX2__)),true)
|
||||
DEFINES += -D__ISA_AVX2__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX512__)),true)
|
||||
DEFINES += -D__ISA_AVX512__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
|
||||
DEFINES += -DENABLE_OMP_SIMD
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(USE_SIMD_KERNEL)),true)
|
||||
DEFINES += -DUSE_SIMD_KERNEL
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
|
||||
DEFINES += -DUSE_SUPER_CLUSTERS
|
||||
endif
|
||||
|
||||
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
|
||||
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
||||
OBJ = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
||||
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(COMMON_DIR)/*.c))
|
||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
||||
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
||||
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
|
||||
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%-common.o,$(wildcard $(COMMON_DIR)/*.c))
|
||||
ifeq ($(strip $(TAG)),NVCC)
|
||||
OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
|
||||
endif
|
||||
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
|
||||
|
||||
# $(warning $(OBJ))
|
||||
|
||||
ifneq ($(VARIANT),)
|
||||
.DEFAULT_GOAL := ${TARGET}-$(VARIANT)
|
||||
DEFINES += -DVARIANT=$(VARIANT)
|
||||
@ -45,6 +132,11 @@ $(BUILD_DIR)/%.o: %.c
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
$(BUILD_DIR)/%-common.o: $(COMMON_DIR)/%.c
|
||||
$(info ===> COMPILE $@)
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
$(BUILD_DIR)/%-cuda.o: %.cu
|
||||
$(info ===> COMPILE $@)
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
@ -63,16 +155,11 @@ $(BUILD_DIR)/%.o: %.s
|
||||
clean:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf $(BUILD_DIR)
|
||||
|
||||
cleanall:
|
||||
$(info ===> CLEAN)
|
||||
@rm -rf build
|
||||
@rm -rf MDBench-*
|
||||
@rm -f tags
|
||||
|
||||
distclean: clean
|
||||
$(info ===> DIST CLEAN)
|
||||
@rm -f $(TARGET)
|
||||
@rm -f $(TARGET)*
|
||||
@rm -f tags
|
||||
|
||||
info:
|
||||
@ -86,6 +173,6 @@ tags:
|
||||
$(Q)ctags -R
|
||||
|
||||
$(BUILD_DIR):
|
||||
@mkdir -p $(BUILD_DIR)
|
||||
@mkdir $(BUILD_DIR)
|
||||
|
||||
-include $(OBJ:.o=.d)
|
||||
|
54
README.md
54
README.md
@ -1,14 +1,34 @@
|
||||
# MD-Bench
|
||||
|
||||
MD-Bench is a toolbox for the performance engineering of short-range force
|
||||
calculation kernels in molecular-dynamics applications. It aims to cover all
|
||||
available state-of-the-art algorithms from different community codes such as
|
||||
LAMMPS and GROMACS.
|
||||

|
||||
|
||||
MD-Bench is a toolbox for the performance engineering of short-range force calculation kernels in molecular-dynamics applications.
|
||||
It aims to cover all available state-of-the-art algorithms from different community codes such as LAMMPS and GROMACS.
|
||||
|
||||
In addition, MD-Bench offers tools to study and evaluate the in-depth performance of such kernels on different hardware. These include gather-bench, a standalone benchmark that mimics the data movement of MD kernels, and the stubbed force calculation cases, which isolate the impact of memory latencies and control flow divergence on the overall performance.
|
||||
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Verlet Lists</th>
|
||||
<th>GROMACS MxN</th>
|
||||
<th>Stubbed cases</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><a target="_blank" rel="noopener noreferrer" href="figures/verlet_v2.png"><img src="figures/verlet_v2.png" alt="Image" title="Verlet Lists" style="width: 100%;"></a></td>
|
||||
<td><a target="_blank" rel="noopener noreferrer" href="figures/gromacs_mxn_v2.png"><img src="figures/gromacs_mxn_v2.png" alt="Image" title="GROMACS MxN" style="width: 90%;"></a></td>
|
||||
<td><a target="_blank" rel="noopener noreferrer" href="figures/stub_new_v3.png"><img src="figures/stub_new_v3.png" alt="Image" title="Stubbed cases" style="width: 100%;"></a></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!--  -->
|
||||
|
||||
## Build instructions
|
||||
|
||||
Configure your build by editing the `config.mk` file. The following
|
||||
options are available:
|
||||
Configure your build by editing the `config.mk` file. The following options are available:
|
||||
|
||||
- **TAG:** Compiler tag (available options: GCC, CLANG, ICC, ONEAPI, NVCC).
|
||||
- **ISA:** Instruction set (available options: SSE, AVX, AVX\_FMA, AVX2, AVX512).
|
||||
@ -25,18 +45,15 @@ options are available:
|
||||
- **COMPUTE\_STATS:** Compute statistics.
|
||||
|
||||
Configurations for LAMMPS Verlet Lists optimization scheme:
|
||||
|
||||
- **ENABLE\_OMP\_SIMD:** Use omp simd pragma on half neighbor-lists kernels.
|
||||
- **USE\_SIMD\_KERNEL:** Compile kernel with explicit SIMD intrinsics.
|
||||
|
||||
Configurations for GROMACS MxN optimization scheme:
|
||||
|
||||
- **USE\_REFERENCE\_VERSION:** Use reference version (only for correctness-checking purposes).
|
||||
- **XTC\_OUTPUT:** Enable XTC output.
|
||||
- **HALF\_NEIGHBOR\_LISTS\_CHECK\_CJ:** Check if j-clusters are local when decreasing the reaction force.
|
||||
|
||||
Configurations for CUDA:
|
||||
|
||||
- **USE\_CUDA\_HOST\_MEMORY:** Use CUDA host memory to optimize host-device transfers.
|
||||
|
||||
When done, just use `make` to compile the code.
|
||||
@ -51,14 +68,11 @@ Use the following command to run a simulation:
|
||||
./MDBench-<TAG>-<OPT_SCHEME> [OPTION]...
|
||||
```
|
||||
|
||||
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same
|
||||
name. Without any options, a Copper FCC lattice system with size 32x32x32
|
||||
(131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0,
|
||||
epsilon=1.0) is simulated.
|
||||
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same name.
|
||||
Without any options, a Copper FCC lattice system with size 32x32x32 (131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0, epsilon=1.0) is simulated.
|
||||
|
||||
The default behavior and other options can be changed using the following parameters:
|
||||
|
||||
```sh
|
||||
```
|
||||
-p <string>: file to read parameters from (can be specified more than once)
|
||||
-f <string>: force field (lj or eam), default lj
|
||||
-i <string>: input file with atom positions (dump)
|
||||
@ -78,17 +92,11 @@ TBD
|
||||
|
||||
## Citations
|
||||
|
||||
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard
|
||||
Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular
|
||||
dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th
|
||||
International Conference on Parallel Processing and Applied Mathematics, Gdansk,
|
||||
Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint:
|
||||
[arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
|
||||
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th International Conference on Parallel Processing and Applied Mathematics, Gdansk, Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint: [arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
|
||||
|
||||
## Credits
|
||||
|
||||
MD-Bench is developed by the Erlangen National High Performance Computing Center
|
||||
([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
|
||||
MD-Bench is developed by the Erlangen National High Performance Computing Center ([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
|
||||
|
||||
## License
|
||||
|
||||
|
0
asm/.gitkeep
Normal file
0
asm/.gitkeep
Normal file
626
asm/unused/force-mem-only-with-likwid.s
Normal file
626
asm/unused/force-mem-only-with-likwid.s
Normal file
@ -0,0 +1,626 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
|
||||
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
|
||||
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
|
||||
# mark_description "ICC/force.s";
|
||||
.file "force.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
.L_2__routine_start_computeForce_0:
|
||||
# -- Begin computeForce
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
|
||||
computeForce:
|
||||
# parameter 1: %rdi
|
||||
# parameter 2: %rsi
|
||||
# parameter 3: %rdx
|
||||
# parameter 4: %ecx
|
||||
# parameter 5: %r8d
|
||||
# parameter 6: %r9d
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_computeForce.1:
|
||||
..L2:
|
||||
#121.112
|
||||
pushq %rbp #121.112
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #121.112
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
andq $-64, %rsp #121.112
|
||||
pushq %r12 #121.112
|
||||
pushq %r13 #121.112
|
||||
pushq %r14 #121.112
|
||||
pushq %r15 #121.112
|
||||
pushq %rbx #121.112
|
||||
subq $88, %rsp #121.112
|
||||
xorl %eax, %eax #124.16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
|
||||
movq %rdx, %r15 #121.112
|
||||
movq %rsi, %r12 #121.112
|
||||
movq %rdi, %rbx #121.112
|
||||
..___tag_value_computeForce.11:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #124.16
|
||||
..___tag_value_computeForce.12:
|
||||
# LOE rbx r12 r15 xmm0
|
||||
..B1.51: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
vmovsd %xmm0, 24(%rsp) #124.16[spill]
|
||||
# LOE rbx r12 r15
|
||||
..B1.2: # Preds ..B1.51
|
||||
# Execution count [1.00e+00]
|
||||
movl 4(%r12), %r13d #125.18
|
||||
movq 64(%r12), %r9 #127.20
|
||||
movq 72(%r12), %r14 #127.45
|
||||
movq 80(%r12), %r8 #127.70
|
||||
vmovsd 72(%rbx), %xmm2 #129.27
|
||||
vmovsd 8(%rbx), %xmm1 #130.23
|
||||
vmovsd (%rbx), %xmm0 #131.24
|
||||
testl %r13d, %r13d #134.24
|
||||
jle ..B1.43 # Prob 50% #134.24
|
||||
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.3: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
|
||||
xorl %ebx, %ebx #134.5
|
||||
movl %r13d, %edx #134.5
|
||||
xorl %ecx, %ecx #134.5
|
||||
movl $1, %esi #134.5
|
||||
xorl %eax, %eax #135.17
|
||||
shrl $1, %edx #134.5
|
||||
je ..B1.7 # Prob 9% #134.5
|
||||
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.5: # Preds ..B1.3 ..B1.5
|
||||
# Execution count [2.50e+00]
|
||||
movq %rax, (%rcx,%r9) #135.9
|
||||
incq %rbx #134.5
|
||||
movq %rax, (%rcx,%r14) #136.9
|
||||
movq %rax, (%rcx,%r8) #137.9
|
||||
movq %rax, 8(%rcx,%r9) #135.9
|
||||
movq %rax, 8(%rcx,%r14) #136.9
|
||||
movq %rax, 8(%rcx,%r8) #137.9
|
||||
addq $16, %rcx #134.5
|
||||
cmpq %rdx, %rbx #134.5
|
||||
jb ..B1.5 # Prob 63% #134.5
|
||||
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [9.00e-01]
|
||||
lea 1(%rbx,%rbx), %esi #135.9
|
||||
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.7: # Preds ..B1.3 ..B1.6
|
||||
# Execution count [1.00e+00]
|
||||
lea -1(%rsi), %edx #134.5
|
||||
cmpl %r13d, %edx #134.5
|
||||
jae ..B1.9 # Prob 9% #134.5
|
||||
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
|
||||
..B1.8: # Preds ..B1.7
|
||||
# Execution count [9.00e-01]
|
||||
movslq %esi, %rsi #134.5
|
||||
movq %rax, -8(%r9,%rsi,8) #135.9
|
||||
movq %rax, -8(%r14,%rsi,8) #136.9
|
||||
movq %rax, -8(%r8,%rsi,8) #137.9
|
||||
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
|
||||
..B1.9: # Preds ..B1.7 ..B1.8
|
||||
# Execution count [5.00e-01]
|
||||
movl $.L_2__STRING.0, %edi #141.5
|
||||
movq %r8, 32(%rsp) #141.5[spill]
|
||||
movq %r9, 80(%rsp) #141.5[spill]
|
||||
vmovsd %xmm2, (%rsp) #141.5[spill]
|
||||
vmovsd %xmm1, 8(%rsp) #141.5[spill]
|
||||
vmovsd %xmm0, 16(%rsp) #141.5[spill]
|
||||
..___tag_value_computeForce.18:
|
||||
# likwid_markerStartRegion(const char *)
|
||||
call likwid_markerStartRegion #141.5
|
||||
..___tag_value_computeForce.19:
|
||||
# LOE r12 r14 r15 r13d
|
||||
..B1.10: # Preds ..B1.9
|
||||
# Execution count [9.00e-01]
|
||||
vmovsd 16(%rsp), %xmm0 #[spill]
|
||||
xorl %esi, %esi #143.15
|
||||
vmovsd (%rsp), %xmm2 #[spill]
|
||||
xorl %eax, %eax #143.5
|
||||
vmulsd %xmm2, %xmm2, %xmm13 #129.45
|
||||
xorl %edi, %edi #143.5
|
||||
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
|
||||
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
|
||||
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
|
||||
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
|
||||
vmovsd 8(%rsp), %xmm1 #[spill]
|
||||
vbroadcastsd %xmm13, %zmm14 #129.25
|
||||
vbroadcastsd %xmm1, %zmm13 #130.21
|
||||
vbroadcastsd %xmm0, %zmm9 #197.45
|
||||
movslq %r13d, %r13 #143.5
|
||||
movq 24(%r15), %r10 #145.25
|
||||
movslq 16(%r15), %rdx #144.43
|
||||
movq 8(%r15), %rcx #144.19
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq 16(%r12), %rbx #146.25
|
||||
shlq $2, %rdx #126.5
|
||||
movq %r13, 64(%rsp) #143.5[spill]
|
||||
movq %r10, 72(%rsp) #143.5[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.11: # Preds ..B1.41 ..B1.10
|
||||
# Execution count [5.00e+00]
|
||||
movq 72(%rsp), %r9 #145.25[spill]
|
||||
vxorpd %xmm24, %xmm24, %xmm24 #149.22
|
||||
vmovapd %xmm24, %xmm18 #150.22
|
||||
movl (%r9,%rax,4), %r10d #145.25
|
||||
vmovapd %xmm18, %xmm4 #151.22
|
||||
vmovsd (%rdi,%rbx), %xmm10 #146.25
|
||||
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
|
||||
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
|
||||
testl %r10d, %r10d #173.32
|
||||
jle ..B1.41 # Prob 50% #173.32
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
|
||||
vpxord %zmm8, %zmm8, %zmm8 #149.22
|
||||
vmovaps %zmm8, %zmm7 #150.22
|
||||
vmovaps %zmm7, %zmm11 #151.22
|
||||
cmpl $8, %r10d #173.13
|
||||
jl ..B1.48 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.13: # Preds ..B1.12
|
||||
# Execution count [4.50e+00]
|
||||
cmpl $1200, %r10d #173.13
|
||||
jl ..B1.47 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.14: # Preds ..B1.13
|
||||
# Execution count [4.50e+00]
|
||||
movq %rdx, %r15 #144.43
|
||||
imulq %rsi, %r15 #144.43
|
||||
addq %rcx, %r15 #126.5
|
||||
movq %r15, %r11 #173.13
|
||||
andq $63, %r11 #173.13
|
||||
testl $3, %r11d #173.13
|
||||
je ..B1.16 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.15: # Preds ..B1.14
|
||||
# Execution count [2.25e+00]
|
||||
xorl %r11d, %r11d #173.13
|
||||
jmp ..B1.18 # Prob 100% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.16: # Preds ..B1.14
|
||||
# Execution count [2.25e+00]
|
||||
testl %r11d, %r11d #173.13
|
||||
je ..B1.18 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.17: # Preds ..B1.16
|
||||
# Execution count [2.50e+01]
|
||||
negl %r11d #173.13
|
||||
addl $64, %r11d #173.13
|
||||
shrl $2, %r11d #173.13
|
||||
cmpl %r11d, %r10d #173.13
|
||||
cmovl %r10d, %r11d #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
|
||||
# Execution count [5.00e+00]
|
||||
movl %r10d, %r13d #173.13
|
||||
subl %r11d, %r13d #173.13
|
||||
andl $7, %r13d #173.13
|
||||
negl %r13d #173.13
|
||||
addl %r10d, %r13d #173.13
|
||||
cmpl $1, %r11d #173.13
|
||||
jb ..B1.26 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.19: # Preds ..B1.18
|
||||
# Execution count [4.50e+00]
|
||||
vmovdqa %ymm15, %ymm4 #173.13
|
||||
xorl %r12d, %r12d #173.13
|
||||
vpbroadcastd %r11d, %ymm3 #173.13
|
||||
vbroadcastsd %xmm10, %zmm2 #146.23
|
||||
vbroadcastsd %xmm6, %zmm1 #147.23
|
||||
vbroadcastsd %xmm12, %zmm0 #148.23
|
||||
movslq %r11d, %r9 #173.13
|
||||
movq %r8, 32(%rsp) #173.13[spill]
|
||||
movq %r14, (%rsp) #173.13[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.20: # Preds ..B1.24 ..B1.19
|
||||
# Execution count [2.50e+01]
|
||||
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
|
||||
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
|
||||
kmovw %k3, %r14d #173.13
|
||||
vpaddd %ymm17, %ymm17, %ymm18 #175.40
|
||||
vpaddd %ymm18, %ymm17, %ymm17 #175.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.23: # Preds ..B1.20
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #175.40
|
||||
kmovw %k3, %k2 #175.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #175.40
|
||||
vpxord %zmm19, %zmm19, %zmm19 #175.40
|
||||
vpxord %zmm20, %zmm20, %zmm20 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [2.50e+01]
|
||||
addq $8, %r12 #173.13
|
||||
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
|
||||
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
|
||||
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
|
||||
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
|
||||
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
|
||||
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
|
||||
#vrcp14pd %zmm25, %zmm24 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm24, %k0 #195.42
|
||||
#kmovw %k2, %r8d #194.26
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmovaps %zmm25, %zmm17 #195.42
|
||||
#andl %r8d, %r14d #194.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
|
||||
#kmovw %r14d, %k3 #198.21
|
||||
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
|
||||
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
|
||||
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
|
||||
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
|
||||
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
|
||||
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
|
||||
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
|
||||
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
|
||||
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
|
||||
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
|
||||
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
|
||||
cmpq %r9, %r12 #173.13
|
||||
jb ..B1.20 # Prob 82% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.25: # Preds ..B1.24
|
||||
# Execution count [4.50e+00]
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq (%rsp), %r14 #[spill]
|
||||
cmpl %r11d, %r10d #173.13
|
||||
je ..B1.40 # Prob 10% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
|
||||
# Execution count [2.50e+01]
|
||||
lea 8(%r11), %r9d #173.13
|
||||
cmpl %r9d, %r13d #173.13
|
||||
jl ..B1.34 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.27: # Preds ..B1.26
|
||||
# Execution count [4.50e+00]
|
||||
movq %rdx, %r12 #144.43
|
||||
imulq %rsi, %r12 #144.43
|
||||
vbroadcastsd %xmm10, %zmm1 #146.23
|
||||
vbroadcastsd %xmm6, %zmm0 #147.23
|
||||
vbroadcastsd %xmm12, %zmm2 #148.23
|
||||
movslq %r11d, %r9 #173.13
|
||||
addq %rcx, %r12 #126.5
|
||||
movq %rdi, 8(%rsp) #126.5[spill]
|
||||
movq %rdx, 16(%rsp) #126.5[spill]
|
||||
movq %rcx, 40(%rsp) #126.5[spill]
|
||||
movq %rax, 48(%rsp) #126.5[spill]
|
||||
movq %rsi, 56(%rsp) #126.5[spill]
|
||||
movq %r8, 32(%rsp) #126.5[spill]
|
||||
movq %r14, (%rsp) #126.5[spill]
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.28: # Preds ..B1.32 ..B1.27
|
||||
# Execution count [2.50e+01]
|
||||
vmovdqu (%r12,%r9,4), %ymm3 #174.25
|
||||
vpaddd %ymm3, %ymm3, %ymm4 #175.40
|
||||
vpaddd %ymm4, %ymm3, %ymm3 #175.40
|
||||
movl (%r12,%r9,4), %r14d #174.25
|
||||
movl 4(%r12,%r9,4), %r8d #174.25
|
||||
movl 8(%r12,%r9,4), %edi #174.25
|
||||
movl 12(%r12,%r9,4), %esi #174.25
|
||||
lea (%r14,%r14,2), %r14d #175.40
|
||||
movl 16(%r12,%r9,4), %ecx #174.25
|
||||
lea (%r8,%r8,2), %r8d #175.40
|
||||
movl 20(%r12,%r9,4), %edx #174.25
|
||||
lea (%rdi,%rdi,2), %edi #175.40
|
||||
movl 24(%r12,%r9,4), %eax #174.25
|
||||
lea (%rsi,%rsi,2), %esi #175.40
|
||||
movl 28(%r12,%r9,4), %r15d #174.25
|
||||
lea (%rcx,%rcx,2), %ecx #175.40
|
||||
lea (%rdx,%rdx,2), %edx #175.40
|
||||
lea (%rax,%rax,2), %eax #175.40
|
||||
lea (%r15,%r15,2), %r15d #175.40
|
||||
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.31: # Preds ..B1.28
|
||||
# Execution count [1.25e+01]
|
||||
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
|
||||
vpxord %zmm4, %zmm4, %zmm4 #175.40
|
||||
vpxord %zmm17, %zmm17, %zmm17 #175.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
|
||||
..B1.32: # Preds ..B1.31
|
||||
# Execution count [2.50e+01]
|
||||
addl $8, %r11d #173.13
|
||||
addq $8, %r9 #173.13
|
||||
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
|
||||
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
|
||||
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
|
||||
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
|
||||
#vrcp14pd %zmm3, %zmm22 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm22, %k0 #195.42
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
|
||||
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
|
||||
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
|
||||
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
|
||||
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
|
||||
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
|
||||
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
|
||||
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
|
||||
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
|
||||
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
|
||||
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
|
||||
cmpl %r13d, %r11d #173.13
|
||||
jb ..B1.28 # Prob 82% #173.13
|
||||
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.33: # Preds ..B1.32
|
||||
# Execution count [4.50e+00]
|
||||
movq 8(%rsp), %rdi #[spill]
|
||||
movq 16(%rsp), %rdx #[spill]
|
||||
movq 40(%rsp), %rcx #[spill]
|
||||
movq 48(%rsp), %rax #[spill]
|
||||
movq 56(%rsp), %rsi #[spill]
|
||||
movq 32(%rsp), %r8 #[spill]
|
||||
movq (%rsp), %r14 #[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
|
||||
# Execution count [5.00e+00]
|
||||
lea 1(%r13), %r9d #173.13
|
||||
cmpl %r10d, %r9d #173.13
|
||||
ja ..B1.40 # Prob 50% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.35: # Preds ..B1.34
|
||||
# Execution count [2.50e+01]
|
||||
imulq %rdx, %rsi #144.43
|
||||
vbroadcastsd %xmm10, %zmm4 #146.23
|
||||
subl %r13d, %r10d #173.13
|
||||
addq %rcx, %rsi #126.5
|
||||
vpbroadcastd %r10d, %ymm0 #173.13
|
||||
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
|
||||
movslq %r13d, %r13 #173.13
|
||||
kmovw %k3, %r9d #173.13
|
||||
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
|
||||
vpaddd %ymm1, %ymm1, %ymm2 #175.40
|
||||
vpaddd %ymm2, %ymm1, %ymm0 #175.40
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.38: # Preds ..B1.35
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #175.40
|
||||
kmovw %k3, %k2 #175.40
|
||||
vpxord %zmm1, %zmm1, %zmm1 #175.40
|
||||
vpxord %zmm2, %zmm2, %zmm2 #175.40
|
||||
vpxord %zmm3, %zmm3, %zmm3 #175.40
|
||||
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
|
||||
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
|
||||
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.39: # Preds ..B1.38
|
||||
# Execution count [2.50e+01]
|
||||
#vbroadcastsd %xmm6, %zmm6 #147.23
|
||||
#vbroadcastsd %xmm12, %zmm12 #148.23
|
||||
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
|
||||
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
|
||||
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
|
||||
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
|
||||
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
|
||||
#vrcp14pd %zmm19, %zmm18 #195.42
|
||||
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
|
||||
#vfpclasspd $30, %zmm18, %k0 #195.42
|
||||
#kmovw %k2, %esi #194.26
|
||||
#knotw %k0, %k1 #195.42
|
||||
#vmovaps %zmm19, %zmm0 #195.42
|
||||
#andl %esi, %r9d #194.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
|
||||
#kmovw %r9d, %k3 #198.21
|
||||
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
|
||||
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
|
||||
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
|
||||
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
|
||||
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
|
||||
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
|
||||
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
|
||||
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
|
||||
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
|
||||
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
|
||||
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
|
||||
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
|
||||
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
|
||||
# Execution count [4.50e+00]
|
||||
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
|
||||
vpermd %zmm11, %zmm19, %zmm0 #151.22
|
||||
vpermd %zmm7, %zmm19, %zmm6 #150.22
|
||||
vpermd %zmm8, %zmm19, %zmm20 #149.22
|
||||
vaddpd %zmm11, %zmm0, %zmm11 #151.22
|
||||
vaddpd %zmm7, %zmm6, %zmm7 #150.22
|
||||
vaddpd %zmm8, %zmm20, %zmm8 #149.22
|
||||
vpermpd $78, %zmm11, %zmm1 #151.22
|
||||
vpermpd $78, %zmm7, %zmm10 #150.22
|
||||
vpermpd $78, %zmm8, %zmm21 #149.22
|
||||
vaddpd %zmm1, %zmm11, %zmm2 #151.22
|
||||
vaddpd %zmm10, %zmm7, %zmm12 #150.22
|
||||
vaddpd %zmm21, %zmm8, %zmm22 #149.22
|
||||
vpermpd $177, %zmm2, %zmm3 #151.22
|
||||
vpermpd $177, %zmm12, %zmm17 #150.22
|
||||
vpermpd $177, %zmm22, %zmm23 #149.22
|
||||
vaddpd %zmm3, %zmm2, %zmm4 #151.22
|
||||
vaddpd %zmm17, %zmm12, %zmm18 #150.22
|
||||
vaddpd %zmm23, %zmm22, %zmm24 #149.22
|
||||
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.41: # Preds ..B1.40 ..B1.11
|
||||
# Execution count [5.00e+00]
|
||||
movq 80(%rsp), %rsi #208.9[spill]
|
||||
addq $24, %rdi #143.5
|
||||
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
|
||||
vmovsd %xmm0, (%rsi,%rax,8) #208.9
|
||||
movslq %eax, %rsi #143.32
|
||||
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
|
||||
vmovsd %xmm1, (%r14,%rax,8) #209.9
|
||||
incq %rsi #143.32
|
||||
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
|
||||
vmovsd %xmm2, (%r8,%rax,8) #210.9
|
||||
incq %rax #143.5
|
||||
cmpq 64(%rsp), %rax #143.5[spill]
|
||||
jb ..B1.11 # Prob 82% #143.5
|
||||
jmp ..B1.44 # Prob 100% #143.5
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.43: # Preds ..B1.2
|
||||
# Execution count [5.00e-01]
|
||||
movl $.L_2__STRING.0, %edi #141.5
|
||||
..___tag_value_computeForce.48:
|
||||
# likwid_markerStartRegion(const char *)
|
||||
call likwid_markerStartRegion #141.5
|
||||
..___tag_value_computeForce.49:
|
||||
# LOE
|
||||
..B1.44: # Preds ..B1.41 ..B1.43
|
||||
# Execution count [1.00e+00]
|
||||
movl $.L_2__STRING.0, %edi #219.5
|
||||
vzeroupper #219.5
|
||||
..___tag_value_computeForce.50:
|
||||
# likwid_markerStopRegion(const char *)
|
||||
call likwid_markerStopRegion #219.5
|
||||
..___tag_value_computeForce.51:
|
||||
# LOE
|
||||
..B1.45: # Preds ..B1.44
|
||||
# Execution count [1.00e+00]
|
||||
xorl %eax, %eax #221.16
|
||||
..___tag_value_computeForce.52:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #221.16
|
||||
..___tag_value_computeForce.53:
|
||||
# LOE xmm0
|
||||
..B1.46: # Preds ..B1.45
|
||||
# Execution count [1.00e+00]
|
||||
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
|
||||
addq $88, %rsp #224.14
|
||||
.cfi_restore 3
|
||||
popq %rbx #224.14
|
||||
.cfi_restore 15
|
||||
popq %r15 #224.14
|
||||
.cfi_restore 14
|
||||
popq %r14 #224.14
|
||||
.cfi_restore 13
|
||||
popq %r13 #224.14
|
||||
.cfi_restore 12
|
||||
popq %r12 #224.14
|
||||
movq %rbp, %rsp #224.14
|
||||
popq %rbp #224.14
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #224.14
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_offset 6, -16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE
|
||||
..B1.47: # Preds ..B1.13
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
movl %r10d, %r13d #173.13
|
||||
xorl %r11d, %r11d #173.13
|
||||
andl $-8, %r13d #173.13
|
||||
jmp ..B1.26 # Prob 100% #173.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.48: # Preds ..B1.12
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r13d, %r13d #173.13
|
||||
jmp ..B1.34 # Prob 100% #173.13
|
||||
.align 16,0x90
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
# 64-byte constant: eight IEEE-754 doubles, each 1.0.
# Each double is written as two little-endian dwords (low 0x00000000, high 0x3ff00000).
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
# 64-byte constant: eight IEEE-754 doubles, each 0.5.
# Each double is written as two little-endian dwords (low 0x00000000, high 0x3fe00000).
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
# 64-byte table. Read as bytes (little-endian dwords), the first 24 bytes are the
# single-bit values 0x01, 0x02, 0x04, ..., 0x80, each repeated three times in a row;
# the remaining 40 bytes are zero.
# NOTE(review): no use of this table is visible in this part of the file; it looks
# like a per-lane bit-select table for 3-component data -- confirm against the caller.
.L_2il0floatpacket.5:
|
||||
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,64
|
||||
.align 64
|
||||
# 64-byte table. Read as eight 64-bit little-endian integers: 0, 4, 8, 12, 1, 5, 9, 13.
# NOTE(review): presumably a qword permute-index vector; no use is visible in this
# part of the file -- confirm against the instruction that loads it.
.L_2il0floatpacket.6:
|
||||
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 64
|
||||
# 64-byte table. Read as eight 64-bit little-endian integers: 1, 5, 9, 13, 0, 4, 8, 12.
# NOTE(review): presumably a qword permute-index vector; no use is visible in this
# part of the file -- confirm against the instruction that loads it.
.L_2il0floatpacket.7:
|
||||
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
|
||||
.type .L_2il0floatpacket.7,@object
|
||||
.size .L_2il0floatpacket.7,64
|
||||
.align 64
|
||||
# 64-byte table. Read as eight 64-bit little-endian integers: 2, 6, 10, 14, 2, 6, 10, 14.
# NOTE(review): presumably a qword permute-index vector; no use is visible in this
# part of the file -- confirm against the instruction that loads it.
.L_2il0floatpacket.8:
|
||||
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
|
||||
.type .L_2il0floatpacket.8,@object
|
||||
.size .L_2il0floatpacket.8,64
|
||||
.align 64
|
||||
# 64-byte table of sixteen dword indices: 8..15 followed by 8..15 again.
# computeForce loads it into a zmm register and passes it as the vpermd index for
# the three force accumulators; with these indices vpermd copies the source's upper
# eight dwords into both halves of the result, which is then vaddpd'ed back onto the
# accumulator (first fold of the horizontal sum).
.L_2il0floatpacket.10:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.10,@object
|
||||
.size .L_2il0floatpacket.10,64
|
||||
.align 32
|
||||
# 32-byte constant: eight 32-bit integers, each equal to 8.
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
# 32-byte constant: the eight 32-bit integers 0, 1, 2, 3, 4, 5, 6, 7 (one per lane).
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
# 8-byte constant: one IEEE-754 double, bit pattern 0x4048000000000000 = 48.0
# (stored as low dword, then high dword).
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
# 8-byte constant: one IEEE-754 double, bit pattern 0x3ff0000000000000 = 1.0
# (stored as low dword, then high dword).
# In the visible part of computeForce it is referenced only as a {1to8} broadcast
# operand of vfnmadd213pd instructions that are currently commented out.
.L_2il0floatpacket.9:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.9,@object
|
||||
.size .L_2il0floatpacket.9,8
|
||||
.section .rodata.str1.4, "aMS",@progbits,1
|
||||
.align 4
|
||||
.align 4
|
||||
# 6-byte NUL-terminated string "force":
#   .long 1668444006 = 0x63726f66 -> bytes 'f','o','r','c' (little-endian)
#   .word 101        = 0x0065     -> bytes 'e','\0'
# computeForce passes its address in %edi to likwid_markerStartRegion and
# likwid_markerStopRegion.
.L_2__STRING.0:
|
||||
.long 1668444006
|
||||
.word 101
|
||||
.type .L_2__STRING.0,@object
|
||||
.size .L_2__STRING.0,6
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
585
asm/unused/force-mem-only.s
Normal file
585
asm/unused/force-mem-only.s
Normal file
@ -0,0 +1,585 @@
|
||||
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
|
||||
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
|
||||
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
|
||||
.file "force.c"
|
||||
.text
|
||||
..TXTST0:
|
||||
.L_2__routine_start_computeForce_0:
|
||||
# -- Begin computeForce
|
||||
.text
|
||||
# mark_begin;
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
|
||||
computeForce:
|
||||
# parameter 1: %rdi
|
||||
# parameter 2: %rsi
|
||||
# parameter 3: %rdx
|
||||
# parameter 4: %ecx
|
||||
..B1.1: # Preds ..B1.0
|
||||
# Execution count [1.00e+00]
|
||||
.cfi_startproc
|
||||
..___tag_value_computeForce.1:
|
||||
..L2:
|
||||
#103.87
|
||||
pushq %rbp #103.87
|
||||
.cfi_def_cfa_offset 16
|
||||
movq %rsp, %rbp #103.87
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_offset 6, -16
|
||||
andq $-64, %rsp #103.87
|
||||
pushq %r12 #103.87
|
||||
pushq %r13 #103.87
|
||||
pushq %r14 #103.87
|
||||
subq $104, %rsp #103.87
|
||||
xorl %eax, %eax #106.16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
movq %rdx, %r14 #103.87
|
||||
movq %rsi, %r13 #103.87
|
||||
movq %rdi, %r12 #103.87
|
||||
..___tag_value_computeForce.9:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #106.16
|
||||
..___tag_value_computeForce.10:
|
||||
# LOE rbx r12 r13 r14 r15 xmm0
|
||||
..B1.48: # Preds ..B1.1
|
||||
# Execution count [1.00e+00]
|
||||
vmovsd %xmm0, 16(%rsp) #106.16[spill]
|
||||
# LOE rbx r12 r13 r14 r15
|
||||
..B1.2: # Preds ..B1.48
|
||||
# Execution count [1.00e+00]
|
||||
movl 4(%r13), %ecx #107.18
|
||||
movq 64(%r13), %r11 #109.20
|
||||
movq 72(%r13), %r10 #109.45
|
||||
movq 80(%r13), %r9 #109.70
|
||||
vmovsd 72(%r12), %xmm2 #111.27
|
||||
vmovsd 8(%r12), %xmm1 #112.23
|
||||
vmovsd (%r12), %xmm0 #113.24
|
||||
testl %ecx, %ecx #116.24
|
||||
jle ..B1.42 # Prob 50% #116.24
|
||||
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.3: # Preds ..B1.2
|
||||
# Execution count [1.00e+00]
|
||||
xorl %edi, %edi #116.5
|
||||
movl %ecx, %edx #116.5
|
||||
xorl %esi, %esi #116.5
|
||||
movl $1, %r8d #116.5
|
||||
xorl %eax, %eax #117.17
|
||||
shrl $1, %edx #116.5
|
||||
je ..B1.7 # Prob 9% #116.5
|
||||
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.5: # Preds ..B1.3 ..B1.5
|
||||
# Execution count [2.50e+00]
|
||||
movq %rax, (%rsi,%r11) #117.9
|
||||
incq %rdi #116.5
|
||||
movq %rax, (%rsi,%r10) #118.9
|
||||
movq %rax, (%rsi,%r9) #119.9
|
||||
movq %rax, 8(%rsi,%r11) #117.9
|
||||
movq %rax, 8(%rsi,%r10) #118.9
|
||||
movq %rax, 8(%rsi,%r9) #119.9
|
||||
addq $16, %rsi #116.5
|
||||
cmpq %rdx, %rdi #116.5
|
||||
jb ..B1.5 # Prob 63% #116.5
|
||||
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.6: # Preds ..B1.5
|
||||
# Execution count [9.00e-01]
|
||||
lea 1(%rdi,%rdi), %r8d #117.9
|
||||
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.7: # Preds ..B1.3 ..B1.6
|
||||
# Execution count [1.00e+00]
|
||||
lea -1(%r8), %edx #116.5
|
||||
cmpl %ecx, %edx #116.5
|
||||
jae ..B1.9 # Prob 9% #116.5
|
||||
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
|
||||
..B1.8: # Preds ..B1.7
|
||||
# Execution count [9.00e-01]
|
||||
movslq %r8d, %r8 #116.5
|
||||
movq %rax, -8(%r11,%r8,8) #117.9
|
||||
movq %rax, -8(%r10,%r8,8) #118.9
|
||||
movq %rax, -8(%r9,%r8,8) #119.9
|
||||
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
|
||||
..B1.9: # Preds ..B1.7 ..B1.8
|
||||
# Execution count [9.00e-01]
|
||||
vmulsd %xmm2, %xmm2, %xmm13 #111.45
|
||||
xorl %edi, %edi #124.15
|
||||
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
|
||||
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
|
||||
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
|
||||
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
|
||||
vbroadcastsd %xmm13, %zmm14 #111.25
|
||||
vbroadcastsd %xmm1, %zmm13 #112.21
|
||||
vbroadcastsd %xmm0, %zmm9 #177.45
|
||||
movq 16(%r13), %rdx #127.25
|
||||
xorl %r8d, %r8d #124.5
|
||||
movslq %ecx, %r12 #124.5
|
||||
xorl %eax, %eax #124.5
|
||||
movq 24(%r14), %r13 #126.25
|
||||
movslq 16(%r14), %rcx #125.43
|
||||
movq 8(%r14), %rsi #125.19
|
||||
shlq $2, %rcx #108.5
|
||||
movq %r12, 80(%rsp) #124.5[spill]
|
||||
movq %r13, 88(%rsp) #124.5[spill]
|
||||
movq %r11, 96(%rsp) #124.5[spill]
|
||||
movq %r15, 8(%rsp) #124.5[spill]
|
||||
movq %rbx, (%rsp) #124.5[spill]
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.10: # Preds ..B1.40 ..B1.9
|
||||
# Execution count [5.00e+00]
|
||||
movq 88(%rsp), %rbx #126.25[spill]
|
||||
vxorpd %xmm24, %xmm24, %xmm24 #130.22
|
||||
vmovapd %xmm24, %xmm18 #131.22
|
||||
movl (%rbx,%r8,4), %r11d #126.25
|
||||
vmovapd %xmm18, %xmm4 #132.22
|
||||
vmovsd (%rax,%rdx), %xmm10 #127.25
|
||||
vmovsd 8(%rax,%rdx), %xmm6 #128.25
|
||||
vmovsd 16(%rax,%rdx), %xmm12 #129.25
|
||||
testl %r11d, %r11d #153.32
|
||||
jle ..B1.40 # Prob 50% #153.32
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.11: # Preds ..B1.10
|
||||
# Execution count [4.50e+00]
|
||||
vpxord %zmm8, %zmm8, %zmm8 #130.22
|
||||
vmovaps %zmm8, %zmm7 #131.22
|
||||
vmovaps %zmm7, %zmm11 #132.22
|
||||
cmpl $8, %r11d #153.13
|
||||
jl ..B1.45 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.12: # Preds ..B1.11
|
||||
# Execution count [4.50e+00]
|
||||
cmpl $1200, %r11d #153.13
|
||||
jl ..B1.44 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.13: # Preds ..B1.12
|
||||
# Execution count [4.50e+00]
|
||||
movq %rcx, %r15 #125.43
|
||||
imulq %rdi, %r15 #125.43
|
||||
addq %rsi, %r15 #108.5
|
||||
movq %r15, %r12 #153.13
|
||||
andq $63, %r12 #153.13
|
||||
testl $3, %r12d #153.13
|
||||
je ..B1.15 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.14: # Preds ..B1.13
|
||||
# Execution count [2.25e+00]
|
||||
xorl %r12d, %r12d #153.13
|
||||
jmp ..B1.17 # Prob 100% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.15: # Preds ..B1.13
|
||||
# Execution count [2.25e+00]
|
||||
testl %r12d, %r12d #153.13
|
||||
je ..B1.17 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.16: # Preds ..B1.15
|
||||
# Execution count [2.50e+01]
|
||||
negl %r12d #153.13
|
||||
addl $64, %r12d #153.13
|
||||
shrl $2, %r12d #153.13
|
||||
cmpl %r12d, %r11d #153.13
|
||||
cmovl %r11d, %r12d #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
|
||||
# Execution count [5.00e+00]
|
||||
movl %r11d, %r14d #153.13
|
||||
subl %r12d, %r14d #153.13
|
||||
andl $7, %r14d #153.13
|
||||
negl %r14d #153.13
|
||||
addl %r11d, %r14d #153.13
|
||||
cmpl $1, %r12d #153.13
|
||||
jb ..B1.25 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.18: # Preds ..B1.17
|
||||
# Execution count [4.50e+00]
|
||||
vmovdqa %ymm15, %ymm4 #153.13
|
||||
xorl %r13d, %r13d #153.13
|
||||
vpbroadcastd %r12d, %ymm3 #153.13
|
||||
vbroadcastsd %xmm10, %zmm2 #127.23
|
||||
vbroadcastsd %xmm6, %zmm1 #128.23
|
||||
vbroadcastsd %xmm12, %zmm0 #129.23
|
||||
movslq %r12d, %rbx #153.13
|
||||
movq %r9, 24(%rsp) #153.13[spill]
|
||||
movq %r10, 32(%rsp) #153.13[spill]
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.19: # Preds ..B1.23 ..B1.18
|
||||
# Execution count [2.50e+01]
|
||||
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
|
||||
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
|
||||
kmovw %k3, %r10d #153.13
|
||||
vpaddd %ymm17, %ymm17, %ymm18 #155.40
|
||||
vpaddd %ymm18, %ymm17, %ymm17 #155.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.22: # Preds ..B1.19
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #155.40
|
||||
kmovw %k3, %k2 #155.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #155.40
|
||||
vpxord %zmm19, %zmm19, %zmm19 #155.40
|
||||
vpxord %zmm20, %zmm20, %zmm20 #155.40
|
||||
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
|
||||
..B1.23: # Preds ..B1.22
|
||||
# Execution count [2.50e+01]
|
||||
addq $8, %r13 #153.13
|
||||
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
|
||||
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
|
||||
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
|
||||
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
|
||||
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
|
||||
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
|
||||
#vrcp14pd %zmm25, %zmm24 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm24, %k0 #175.42
|
||||
#kmovw %k2, %r9d #174.26
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmovaps %zmm25, %zmm17 #175.42
|
||||
#andl %r9d, %r10d #174.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
|
||||
#kmovw %r10d, %k3 #178.21
|
||||
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
|
||||
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
|
||||
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
|
||||
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
|
||||
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
|
||||
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
|
||||
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
|
||||
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
|
||||
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
|
||||
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
|
||||
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
|
||||
cmpq %rbx, %r13 #153.13
|
||||
jb ..B1.19 # Prob 82% #153.13
|
||||
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.24: # Preds ..B1.23
|
||||
# Execution count [4.50e+00]
|
||||
movq 24(%rsp), %r9 #[spill]
|
||||
movq 32(%rsp), %r10 #[spill]
|
||||
cmpl %r12d, %r11d #153.13
|
||||
je ..B1.39 # Prob 10% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
|
||||
# Execution count [2.50e+01]
|
||||
lea 8(%r12), %ebx #153.13
|
||||
cmpl %ebx, %r14d #153.13
|
||||
jl ..B1.33 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.26: # Preds ..B1.25
|
||||
# Execution count [4.50e+00]
|
||||
movq %rcx, %r13 #125.43
|
||||
imulq %rdi, %r13 #125.43
|
||||
vbroadcastsd %xmm10, %zmm1 #127.23
|
||||
vbroadcastsd %xmm6, %zmm0 #128.23
|
||||
vbroadcastsd %xmm12, %zmm2 #129.23
|
||||
movslq %r12d, %rbx #153.13
|
||||
addq %rsi, %r13 #108.5
|
||||
movq %rax, 40(%rsp) #108.5[spill]
|
||||
movq %rcx, 48(%rsp) #108.5[spill]
|
||||
movq %rsi, 56(%rsp) #108.5[spill]
|
||||
movq %r8, 64(%rsp) #108.5[spill]
|
||||
movq %rdi, 72(%rsp) #108.5[spill]
|
||||
movq %r9, 24(%rsp) #108.5[spill]
|
||||
movq %r10, 32(%rsp) #108.5[spill]
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.27: # Preds ..B1.31 ..B1.26
|
||||
# Execution count [2.50e+01]
|
||||
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
|
||||
vpaddd %ymm3, %ymm3, %ymm4 #155.40
|
||||
vpaddd %ymm4, %ymm3, %ymm3 #155.40
|
||||
movl (%r13,%rbx,4), %r10d #154.25
|
||||
movl 4(%r13,%rbx,4), %r9d #154.25
|
||||
movl 8(%r13,%rbx,4), %r8d #154.25
|
||||
movl 12(%r13,%rbx,4), %edi #154.25
|
||||
lea (%r10,%r10,2), %r10d #155.40
|
||||
movl 16(%r13,%rbx,4), %esi #154.25
|
||||
lea (%r9,%r9,2), %r9d #155.40
|
||||
movl 20(%r13,%rbx,4), %ecx #154.25
|
||||
lea (%r8,%r8,2), %r8d #155.40
|
||||
movl 24(%r13,%rbx,4), %eax #154.25
|
||||
lea (%rdi,%rdi,2), %edi #155.40
|
||||
movl 28(%r13,%rbx,4), %r15d #154.25
|
||||
lea (%rsi,%rsi,2), %esi #155.40
|
||||
lea (%rcx,%rcx,2), %ecx #155.40
|
||||
lea (%rax,%rax,2), %eax #155.40
|
||||
lea (%r15,%r15,2), %r15d #155.40
|
||||
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.30: # Preds ..B1.27
|
||||
# Execution count [1.25e+01]
|
||||
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
|
||||
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
|
||||
vpxord %zmm4, %zmm4, %zmm4 #155.40
|
||||
vpxord %zmm17, %zmm17, %zmm17 #155.40
|
||||
vpxord %zmm18, %zmm18, %zmm18 #155.40
|
||||
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
|
||||
..B1.31: # Preds ..B1.30
|
||||
# Execution count [2.50e+01]
|
||||
addl $8, %r12d #153.13
|
||||
addq $8, %rbx #153.13
|
||||
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
|
||||
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
|
||||
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
|
||||
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
|
||||
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
|
||||
#vrcp14pd %zmm3, %zmm22 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm22, %k0 #175.42
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
|
||||
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
|
||||
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
|
||||
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
|
||||
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
|
||||
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
|
||||
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
|
||||
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
|
||||
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
|
||||
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
|
||||
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
|
||||
cmpl %r14d, %r12d #153.13
|
||||
jb ..B1.27 # Prob 82% #153.13
|
||||
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.32: # Preds ..B1.31
|
||||
# Execution count [4.50e+00]
|
||||
movq 40(%rsp), %rax #[spill]
|
||||
movq 48(%rsp), %rcx #[spill]
|
||||
movq 56(%rsp), %rsi #[spill]
|
||||
movq 64(%rsp), %r8 #[spill]
|
||||
movq 72(%rsp), %rdi #[spill]
|
||||
movq 24(%rsp), %r9 #[spill]
|
||||
movq 32(%rsp), %r10 #[spill]
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
|
||||
# Execution count [5.00e+00]
|
||||
lea 1(%r14), %ebx #153.13
|
||||
cmpl %r11d, %ebx #153.13
|
||||
ja ..B1.39 # Prob 50% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.34: # Preds ..B1.33
|
||||
# Execution count [2.50e+01]
|
||||
imulq %rcx, %rdi #125.43
|
||||
vbroadcastsd %xmm10, %zmm4 #127.23
|
||||
subl %r14d, %r11d #153.13
|
||||
addq %rsi, %rdi #108.5
|
||||
vpbroadcastd %r11d, %ymm0 #153.13
|
||||
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
|
||||
movslq %r14d, %r14 #153.13
|
||||
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
|
||||
kmovw %k3, %edi #153.13
|
||||
vpaddd %ymm1, %ymm1, %ymm2 #155.40
|
||||
vpaddd %ymm2, %ymm1, %ymm0 #155.40
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
|
||||
..B1.37: # Preds ..B1.34
|
||||
# Execution count [1.25e+01]
|
||||
kmovw %k3, %k1 #155.40
|
||||
kmovw %k3, %k2 #155.40
|
||||
vpxord %zmm1, %zmm1, %zmm1 #155.40
|
||||
vpxord %zmm2, %zmm2, %zmm2 #155.40
|
||||
vpxord %zmm3, %zmm3, %zmm3 #155.40
|
||||
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
|
||||
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
|
||||
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.38: # Preds ..B1.37
|
||||
# Execution count [2.50e+01]
|
||||
#vbroadcastsd %xmm6, %zmm6 #128.23
|
||||
#vbroadcastsd %xmm12, %zmm12 #129.23
|
||||
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
|
||||
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
|
||||
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
|
||||
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
|
||||
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
|
||||
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
|
||||
#vrcp14pd %zmm19, %zmm18 #175.42
|
||||
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
|
||||
#vfpclasspd $30, %zmm18, %k0 #175.42
|
||||
#kmovw %k2, %ebx #174.26
|
||||
#knotw %k0, %k1 #175.42
|
||||
#vmovaps %zmm19, %zmm0 #175.42
|
||||
#andl %ebx, %edi #174.26
|
||||
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
|
||||
#kmovw %edi, %k3 #178.21
|
||||
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
|
||||
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
|
||||
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
|
||||
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
|
||||
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
|
||||
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
|
||||
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
|
||||
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
|
||||
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
|
||||
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
|
||||
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
|
||||
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
|
||||
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
|
||||
# Execution count [4.50e+00]
|
||||
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
|
||||
vpermd %zmm11, %zmm19, %zmm0 #132.22
|
||||
vpermd %zmm7, %zmm19, %zmm6 #131.22
|
||||
vpermd %zmm8, %zmm19, %zmm20 #130.22
|
||||
vaddpd %zmm11, %zmm0, %zmm11 #132.22
|
||||
vaddpd %zmm7, %zmm6, %zmm7 #131.22
|
||||
vaddpd %zmm8, %zmm20, %zmm8 #130.22
|
||||
vpermpd $78, %zmm11, %zmm1 #132.22
|
||||
vpermpd $78, %zmm7, %zmm10 #131.22
|
||||
vpermpd $78, %zmm8, %zmm21 #130.22
|
||||
vaddpd %zmm1, %zmm11, %zmm2 #132.22
|
||||
vaddpd %zmm10, %zmm7, %zmm12 #131.22
|
||||
vaddpd %zmm21, %zmm8, %zmm22 #130.22
|
||||
vpermpd $177, %zmm2, %zmm3 #132.22
|
||||
vpermpd $177, %zmm12, %zmm17 #131.22
|
||||
vpermpd $177, %zmm22, %zmm23 #130.22
|
||||
vaddpd %zmm3, %zmm2, %zmm4 #132.22
|
||||
vaddpd %zmm17, %zmm12, %zmm18 #131.22
|
||||
vaddpd %zmm23, %zmm22, %zmm24 #130.22
|
||||
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.40: # Preds ..B1.39 ..B1.10
|
||||
# Execution count [5.00e+00]
|
||||
movq 96(%rsp), %rbx #188.9[spill]
|
||||
addq $24, %rax #124.5
|
||||
movslq %r8d, %rdi #124.32
|
||||
incq %rdi #124.32
|
||||
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
|
||||
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
|
||||
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
|
||||
#vmovsd %xmm1, (%r10,%r8,8) #189.9
|
||||
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
|
||||
#vmovsd %xmm2, (%r9,%r8,8) #190.9
|
||||
incq %r8 #124.5
|
||||
cmpq 80(%rsp), %r8 #124.5[spill]
|
||||
jb ..B1.10 # Prob 82% #124.5
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
|
||||
..B1.41: # Preds ..B1.40
|
||||
# Execution count [9.00e-01]
|
||||
movq 8(%rsp), %r15 #[spill]
|
||||
.cfi_restore 15
|
||||
movq (%rsp), %rbx #[spill]
|
||||
.cfi_restore 3
|
||||
# LOE rbx r15
|
||||
..B1.42: # Preds ..B1.2 ..B1.41
|
||||
# Execution count [1.00e+00]
|
||||
xorl %eax, %eax #201.16
|
||||
vzeroupper #201.16
|
||||
..___tag_value_computeForce.43:
|
||||
# getTimeStamp()
|
||||
call getTimeStamp #201.16
|
||||
..___tag_value_computeForce.44:
|
||||
# LOE rbx r15 xmm0
|
||||
..B1.43: # Preds ..B1.42
|
||||
# Execution count [1.00e+00]
|
||||
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
|
||||
addq $104, %rsp #204.14
|
||||
.cfi_restore 14
|
||||
popq %r14 #204.14
|
||||
.cfi_restore 13
|
||||
popq %r13 #204.14
|
||||
.cfi_restore 12
|
||||
popq %r12 #204.14
|
||||
movq %rbp, %rsp #204.14
|
||||
popq %rbp #204.14
|
||||
.cfi_def_cfa 7, 8
|
||||
.cfi_restore 6
|
||||
ret #204.14
|
||||
.cfi_def_cfa 6, 16
|
||||
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_offset 6, -16
|
||||
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
|
||||
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
|
||||
# LOE
|
||||
..B1.44: # Preds ..B1.12
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
movl %r11d, %r14d #153.13
|
||||
xorl %r12d, %r12d #153.13
|
||||
andl $-8, %r14d #153.13
|
||||
jmp ..B1.25 # Prob 100% #153.13
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
..B1.45: # Preds ..B1.11
|
||||
# Execution count [4.50e-01]: Infreq
|
||||
xorl %r14d, %r14d #153.13
|
||||
jmp ..B1.33 # Prob 100% #153.13
|
||||
.align 16,0x90
|
||||
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
|
||||
.cfi_endproc
|
||||
# mark_end;
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.7:
|
||||
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
|
||||
.type .L_2il0floatpacket.7,@object
|
||||
.size .L_2il0floatpacket.7,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.8:
|
||||
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
|
||||
.type .L_2il0floatpacket.8,@object
|
||||
.size .L_2il0floatpacket.8,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.10:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.10,@object
|
||||
.size .L_2il0floatpacket.10,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.9:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.9,@object
|
||||
.size .L_2il0floatpacket.9,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
324
asm/unused/force.s
Normal file
324
asm/unused/force.s
Normal file
@ -0,0 +1,324 @@
|
||||
.intel_syntax noprefix
|
||||
|
||||
.text
|
||||
.align 16,0x90
|
||||
.globl computeForce
|
||||
computeForce:
# Entry block of computeForce. Register arguments as used below:
#   rdi  Parameter*  (cutforce, sigma6 and epsilon are read from it)
#   rsi  Atom*       (Nlocal and the fx/fy/fz array pointers are read from it)
#   rdx  Neighbor*   (not touched in this entry block)
# When atom->Nlocal <= 0 the function returns immediately with xmm0 = 0.0.
# Otherwise it falls through with:
#   r9d = Nlocal, r13 = fx, r14 = fy, rdi = fz, xmm0/xmm1/xmm2 = epsilon/sigma6/cutforce,
#   r10 = 0, r8 = 0, rax = 0, r11d = 1, ecx = Nlocal >> 1
# and jumps to ..zero_last_element when Nlocal >> 1 is zero.
    push      rbp
    push      r12
    push      r13
    push      r14
    push      r15
    push      rbx
    #call     getTimeStamp                       # xmm0 <- getTimeStamp()
    #vmovsd   QWORD PTR [-56+rsp], xmm0          # [-56+rsp] <- xmm0 [spill]
    mov       r9d, DWORD PTR [4+rsi]             # r9d <- atom->Nlocal
    vmovsd    xmm2, QWORD PTR [96+rdi]           # xmm2 <- param->cutforce
    vmovsd    xmm1, QWORD PTR [32+rdi]           # xmm1 <- param->sigma6
    vmovsd    xmm0, QWORD PTR [24+rdi]           # xmm0 <- param->epsilon
    mov       r13, QWORD PTR [64+rsi]            # r13 <- atom->fx
    mov       r14, QWORD PTR [72+rsi]            # r14 <- atom->fy
    mov       rdi, QWORD PTR [80+rsi]            # rdi <- atom->fz
    test      r9d, r9d                           # atom->Nlocal > 0 ?
    jg        ..nlocal_positive

    # No local atoms. Must not branch to ..atom_loop_exit from here: that code
    # reads the [-8+rsp], [-16+rsp] and [-32+rsp] slots and indexes with r10,
    # all of which are only initialised further down. Undo the pushes (reverse
    # order) and return 0.0, the same value the regular exit path returns.
    vxorpd    xmm0, xmm0, xmm0                   # xmm0 <- 0.0
    pop       rbx
    pop       r15
    pop       r14
    pop       r13
    pop       r12
    pop       rbp
    ret

..nlocal_positive:
    xor       r10d, r10d                         # r10d <- 0 (pass counter of the zeroing loop)
    mov       ecx, r9d                           # ecx <- atom->Nlocal
    xor       r8d, r8d                           # r8d <- 0 (byte offset into fx/fy/fz)
    mov       r11d, 1                            # r11d <- 1 (index of last element to zero, plus 1)
    xor       eax, eax                           # eax <- 0 (value stored into the force arrays)
    shr       ecx, 1                             # ecx <- atom->Nlocal >> 1
    je        ..zero_last_element                # no full pair to clear (Nlocal == 1)
|
||||
|
||||
# Init forces to zero loop (unroll factor = 2)
# Each pass clears two consecutive 8-byte elements of fx, fy and fz.
# r8 = byte offset into the arrays, r10 = pass counter, rcx = Nlocal >> 1, rax = 0.
|
||||
..init_force_loop:
|
||||
mov QWORD PTR [r8+r13], rax # fx[2 * r10] <- 0
|
||||
mov QWORD PTR [r8+r14], rax # fy[2 * r10] <- 0
|
||||
mov QWORD PTR [r8+rdi], rax # fz[2 * r10] <- 0
|
||||
mov QWORD PTR [8+r8+r13], rax # fx[2 * r10 + 1] <- 0
|
||||
mov QWORD PTR [8+r8+r14], rax # fy[2 * r10 + 1] <- 0
|
||||
mov QWORD PTR [8+r8+rdi], rax # fz[2 * r10 + 1] <- 0
|
||||
add r8, 16 # byte offset += two 8-byte elements
|
||||
inc r10 # pass counter++
|
||||
cmp r10, rcx # pass counter < Nlocal >> 1
|
||||
jb ..init_force_loop
|
||||
|
||||
# Trick to make r11d contain the index of the last element to be zeroed plus 1
|
||||
# Maybe we can directly put r10+r10 here and zero r11d above, then remove the -1 below
|
||||
lea r11d, DWORD PTR [1+r10+r10] # r11d <- pass counter * 2 + 1
|
||||
..zero_last_element:
# Clears the one remaining element when Nlocal is odd.
# On entry r11d is 1 (zeroing loop skipped) or pass counter * 2 + 1 (after the loop).
|
||||
lea ecx, DWORD PTR [-1+r11] # ecx <- r11d - 1 (number of elements already zeroed)
|
||||
cmp ecx, r9d # elements already zeroed >= Nlocal (unsigned compare)
|
||||
jae ..before_atom_loop
|
||||
|
||||
# Set last element to zero
|
||||
movsxd r11, r11d # r11 <- sign-extended r11d (index of last element plus 1)
|
||||
mov QWORD PTR [-8+r13+r11*8], rax # fx[r11 - 1] <- 0
|
||||
mov QWORD PTR [-8+r14+r11*8], rax # fy[r11 - 1] <- 0
|
||||
mov QWORD PTR [-8+rdi+r11*8], rax # fz[r11 - 1] <- 0
|
||||
|
||||
# Initialize registers to be used within atom loop
# Loop-invariant values set up here: zmm16 = cutforcesq, zmm15 = sigma6, zmm14 = 48 * epsilon,
# zmm7 = 0.5, ymm17 = lane indices [0..7], r11 = neighbors, r12 = maxneighs * 4, rdx = atom->x.
|
||||
..before_atom_loop:
|
||||
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforce * cutforce (cutforcesq)
|
||||
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] (not referenced again in this function)
|
||||
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
|
||||
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
|
||||
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
|
||||
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
|
||||
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
|
||||
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
|
||||
movsxd r9, r9d # r9 <- atom->Nlocal
|
||||
xor r10d, r10d # r10d <- 0 (i)
|
||||
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
|
||||
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
|
||||
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
|
||||
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
|
||||
### AOS
|
||||
xor eax, eax # rax <- 0 (byte offset of atom i inside atom->x; advanced by 24 per atom)
|
||||
### SOA
|
||||
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
|
||||
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
|
||||
###
|
||||
shl r12, 2 # r12 <- neighbor->maxneighs * 4
|
||||
|
||||
# Register spilling
# The slots live below rsp and rsp itself is not adjusted.
# NOTE(review): presumably this relies on the System V red zone, which only holds while
# the function makes no calls (the getTimeStamp calls are commented out) - confirm.
|
||||
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
|
||||
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
|
||||
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
|
||||
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
|
||||
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15 (slot is never read back; r15 is restored by pop)
|
||||
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx (slot is never read back; rbx is restored by pop)
|
||||
|
||||
..atom_loop_begin:
# Start of the per-atom iteration (r10 = i, rax = byte offset of atom i in atom->x).
# Zeroes the scalar results (xmm25/xmm20/xmm4), broadcasts the position of atom i and,
# if the atom has neighbors, zeroes the vector accumulators and computes the address of
# its neighbor list in rcx.
|
||||
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
|
||||
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
|
||||
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
|
||||
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
|
||||
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
|
||||
|
||||
### AOS
|
||||
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
|
||||
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
|
||||
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
|
||||
### SOA
|
||||
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
|
||||
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
|
||||
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
|
||||
###
|
||||
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
|
||||
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
|
||||
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
|
||||
test r13d, r13d # numneighs <= 0
|
||||
jle ..atom_loop_exit # no neighbors: write back the zero sums in xmm25/xmm20/xmm4
|
||||
|
||||
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
|
||||
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
|
||||
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
|
||||
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
|
||||
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
|
||||
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
|
||||
xor r9d, r9d # r9d <- 0 (k)
|
||||
mov r14d, r13d # r14d <- numneighs
|
||||
cmp r14d, 8 # fewer than 8 neighbors in total?
|
||||
jl ..compute_forces_remainder # yes: only the masked remainder pass is needed
|
||||
|
||||
..compute_forces:
# Main neighbor loop: handles 8 neighbors per pass.
# r9 = index k of the first neighbor of this pass, r14d = neighbors still to process.
|
||||
vpcmpeqb k1, xmm0, xmm0 # k1 <- all ones (register compared with itself), gather mask
|
||||
vpcmpeqb k2, xmm0, xmm0 # k2 <- all ones, gather mask
|
||||
vpcmpeqb k3, xmm0, xmm0 # k3 <- all ones, gather mask
|
||||
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4] # ymm3 <- 8 neighbor indices j starting at neighbor k
|
||||
vpxord zmm5, zmm5, zmm5 # zmm5 <- 0 (gather destination)
|
||||
vpxord zmm6, zmm6, zmm6 # zmm6 <- 0 (gather destination)
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3 # ymm4 <- j * 2
|
||||
vpaddd ymm3, ymm3, ymm4 # ymm3 <- j * 3
|
||||
vpxord zmm4, zmm4, zmm4 # zmm4 <- 0 (gather destination)
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8] # zmm4 <- atom->x[j * 3]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8] # zmm5 <- atom->x[j * 3 + 1]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8] # zmm6 <- atom->x[j * 3 + 2]
|
||||
### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- approximate 1.0 / rsq (sr2); no refinement step follows
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 -- sr6
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5 -- sr6 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr6
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force = 48.0 * epsilon * sr2 * sr6 * (sr6 - 0.5)
|
||||
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
||||
sub r14d, 8 # remaining neighbors -= 8
|
||||
add r9, 8 # k += 8
|
||||
cmp r14d, 8 # at least 8 neighbors left?
|
||||
jge ..compute_forces
|
||||
|
||||
# Check if there are remaining neighbors to be computed
# Handles the last r14d (< 8) neighbors with a lane mask; r9 = index k of the first of them.
|
||||
..compute_forces_remainder:
|
||||
test r14d, r14d # remaining neighbors <= 0
|
||||
jle ..sum_up_forces
|
||||
|
||||
vpbroadcastd ymm4, r14d # ymm4 <- [remaining, ...]
|
||||
vpcmpgtd k1, ymm4, ymm17 # k1 <- remaining > lane index (lanes that hold a valid neighbor)
|
||||
kmovw r15d, k1 # r15d <- valid-lane mask (kept for the cutoff test below)
|
||||
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4] # ymm3 <- neighbor indices j, zero in invalid lanes
|
||||
kmovw k2, k1 # k2 <- valid-lane mask, gather mask
|
||||
kmovw k3, k1 # k3 <- valid-lane mask, gather mask
|
||||
vpxord zmm5, zmm5, zmm5 # zmm5 <- 0 (gather destination)
|
||||
vpxord zmm6, zmm6, zmm6 # zmm6 <- 0 (gather destination)
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3 # ymm4 <- j * 2
|
||||
vpaddd ymm3, ymm3, ymm4 # ymm3 <- j * 3
|
||||
vpxord zmm4, zmm4, zmm4 # zmm4 <- 0 (gather destination)
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8] # zmm4 <- atom->x[j * 3] (valid lanes only)
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8] # zmm5 <- atom->x[j * 3 + 1] (valid lanes only)
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8] # zmm6 <- atom->x[j * 3 + 2] (valid lanes only)
|
||||
#### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- approximate 1.0 / rsq (sr2); no refinement step follows
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
kmovw r9d, k5 # r9d <- rsq < cutforcesq (r9 reused as scratch; k is not needed after this point)
|
||||
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
|
||||
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 -- sr6
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5 -- sr6 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr6
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force = 48.0 * epsilon * sr2 * sr6 * (sr6 - 0.5)
|
||||
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
|
||||
|
||||
# Forces are currently spread over the lanes of the zmm accumulators, hence it is necessary to permute
|
||||
# and add them (reduction) to obtain the final contribution for the current atom.
# Inputs: zmm13 (fix), zmm12 (fiy), zmm11 (fiz). Outputs (lowest double): xmm25 (fix), xmm20 (fiy), xmm4 (fiz).
|
||||
..sum_up_forces:
|
||||
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip] # zmm10 <- dword indices [8..15, 8..15]
|
||||
vpermd zmm0, zmm10, zmm11 # zmm0 <- upper four doubles of fiz, copied into both halves
|
||||
vpermd zmm5, zmm10, zmm12 # zmm5 <- upper four doubles of fiy, copied into both halves
|
||||
vpermd zmm21, zmm10, zmm13 # zmm21 <- upper four doubles of fix, copied into both halves
|
||||
vaddpd zmm11, zmm0, zmm11 # fiz: lower four doubles <- lower half + upper half
|
||||
vaddpd zmm12, zmm5, zmm12 # fiy: lower four doubles <- lower half + upper half
|
||||
vaddpd zmm13, zmm21, zmm13 # fix: lower four doubles <- lower half + upper half
|
||||
vpermpd zmm1, zmm11, 78 # zmm1 <- fiz with the two double pairs of each 256-bit half swapped
|
||||
vpermpd zmm6, zmm12, 78 # zmm6 <- fiy with the two double pairs of each 256-bit half swapped
|
||||
vpermpd zmm22, zmm13, 78 # zmm22 <- fix with the two double pairs of each 256-bit half swapped
|
||||
vaddpd zmm2, zmm11, zmm1 # fiz: four partial sums reduced to two
|
||||
vaddpd zmm8, zmm12, zmm6 # fiy: four partial sums reduced to two
|
||||
vaddpd zmm23, zmm13, zmm22 # fix: four partial sums reduced to two
|
||||
vpermpd zmm3, zmm2, 177 # zmm3 <- fiz with adjacent doubles swapped
|
||||
vpermpd zmm9, zmm8, 177 # zmm9 <- fiy with adjacent doubles swapped
|
||||
vpermpd zmm24, zmm23, 177 # zmm24 <- fix with adjacent doubles swapped
|
||||
vaddpd zmm4, zmm2, zmm3 # xmm4 (lowest double) <- sum of all 8 fiz lanes
|
||||
vaddpd zmm20, zmm8, zmm9 # xmm20 (lowest double) <- sum of all 8 fiy lanes
|
||||
vaddpd zmm25, zmm23, zmm24 # xmm25 (lowest double) <- sum of all 8 fix lanes
|
||||
|
||||
..atom_loop_exit:
# End of the per-atom iteration: adds the reduced forces (xmm25/xmm20/xmm4) to
# fx[i]/fy[i]/fz[i], advances to the next atom and, after the last atom, runs the epilogue.
|
||||
mov rcx, QWORD PTR [-8+rsp] # rcx <- atom->fx (reload from spill slot)
|
||||
mov rbx, QWORD PTR [-16+rsp] # rbx <- atom->fy (reload from spill slot)
|
||||
|
||||
### AOS
|
||||
add rax, 24 # rax <- byte offset of the next atom in atom->x (3 * 8 bytes per atom)
|
||||
###
|
||||
|
||||
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] # xmm0 <- fx[i] + fix
|
||||
vmovsd QWORD PTR [rcx+r10*8], xmm0 # fx[i] <- xmm0
|
||||
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] # xmm1 <- fy[i] + fiy
|
||||
vmovsd QWORD PTR [rbx+r10*8], xmm1 # fy[i] <- xmm1
|
||||
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] # xmm2 <- fz[i] + fiz
|
||||
vmovsd QWORD PTR [rdi+r10*8], xmm2 # fz[i] <- xmm2
|
||||
inc r10 # i++
|
||||
cmp r10, QWORD PTR [-32+rsp] # i < atom->Nlocal (reload from spill slot)
|
||||
jb ..atom_loop_begin
|
||||
vzeroupper # clear upper vector state before returning
|
||||
vxorpd xmm0, xmm0, xmm0 # xmm0 <- 0.0 (value left in xmm0 while the timing code below stays commented out)
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
|
||||
pop rbx
|
||||
pop r15
|
||||
pop r14 # pops mirror the pushes at function entry in reverse order
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
.type computeForce,@function
|
||||
.size computeForce,.-computeForce
|
||||
|
||||
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForce
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2: # eight 64-bit values 0x3ff0000000000000 (double 1.0); not referenced by the computeForce code in this file
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4: # eight 64-bit values 0x3fe0000000000000 (double 0.5); loaded into zmm7
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6: # sixteen dword indices 8..15, 8..15; vpermd index vector used by the force reduction
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0: # eight dwords, each 8; loaded into ymm18, which is not used afterwards
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1: # dword lane indices 0..7; loaded into ymm17 for the remainder mask
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3: # 64-bit value 0x4048000000000000 (double 48.0); multiplied with epsilon
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.5: # 64-bit value 0x3ff0000000000000 (double 1.0); not referenced by the computeForce code in this file
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
326
asm/unused/force_lj.s
Normal file
326
asm/unused/force_lj.s
Normal file
@ -0,0 +1,326 @@
|
||||
.intel_syntax noprefix
|
||||
|
||||
.text
|
||||
.align 16,0x90
|
||||
.globl computeForceLJ
|
||||
computeForceLJ:
|
||||
# parameter 1: rdi Parameter*
|
||||
# parameter 2: rsi Atom*
|
||||
# parameter 3: rdx Neighbor*
|
||||
push rbp
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
push rbx
|
||||
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
|
||||
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
|
||||
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
|
||||
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
|
||||
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
|
||||
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
|
||||
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
|
||||
test r9d, r9d # atom->Nlocal <= 0
|
||||
jle ..atom_loop_exit
|
||||
xor r10d, r10d # r10d <- 0
|
||||
mov ecx, r9d # ecx <- atom->Nlocal
|
||||
xor r8d, r8d # r8d <- 0
|
||||
mov r11d, 1 # r11d <- 1
|
||||
xor eax, eax # eax <- 0
|
||||
shr ecx, 1 # ecx <- atom->Nlocal >> 1
|
||||
je ..zero_last_element # ecx == 0
|
||||
|
||||
# Init forces to zero loop (unroll factor = 2)
|
||||
..init_force_loop:
|
||||
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
|
||||
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
|
||||
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
|
||||
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
|
||||
add r8, 16 # i++
|
||||
inc r10 # i++
|
||||
cmp r10, rcx # i < Nlocal
|
||||
jb ..init_force_loop
|
||||
|
||||
# Trick to make r11d contain value of last element to be zeroed plus 1
|
||||
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
|
||||
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
|
||||
..zero_last_element:
|
||||
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
|
||||
cmp ecx, r9d # i >= Nlocal
|
||||
jae ..before_atom_loop
|
||||
|
||||
# Set last element to zero
|
||||
movsxd r11, r11d # r11 <- i * 2
|
||||
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
|
||||
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
|
||||
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
|
||||
|
||||
# Initialize registers to be used within atom loop
|
||||
..before_atom_loop:
|
||||
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
|
||||
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
|
||||
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
|
||||
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
|
||||
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
|
||||
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
|
||||
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
|
||||
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
|
||||
movsxd r9, r9d # r9 <- atom->Nlocal
|
||||
xor r10d, r10d # r10d <- 0 (i)
|
||||
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
|
||||
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
|
||||
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
|
||||
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
|
||||
### AOS
|
||||
xor eax, eax
|
||||
### SOA
|
||||
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
|
||||
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
|
||||
###
|
||||
shl r12, 2 # r12 <- neighbor->maxneighs * 4
|
||||
|
||||
# Register spilling
|
||||
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
|
||||
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
|
||||
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
|
||||
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
|
||||
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
|
||||
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
|
||||
#sub rsp, 64
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
|
||||
#add rsp, 64
|
||||
|
||||
..atom_loop_begin:
|
||||
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
|
||||
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
|
||||
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
|
||||
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
|
||||
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
|
||||
|
||||
### AOS
|
||||
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
|
||||
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
|
||||
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
|
||||
### SOA
|
||||
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
|
||||
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
|
||||
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
|
||||
###
|
||||
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
|
||||
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
|
||||
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
|
||||
test r13d, r13d # numneighs <= 0
|
||||
jle ..atom_loop_exit
|
||||
|
||||
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
|
||||
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
|
||||
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
|
||||
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
|
||||
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
|
||||
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
|
||||
xor r9d, r9d # r9d <- 0 (k)
|
||||
mov r14d, r13d # r14d <- numneighs
|
||||
cmp r14d, 8
|
||||
jl ..compute_forces_remainder
|
||||
|
||||
..compute_forces:
|
||||
vpcmpeqb k1, xmm0, xmm0
|
||||
vpcmpeqb k2, xmm0, xmm0
|
||||
vpcmpeqb k3, xmm0, xmm0
|
||||
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||
### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
|
||||
sub r14d, 8
|
||||
add r9, 8
|
||||
cmp r14d, 8
|
||||
jge ..compute_forces
|
||||
|
||||
# Check if there are remaining neighbors to be computed
|
||||
..compute_forces_remainder:
|
||||
test r14d, r14d
|
||||
jle ..sum_up_forces
|
||||
|
||||
vpbroadcastd ymm4, r14d
|
||||
vpcmpgtd k1, ymm4, ymm17
|
||||
kmovw r15d, k1
|
||||
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
|
||||
kmovw k2, k1
|
||||
kmovw k3, k1
|
||||
vpxord zmm5, zmm5, zmm5
|
||||
vpxord zmm6, zmm6, zmm6
|
||||
|
||||
### AOS
|
||||
vpaddd ymm4, ymm3, ymm3
|
||||
vpaddd ymm3, ymm3, ymm4
|
||||
vpxord zmm4, zmm4, zmm4
|
||||
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
|
||||
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
|
||||
#### SOA
|
||||
#vpxord zmm4, zmm4, zmm4
|
||||
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
|
||||
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
|
||||
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
|
||||
###
|
||||
|
||||
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
|
||||
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
|
||||
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
|
||||
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
|
||||
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
|
||||
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
|
||||
|
||||
# Cutoff radius condition
|
||||
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
|
||||
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
|
||||
kmovw r9d, k5 # r9d <- rsq < cutforcesq
|
||||
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
|
||||
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
|
||||
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
|
||||
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
|
||||
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
|
||||
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
|
||||
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5
|
||||
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
|
||||
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
|
||||
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
|
||||
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
|
||||
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
|
||||
|
||||
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permute
|
||||
# and add them (reduction) to obtain the final contribution for the current atom
|
||||
..sum_up_forces:
|
||||
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
|
||||
vpermd zmm0, zmm10, zmm11
|
||||
vpermd zmm5, zmm10, zmm12
|
||||
vpermd zmm21, zmm10, zmm13
|
||||
vaddpd zmm11, zmm0, zmm11
|
||||
vaddpd zmm12, zmm5, zmm12
|
||||
vaddpd zmm13, zmm21, zmm13
|
||||
vpermpd zmm1, zmm11, 78
|
||||
vpermpd zmm6, zmm12, 78
|
||||
vpermpd zmm22, zmm13, 78
|
||||
vaddpd zmm2, zmm11, zmm1
|
||||
vaddpd zmm8, zmm12, zmm6
|
||||
vaddpd zmm23, zmm13, zmm22
|
||||
vpermpd zmm3, zmm2, 177
|
||||
vpermpd zmm9, zmm8, 177
|
||||
vpermpd zmm24, zmm23, 177
|
||||
vaddpd zmm4, zmm2, zmm3
|
||||
vaddpd zmm20, zmm8, zmm9
|
||||
vaddpd zmm25, zmm23, zmm24
|
||||
|
||||
..atom_loop_exit:
|
||||
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
|
||||
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
|
||||
|
||||
### AOS
|
||||
add rax, 24
|
||||
###
|
||||
|
||||
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
|
||||
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
|
||||
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
|
||||
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
|
||||
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
|
||||
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
|
||||
inc r10 #55.5
|
||||
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
|
||||
jb ..atom_loop_begin
|
||||
vzeroupper #93.12
|
||||
vxorpd xmm0, xmm0, xmm0 #93.12
|
||||
#call getTimeStamp # xmm0 <- getTimeStamp()
|
||||
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
|
||||
pop rbx
|
||||
pop r15
|
||||
pop r14 #93.12
|
||||
pop r13 #93.12
|
||||
pop r12 #93.12
|
||||
pop rbp #93.12
|
||||
ret #93.12
|
||||
|
||||
.type computeForceLJ,@function
|
||||
.size computeForceLJ,.-computeForceLJ
|
||||
|
||||
|
||||
..LNcomputeForce.0:
|
||||
.data
|
||||
# -- End computeForceLJ
|
||||
.section .rodata, "a"
|
||||
.align 64
|
||||
.align 64
|
||||
.L_2il0floatpacket.2:
|
||||
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.2,@object
|
||||
.size .L_2il0floatpacket.2,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.4:
|
||||
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
|
||||
.type .L_2il0floatpacket.4,@object
|
||||
.size .L_2il0floatpacket.4,64
|
||||
.align 64
|
||||
.L_2il0floatpacket.6:
|
||||
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
|
||||
.type .L_2il0floatpacket.6,@object
|
||||
.size .L_2il0floatpacket.6,64
|
||||
.align 32
|
||||
.L_2il0floatpacket.0:
|
||||
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
|
||||
.type .L_2il0floatpacket.0,@object
|
||||
.size .L_2il0floatpacket.0,32
|
||||
.align 32
|
||||
.L_2il0floatpacket.1:
|
||||
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
|
||||
.type .L_2il0floatpacket.1,@object
|
||||
.size .L_2il0floatpacket.1,32
|
||||
.align 8
|
||||
.L_2il0floatpacket.3:
|
||||
.long 0x00000000,0x40480000
|
||||
.type .L_2il0floatpacket.3,@object
|
||||
.size .L_2il0floatpacket.3,8
|
||||
.align 8
|
||||
.L_2il0floatpacket.5:
|
||||
.long 0x00000000,0x3ff00000
|
||||
.type .L_2il0floatpacket.5,@object
|
||||
.size .L_2il0floatpacket.5,8
|
||||
.data
|
||||
.section .note.GNU-stack, ""
|
||||
# End
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -9,10 +9,8 @@
|
||||
|
||||
#if PRECISION == 1
|
||||
#define MD_FLOAT float
|
||||
# define MD_UINT unsigned int
|
||||
#else
|
||||
#define MD_FLOAT double
|
||||
# define MD_UINT unsigned long long int
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
@ -21,7 +19,6 @@ typedef struct {
|
||||
char* input_file;
|
||||
char* vtk_file;
|
||||
char* xtc_file;
|
||||
char* write_atom_file;
|
||||
MD_FLOAT epsilon;
|
||||
MD_FLOAT sigma;
|
||||
MD_FLOAT sigma6;
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -48,13 +48,11 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
|
||||
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
|
||||
t0 = _mm256_add_pd(t0, t2);
|
||||
t1 = _mm256_add_pd(t1, t2);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0xC);
|
||||
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t0 = _mm256_blend_pd(t0, t1, 0b1100);
|
||||
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
|
||||
_mm256_store_pd(m, t1);
|
||||
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
|
||||
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
|
||||
a0 = _mm256_castpd256_pd128(t0);
|
||||
a1 = _mm256_extractf128_pd(t0, 0x1);
|
||||
a0 = _mm_add_sd(a0, a1);
|
||||
@ -93,7 +91,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
|
||||
}
|
||||
|
||||
// Functions used in LAMMPS kernel
|
||||
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
|
||||
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
|
||||
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
|
||||
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
|
||||
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -12,10 +12,7 @@
|
||||
#define MD_SIMD_FLOAT __m512d
|
||||
#define MD_SIMD_MASK __mmask8
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
|
||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -7,30 +7,11 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <immintrin.h>
|
||||
#ifndef NO_ZMM_INTRIN
|
||||
#include <zmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define MD_SIMD_FLOAT __m512
|
||||
#define MD_SIMD_MASK __mmask16
|
||||
#define MD_SIMD_INT __m256i
|
||||
#define MD_SIMD_IBOOL __mmask16
|
||||
#define MD_SIMD_INT32 __m512i
|
||||
#define MD_SIMD_BITMASK MD_SIMD_INT32
|
||||
|
||||
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
|
||||
return _mm512_load_si512(m);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
|
||||
return _mm512_set1_epi32(a);
|
||||
}
|
||||
|
||||
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
|
||||
return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
|
||||
}
|
||||
|
||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
|
||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
|
||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
|
||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
|
||||
@ -88,7 +69,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
|
||||
return _mm_cvtss_f32(t3);
|
||||
}
|
||||
|
||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
|
||||
__m256 t;
|
||||
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
|
||||
t = _mm256_load_ps(m);
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -7,7 +7,8 @@
|
||||
#ifndef __TIMING_H_
|
||||
#define __TIMING_H_
|
||||
|
||||
extern double getTimeStamp(void);
|
||||
extern double getTimeResolution(void);
|
||||
extern double getTimeStamp();
|
||||
extern double getTimeResolution();
|
||||
extern double getTimeStamp_();
|
||||
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -7,7 +7,6 @@
|
||||
#ifndef __UTIL_H_
|
||||
#define __UTIL_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#ifndef MIN
|
||||
# define MIN(x,y) ((x)<(y)?(x):(y))
|
||||
#endif
|
||||
@ -40,8 +39,8 @@ extern double myrandom(int *);
|
||||
extern void random_reset(int *seed, int ibase, double *coord);
|
||||
extern int str2ff(const char *string);
|
||||
extern const char* ff2str(int ff);
|
||||
extern int get_num_threads();
|
||||
extern void readline(char *line, FILE *fp);
|
||||
extern void debug_printf(const char *format, ...);
|
||||
extern int get_cuda_num_threads(void);
|
||||
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -17,7 +17,6 @@ void initParameter(Parameter *param) {
|
||||
param->vtk_file = NULL;
|
||||
param->xtc_file = NULL;
|
||||
param->eam_file = NULL;
|
||||
param->write_atom_file = NULL;
|
||||
param->force_field = FF_LJ;
|
||||
param->epsilon = 1.0;
|
||||
param->sigma = 1.0;
|
||||
@ -132,19 +131,19 @@ void readParameter(Parameter *param, const char *filename) {
|
||||
void printParameter(Parameter *param) {
|
||||
printf("Parameters:\n");
|
||||
if(param->input_file != NULL) {
|
||||
printf("\tInput file: %s\n", param->input_file);
|
||||
printf("Input file: %s\n", param->input_file);
|
||||
}
|
||||
|
||||
if(param->vtk_file != NULL) {
|
||||
printf("\tVTK file: %s\n", param->vtk_file);
|
||||
printf("VTK file: %s\n", param->vtk_file);
|
||||
}
|
||||
|
||||
if(param->xtc_file != NULL) {
|
||||
printf("\tXTC file: %s\n", param->xtc_file);
|
||||
printf("XTC file: %s\n", param->xtc_file);
|
||||
}
|
||||
|
||||
if(param->eam_file != NULL) {
|
||||
printf("\tEAM file: %s\n", param->eam_file);
|
||||
printf("EAM file: %s\n", param->eam_file);
|
||||
}
|
||||
|
||||
printf("\tForce field: %s\n", ff2str(param->force_field));
|
||||
@ -170,11 +169,6 @@ void printParameter(Parameter *param) {
|
||||
printf("\tNumber of timesteps: %d\n", param->ntimes);
|
||||
printf("\tReport stats every (timesteps): %d\n", param->nstat);
|
||||
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
|
||||
#ifdef SORT_ATOMS
|
||||
printf("\tSort atoms when reneighboring: yes\n");
|
||||
#else
|
||||
printf("\tSort atoms when reneighboring: no\n");
|
||||
#endif
|
||||
printf("\tPrune every (timesteps): %d\n", param->prune_every);
|
||||
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
|
||||
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,21 +1,27 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
|
||||
double getTimeStamp(void)
|
||||
double getTimeStamp()
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
|
||||
}
|
||||
|
||||
double getTimeResolution(void)
|
||||
double getTimeResolution()
|
||||
{
|
||||
struct timespec ts;
|
||||
clock_getres(CLOCK_MONOTONIC, &ts);
|
||||
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
|
||||
}
|
||||
|
||||
double getTimeStamp_()
|
||||
{
|
||||
return getTimeStamp();
|
||||
}
|
@ -1,10 +1,11 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -18,8 +19,7 @@
|
||||
#define IR 2836
|
||||
#define MASK 123459876
|
||||
|
||||
double myrandom(int* seed)
|
||||
{
|
||||
double myrandom(int* seed) {
|
||||
int k= (*seed) / IQ;
|
||||
double ans;
|
||||
|
||||
@ -29,8 +29,7 @@ double myrandom(int* seed)
|
||||
return ans;
|
||||
}
|
||||
|
||||
void random_reset(int* seed, int ibase, double* coord)
|
||||
{
|
||||
void random_reset(int *seed, int ibase, double *coord) {
|
||||
int i;
|
||||
char *str = (char *) &ibase;
|
||||
int n = sizeof(int);
|
||||
@ -62,41 +61,30 @@ void random_reset(int* seed, int ibase, double* coord)
|
||||
|
||||
// warm up the RNG
|
||||
|
||||
for (i = 0; i < 5; i++)
|
||||
myrandom(seed);
|
||||
for (i = 0; i < 5; i++) myrandom(seed);
|
||||
//save = 0;
|
||||
}
|
||||
|
||||
int str2ff(const char* string)
|
||||
{
|
||||
int str2ff(const char *string) {
|
||||
if(strncmp(string, "lj", 2) == 0) return FF_LJ;
|
||||
if(strncmp(string, "eam", 3) == 0) return FF_EAM;
|
||||
if(strncmp(string, "dem", 3) == 0) return FF_DEM;
|
||||
return -1;
|
||||
}
|
||||
|
||||
const char* ff2str(int ff)
|
||||
{
|
||||
if (ff == FF_LJ) {
|
||||
return "lj";
|
||||
}
|
||||
if (ff == FF_EAM) {
|
||||
return "eam";
|
||||
}
|
||||
if (ff == FF_DEM) {
|
||||
return "dem";
|
||||
}
|
||||
const char* ff2str(int ff) {
|
||||
if(ff == FF_LJ) { return "lj"; }
|
||||
if(ff == FF_EAM) { return "eam"; }
|
||||
if(ff == FF_DEM) { return "dem"; }
|
||||
return "invalid";
|
||||
}
|
||||
|
||||
int get_cuda_num_threads(void)
|
||||
{
|
||||
int get_num_threads() {
|
||||
const char *num_threads_env = getenv("NUM_THREADS");
|
||||
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
|
||||
}
|
||||
|
||||
void readline(char* line, FILE* fp)
|
||||
{
|
||||
void readline(char *line, FILE *fp) {
|
||||
if(fgets(line, MAXLINE, fp) == NULL) {
|
||||
if(errno != 0) {
|
||||
perror("readline()");
|
||||
@ -105,16 +93,13 @@ void readline(char* line, FILE* fp)
|
||||
}
|
||||
}
|
||||
|
||||
void debug_printf(const char* format, ...)
|
||||
{
|
||||
void debug_printf(const char *format, ...) {
|
||||
#ifdef DEBUG
|
||||
va_list arg;
|
||||
int ret;
|
||||
|
||||
va_start(arg, format);
|
||||
if ((vfprintf(stdout, format, arg)) < 0) {
|
||||
perror("debug_printf()");
|
||||
}
|
||||
if((vfprintf(stdout, format, arg)) < 0) { perror("debug_printf()"); }
|
||||
va_end(arg);
|
||||
#endif
|
||||
}
|
116
config.mk
116
config.mk
@ -1,23 +1,20 @@
|
||||
# Compiler tool chain (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
|
||||
TOOLCHAIN ?= CLANG
|
||||
# Instruction set for intrinsic kernels (NONE/SSE/AVX/AVX_FMA/AVX2/AVX512)
|
||||
ISA ?= ARM
|
||||
SIMD ?= NONE
|
||||
# Optimization scheme (verletlist/clusterpair/clusters_per_bin)
|
||||
OPT_SCHEME ?= verletlist
|
||||
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
|
||||
TAG ?= NVCC
|
||||
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
|
||||
ISA ?= AVX512
|
||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
||||
OPT_SCHEME ?= gromacs
|
||||
# Enable likwid (true or false)
|
||||
ENABLE_LIKWID ?= false
|
||||
ENABLE_LIKWID ?= true
|
||||
# SP or DP
|
||||
DATA_TYPE ?= DP
|
||||
# AOS or SOA
|
||||
DATA_LAYOUT ?= AOS
|
||||
# Assembly syntax to generate (ATT/INTEL)
|
||||
ASM_SYNTAX ?= INTEL
|
||||
ASM_SYNTAX ?= ATT
|
||||
# Debug
|
||||
DEBUG ?= false
|
||||
DEBUG ?= true
|
||||
|
||||
# Sort atoms when reneighboring (true or false)
|
||||
SORT_ATOMS ?= true
|
||||
# Explicitly store and load atom types (true or false)
|
||||
EXPLICIT_TYPES ?= false
|
||||
# Trace memory addresses for cache simulator (true or false)
|
||||
@ -29,7 +26,7 @@ COMPUTE_STATS ?= true
|
||||
|
||||
# Configurations for lammps optimization scheme
|
||||
# Use omp simd pragma when running with half neighbor-lists
|
||||
ENABLE_OMP_SIMD ?= false
|
||||
ENABLE_OMP_SIMD ?= true
|
||||
# Use kernel with explicit SIMD intrinsics
|
||||
USE_SIMD_KERNEL ?= false
|
||||
|
||||
@ -39,102 +36,13 @@ USE_REFERENCE_VERSION ?= false
|
||||
# Enable XTC output
|
||||
XTC_OUTPUT ?= false
|
||||
# Check if cj is local when decreasing reaction force
|
||||
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true
|
||||
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
|
||||
|
||||
# Configurations for CUDA
|
||||
# Use CUDA host memory to optimize transfers
|
||||
USE_CUDA_HOST_MEMORY ?= false
|
||||
USE_SUPER_CLUSTERS ?= true
|
||||
|
||||
#Feature options
|
||||
OPTIONS = -DALIGNMENT=64
|
||||
#OPTIONS += More options
|
||||
|
||||
#DO NOT EDIT BELOW
|
||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
||||
DEFINES += -DAOS
|
||||
endif
|
||||
ifeq ($(strip $(DATA_TYPE)),SP)
|
||||
DEFINES += -DPRECISION=1
|
||||
else
|
||||
DEFINES += -DPRECISION=2
|
||||
endif
|
||||
|
||||
ifneq ($(ASM_SYNTAX), ATT)
|
||||
ASFLAGS += -masm=intel
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(SORT_ATOMS)),true)
|
||||
DEFINES += -DSORT_ATOMS
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(EXPLICIT_TYPES)),true)
|
||||
DEFINES += -DEXPLICIT_TYPES
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(MEM_TRACER)),true)
|
||||
DEFINES += -DMEM_TRACER
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(INDEX_TRACER)),true)
|
||||
DEFINES += -DINDEX_TRACER
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(COMPUTE_STATS)),true)
|
||||
DEFINES += -DCOMPUTE_STATS
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(XTC_OUTPUT)),true)
|
||||
DEFINES += -DXTC_OUTPUT
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
|
||||
DEFINES += -DUSE_REFERENCE_VERSION
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
|
||||
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(DEBUG)),true)
|
||||
DEFINES += -DDEBUG
|
||||
endif
|
||||
|
||||
ifneq ($(VECTOR_WIDTH),)
|
||||
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__SIMD_KERNEL__)),true)
|
||||
DEFINES += -D__SIMD_KERNEL__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__SSE__)),true)
|
||||
DEFINES += -D__ISA_SSE__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX__)),true)
|
||||
DEFINES += -D__ISA_AVX__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
|
||||
DEFINES += -D__ISA_AVX_FMA__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX2__)),true)
|
||||
DEFINES += -D__ISA_AVX2__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(__ISA_AVX512__)),true)
|
||||
DEFINES += -D__ISA_AVX512__
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
|
||||
DEFINES += -DENABLE_OMP_SIMD
|
||||
endif
|
||||
|
||||
ifeq ($(strip $(OPT_SCHEME)),verletlist)
|
||||
OPT_TAG = VL
|
||||
endif
|
||||
|
||||
ifneq ($(strip $(SIMD)),NONE)
|
||||
TOOLCHAIN = $(TOOLCHAIN)-$(ISA)-$(SIMD)
|
||||
endif
|
||||
|
@ -6,7 +6,7 @@ dt 0.001
|
||||
temp 80
|
||||
x_out_freq 500
|
||||
v_out_freq 5
|
||||
cutforce 1.8
|
||||
skin 0.1
|
||||
cutforce 0.9
|
||||
skin 0.0
|
||||
reneigh_every 100
|
||||
nstat 125000
|
||||
|
BIN
figures/features-v3.png
Normal file
BIN
figures/features-v3.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 273 KiB |
BIN
figures/gather_bench.png
Normal file
BIN
figures/gather_bench.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 98 KiB |
523
figures/gather_bench.svg
Normal file
523
figures/gather_bench.svg
Normal file
@ -0,0 +1,523 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
width="297mm"
|
||||
height="210mm"
|
||||
viewBox="0 0 297 210"
|
||||
version="1.1"
|
||||
id="svg5"
|
||||
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
|
||||
sodipodi:docname="gather_bench.svg"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:svg="http://www.w3.org/2000/svg">
|
||||
<sodipodi:namedview
|
||||
id="namedview7"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pagecheckerboard="0"
|
||||
inkscape:document-units="mm"
|
||||
showgrid="false"
|
||||
inkscape:zoom="0.73508842"
|
||||
inkscape:cx="551.63432"
|
||||
inkscape:cy="348.25743"
|
||||
inkscape:window-width="1920"
|
||||
inkscape:window-height="1011"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="165"
|
||||
inkscape:window-maximized="1"
|
||||
inkscape:current-layer="layer1" />
|
||||
<defs
|
||||
id="defs2">
|
||||
<rect
|
||||
x="144.01516"
|
||||
y="304.36604"
|
||||
width="248.99777"
|
||||
height="100.91557"
|
||||
id="rect79475" />
|
||||
<rect
|
||||
x="309.01869"
|
||||
y="43.698615"
|
||||
width="552.19421"
|
||||
height="71.390348"
|
||||
id="rect65238" />
|
||||
<rect
|
||||
x="762.55856"
|
||||
y="341.3838"
|
||||
width="277.62756"
|
||||
height="105.0235"
|
||||
id="rect47632" />
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
id="linearGradient40704">
|
||||
<stop
|
||||
style="stop-color:#ccffaa;stop-opacity:1;"
|
||||
offset="0"
|
||||
id="stop40700" />
|
||||
<stop
|
||||
style="stop-color:#ccffaa;stop-opacity:0;"
|
||||
offset="1"
|
||||
id="stop40702" />
|
||||
</linearGradient>
|
||||
<marker
|
||||
style="overflow:visible;"
|
||||
id="Arrow2Mend"
|
||||
refX="0.0"
|
||||
refY="0.0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(0.6) rotate(180) translate(0,0)"
|
||||
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
|
||||
style="stroke:context-stroke;fill-rule:evenodd;fill:context-stroke;stroke-width:0.62500000;stroke-linejoin:round;"
|
||||
id="path39486" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible;"
|
||||
id="Arrow1Mend"
|
||||
refX="0.0"
|
||||
refY="0.0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow1Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(0.4) rotate(180) translate(10,0)"
|
||||
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
id="path39468" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible;"
|
||||
id="Arrow1Lend"
|
||||
refX="0.0"
|
||||
refY="0.0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow1Lend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(0.8) rotate(180) translate(12.5,0)"
|
||||
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
|
||||
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
|
||||
id="path39462" />
|
||||
</marker>
|
||||
<rect
|
||||
x="707.09731"
|
||||
y="616.36746"
|
||||
width="407.71288"
|
||||
height="417.08306"
|
||||
id="rect24254" />
|
||||
<rect
|
||||
x="47.404365"
|
||||
y="100.3268"
|
||||
width="398.49855"
|
||||
height="110.16514"
|
||||
id="rect5050" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5-6" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5-6-1" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0-6" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0-6-2" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-0-6-2-8" />
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2Mend-2"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(-0.6)"
|
||||
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
|
||||
id="path39486-3" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2Mend-2-5"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(-0.6)"
|
||||
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
|
||||
id="path39486-3-9" />
|
||||
</marker>
|
||||
<marker
|
||||
style="overflow:visible"
|
||||
id="Arrow2Mend-2-5-2"
|
||||
refX="0"
|
||||
refY="0"
|
||||
orient="auto"
|
||||
inkscape:stockid="Arrow2Mend"
|
||||
inkscape:isstock="true">
|
||||
<path
|
||||
transform="scale(-0.6)"
|
||||
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
|
||||
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
|
||||
id="path39486-3-9-8" />
|
||||
</marker>
|
||||
<linearGradient
|
||||
inkscape:collect="always"
|
||||
xlink:href="#linearGradient40704"
|
||||
id="linearGradient40706"
|
||||
x1="324.58157"
|
||||
y1="127.35331"
|
||||
x2="363.61096"
|
||||
y2="98.957848"
|
||||
gradientUnits="userSpaceOnUse" />
|
||||
<rect
|
||||
x="47.404366"
|
||||
y="100.3268"
|
||||
width="398.49854"
|
||||
height="110.16514"
|
||||
id="rect5050-3-5-6-1-7" />
|
||||
<rect
|
||||
x="309.01868"
|
||||
y="43.698616"
|
||||
width="552.19421"
|
||||
height="71.39035"
|
||||
id="rect65238-1" />
|
||||
</defs>
|
||||
<g
|
||||
inkscape:label="Layer 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1">
|
||||
<rect
|
||||
style="fill:#d5d5ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
|
||||
id="rect55834"
|
||||
width="250.31726"
|
||||
height="74.676537"
|
||||
x="25.257824"
|
||||
y="97.277718" />
|
||||
<rect
|
||||
style="fill:#d5f6ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
|
||||
id="rect55832"
|
||||
width="250.35208"
|
||||
height="64.461151"
|
||||
x="25.256891"
|
||||
y="32.817505" />
|
||||
<rect
|
||||
style="fill:#ccffaa;stroke:#091600;stroke-width:1.31891"
|
||||
id="rect6462"
|
||||
width="82.385742"
|
||||
height="20.525751"
|
||||
x="28.355024"
|
||||
y="48.740646" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,17.244577,26.206534)"
|
||||
id="text5048"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82948"><tspan
|
||||
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
|
||||
id="tspan82946">gather-bench</tspan></tspan></text>
|
||||
<rect
|
||||
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9"
|
||||
width="18.764017"
|
||||
height="20.965076"
|
||||
x="39.518955"
|
||||
y="140.726" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.33667319,0,0,0.33667319,25.589293,109.42998)"
|
||||
id="text5048-3"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82950">L1</tspan></text>
|
||||
<rect
|
||||
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9-0"
|
||||
width="21.653919"
|
||||
height="24.193966"
|
||||
x="97.687294"
|
||||
y="138.51564" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.3885252,0,0,0.3885252,81.212654,102.39964)"
|
||||
id="text5048-3-6"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82952">L2</tspan></text>
|
||||
<rect
|
||||
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9-0-6"
|
||||
width="27.217058"
|
||||
height="30.409672"
|
||||
x="149.19933"
|
||||
y="134.83977" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.48834178,0,0,0.48834178,128.49215,89.445174)"
|
||||
id="text5048-3-6-1"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82954">L3</tspan></text>
|
||||
<rect
|
||||
style="fill:#eeaaff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
|
||||
id="rect6462-9-0-6-7"
|
||||
width="61.032539"
|
||||
height="29.96501"
|
||||
x="204.01265"
|
||||
y="135.61238" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.48834178,0,0,0.48834178,182.37007,89.995434)"
|
||||
id="text5048-3-6-1-9"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2-8);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82956">DRAM</tspan></text>
|
||||
<rect
|
||||
style="fill:#ffccaa;stroke:#091600;stroke-width:1.10636"
|
||||
id="rect6462-6"
|
||||
width="74.980759"
|
||||
height="15.869514"
|
||||
x="126.09525"
|
||||
y="38.773243" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,115.65481,14.295323)"
|
||||
id="text5048-7"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82958">Single gather</tspan></text>
|
||||
<rect
|
||||
style="fill:#ffccaa;stroke:#091600;stroke-width:1.03971"
|
||||
id="rect6462-6-3"
|
||||
width="66.071701"
|
||||
height="15.904838"
|
||||
x="126.90776"
|
||||
y="63.642746" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,116.63325,39.114393)"
|
||||
id="text5048-7-5"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82960">MD gathers</tspan></text>
|
||||
<rect
|
||||
style="fill:#afe9dd;stroke:#091600;stroke-width:1.02848"
|
||||
id="rect6462-6-3-2"
|
||||
width="64.479698"
|
||||
height="15.947394"
|
||||
x="206.65364"
|
||||
y="52.98967" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,196.01512,28.482594)"
|
||||
id="text5048-7-5-9"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82962">Contiguous</tspan></text>
|
||||
<rect
|
||||
style="fill:#afe9dd;stroke:#091600;stroke-width:0.987323"
|
||||
id="rect6462-6-3-2-2"
|
||||
width="59.269382"
|
||||
height="15.988551"
|
||||
x="208.16559"
|
||||
y="76.856781" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,197.58604,52.220445)"
|
||||
id="text5048-7-5-9-7"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="47.404297"
|
||||
y="135.7168"
|
||||
id="tspan82964">"Random"</tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="scale(0.26458333)"
|
||||
id="text24252"
|
||||
style="fill:black;fill-opacity:1;stroke:none;font-family:sans-serif;font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect24254)" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="M 193.10512,71.273276 206.30683,61.033513"
|
||||
id="path39049"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="M 193.08841,71.196939 207.86207,84.43804"
|
||||
id="path39053"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.39816;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 58.548229,151.24436 38.298093,0.25023"
|
||||
id="path39219"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.24847;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 119.19252,150.09399 29.28333,0.26095"
|
||||
id="path39219-2"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 177.02022,150.44367 26.36623,0.26095"
|
||||
id="path39219-2-0"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend)"
|
||||
d="m 48.145458,92.71788 -0.644819,47.57709"
|
||||
id="path39377"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2)"
|
||||
d="M 48.121208,92.873762 106.60807,137.41946"
|
||||
id="path39377-7"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5)"
|
||||
d="M 48.073928,92.825143 158.88023,133.04546"
|
||||
id="path39377-7-2"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5-2)"
|
||||
d="M 48.051946,92.813593 233.0959,134.16596"
|
||||
id="path39377-7-2-9"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<rect
|
||||
style="fill:#e9afaf;stroke:#091600;stroke-width:1.34518"
|
||||
id="rect6462-6-3-2-2-3"
|
||||
width="65.880661"
|
||||
height="26.700579"
|
||||
x="38.104012"
|
||||
y="80.530182" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
|
||||
d="m 77.365612,69.678744 h 2e-6"
|
||||
id="path39808"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 111.64767,59.183009 6.84466,0.03069"
|
||||
id="path41004"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 119.03378,47.056357 -0.58704,25.198541"
|
||||
id="path41006"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.02423;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 118.07503,72.254897 7.94998,-0.05784"
|
||||
id="path41008"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.882836;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
|
||||
d="m 118.26666,47.054814 7.69322,0.173925"
|
||||
id="path41112"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<path
|
||||
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
|
||||
d="M 68.213642,69.068864 67.910274,80.302728"
|
||||
id="path55728"
|
||||
inkscape:connector-type="polyline"
|
||||
inkscape:connector-curvature="0" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,-1.3782637,4.0412367)"
|
||||
id="text65236"
|
||||
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="309.01953"
|
||||
y="90.886691"
|
||||
id="tspan82968"><tspan
|
||||
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
|
||||
id="tspan82966">Application Level</tspan></tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,2.7015103,160.71919)"
|
||||
id="text65236-2"
|
||||
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="309.01953"
|
||||
y="90.886691"
|
||||
id="tspan82972"><tspan
|
||||
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
|
||||
id="tspan82970">Hardware Level</tspan></tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
transform="matrix(0.26458333,0,0,0.26458333,2.3490396,0.57331532)"
|
||||
id="text79473"
|
||||
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect79475);fill:#000000;fill-opacity:1;stroke:none"><tspan
|
||||
x="144.01562"
|
||||
y="339.75586"
|
||||
id="tspan82974">vgather </tspan><tspan
|
||||
x="144.01562"
|
||||
y="389.75586"
|
||||
id="tspan82976">instructions</tspan></text>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 21 KiB |
BIN
figures/gromacs_mxn_v2.pdf
Normal file
BIN
figures/gromacs_mxn_v2.pdf
Normal file
Binary file not shown.
BIN
figures/gromacs_mxn_v2.png
Normal file
BIN
figures/gromacs_mxn_v2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 128 KiB |
BIN
figures/stub_new_v3.pdf
Normal file
BIN
figures/stub_new_v3.pdf
Normal file
Binary file not shown.
BIN
figures/stub_new_v3.png
Normal file
BIN
figures/stub_new_v3.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 52 KiB |
BIN
figures/verlet_v2.pdf
Normal file
BIN
figures/verlet_v2.pdf
Normal file
Binary file not shown.
BIN
figures/verlet_v2.png
Normal file
BIN
figures/verlet_v2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 62 KiB |
1
gather-bench
Submodule
1
gather-bench
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 2f654cb043359197be07e0fa362324dab8899a33
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -37,7 +37,24 @@ void initAtom(Atom *atom) {
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
initMasks(atom);
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
atom->scl_x = NULL;
|
||||
atom->scl_v = NULL;
|
||||
atom->scl_f = NULL;
|
||||
|
||||
atom->Nsclusters = 0;
|
||||
atom->Nsclusters_local = 0;
|
||||
atom->Nsclusters_ghost = 0;
|
||||
atom->Nsclusters_max = 0;
|
||||
|
||||
atom->scl_type = NULL;
|
||||
|
||||
atom->siclusters = NULL;
|
||||
atom->icluster_idx = NULL;
|
||||
|
||||
atom->sicluster_bin = NULL;
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
@ -51,7 +68,6 @@ void createAtom(Atom *atom, Parameter *param) {
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
@ -394,113 +410,6 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
|
||||
return natoms;
|
||||
}
|
||||
|
||||
/*
 * Allocate and fill the per-atom lookup tables used for masking cluster pairs.
 *
 * Tables written on `atom`:
 *   - exclusion_filter:         entry i holds the single bit (1U << i).
 *   - diagonal_4xn_j_minus_i:   entry j holds j - 0.5.
 *   - diagonal_2xnn_j_minus_i:  first half holds j - 0.5, second half (j - 1) - 0.5.
 *   - masks_2xnn_hn / masks_2xnn_fn and masks_4xn_hn / masks_4xn_fn:
 *     bit masks indexed by the conditions cond0 (and cond1 when
 *     CLUSTER_M != CLUSTER_N). The 2xnn tables pack two masks into one entry,
 *     the second shifted left by half of VECTOR_WIDTH.
 *     (presumably "hn"/"fn" stand for half/full neighbor lists -- TODO confirm)
 *
 * The three pointer tables are obtained from allocate(); the mask tables are
 * written in place and are not allocated here.
 */
void initMasks(Atom *atom) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int mask0, mask1, mask2, mask3;

    // NOTE(review): the diagonal tables are sized with sizeof(MD_UINT) but are
    // filled with MD_FLOAT values below; confirm both types have the same size.
    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));

    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
    }

    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
    }

    // NOTE(review): 1U << i is undefined once i reaches the bit width of
    // unsigned int; confirm CLUSTER_M * VECTOR_WIDTH never exceeds it.
    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
        atom->exclusion_filter[i] = (1U << i);
    }

#if CLUSTER_M == CLUSTER_N
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x3 * cond0);
        mask2 = (unsigned int)(0xf - 0x7 * cond0);
        mask3 = (unsigned int)(0xf - 0xf * cond0);
        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x2 * cond0);
        mask2 = (unsigned int)(0xf - 0x4 * cond0);
        mask3 = (unsigned int)(0xf - 0x8 * cond0);
        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);

        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
    }
#else
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
#if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif

            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

#if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif

            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

#if CLUSTER_M < CLUSTER_N
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);

            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);

            // Each of the four values goes to its own slot (+0..+3). Writing all
            // of them to slot +0 would leave +1..+3 uninitialised and keep only
            // the last value in +0.
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
#endif
        }
    }
#endif
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
int nold = atom->Nmax;
|
||||
atom->Nmax += DELTA;
|
||||
@ -530,3 +439,18 @@ void growClusters(Atom *atom) {
|
||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
/*
 * Enlarge the super-cluster storage of `atom` by DELTA entries.
 *
 * Nsclusters_max is increased by DELTA and every super-cluster array
 * (siclusters, icluster_idx, sicluster_bin, scl_x, scl_f, scl_v) is passed
 * through reallocate() with its new and its previous byte size.
 * scl_type is not resized here.
 */
void growSuperClusters(Atom *atom) {
    const int prev_count = atom->Nsclusters_max;
    atom->Nsclusters_max += DELTA;

    /* Byte sizes after / before growing, one pair per array layout. */
    const size_t sicl_bytes_new = atom->Nsclusters_max * sizeof(SuperCluster);
    const size_t sicl_bytes_old = prev_count * sizeof(SuperCluster);

    const size_t idx_bytes_new = atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int);
    const size_t idx_bytes_old = prev_count * SCLUSTER_SIZE * sizeof(int);

    const size_t bin_bytes_new = atom->Nsclusters_max * sizeof(int);
    const size_t bin_bytes_old = prev_count * sizeof(int);

    /* scl_x, scl_f and scl_v share one layout: SCLUSTER_M * 3 values per super-cluster. */
    const size_t vec_bytes_new = atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT);
    const size_t vec_bytes_old = prev_count * SCLUSTER_M * 3 * sizeof(MD_FLOAT);

    atom->siclusters    = (SuperCluster*) reallocate(atom->siclusters, ALIGNMENT, sicl_bytes_new, sicl_bytes_old);
    atom->icluster_idx  = (int*) reallocate(atom->icluster_idx, ALIGNMENT, idx_bytes_new, idx_bytes_old);
    atom->sicluster_bin = (int*) reallocate(atom->sicluster_bin, ALIGNMENT, bin_bytes_new, bin_bytes_old);

    atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, vec_bytes_new, vec_bytes_old);
    atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, vec_bytes_new, vec_bytes_old);
    atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, vec_bytes_new, vec_bytes_old);
}
#endif //USE_SUPER_CLUSTERS
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -39,8 +39,29 @@ extern "C" {
|
||||
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
||||
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
||||
int isReneighboured;
|
||||
|
||||
int *cuda_iclusters;
|
||||
int *cuda_nclusters;
|
||||
|
||||
int cuda_max_scl;
|
||||
MD_FLOAT *cuda_scl_x;
|
||||
MD_FLOAT *cuda_scl_v;
|
||||
MD_FLOAT *cuda_scl_f;
|
||||
|
||||
extern void alignDataToSuperclusters(Atom *atom);
|
||||
extern void alignDataFromSuperclusters(Atom *atom);
|
||||
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
||||
}
|
||||
|
||||
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_nclusters,
|
||||
int *cuda_natoms,
|
||||
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
|
||||
|
||||
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_nclusters, int *cuda_natoms,
|
||||
int Nsclusters_local, MD_FLOAT dtforce);
|
||||
|
||||
extern "C"
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
|
||||
@ -59,10 +80,23 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
|
||||
isReneighboured = 1;
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
cuda_max_scl = atom->Nsclusters_max;
|
||||
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
|
||||
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
|
||||
|
||||
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataToCUDADevice(Atom *atom) {
|
||||
DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
|
||||
|
||||
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
@ -85,13 +119,49 @@ void copyDataToCUDADevice(Atom *atom) {
|
||||
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
    //alignDataToSuperclusters(atom);

    // The device-side super-cluster buffers were sized for cuda_max_scl
    // entries; regrow them when the host capacity has increased since then.
    if (cuda_max_scl < atom->Nsclusters_max) {
        // Release every buffer that is re-allocated below. The index and count
        // arrays must be freed as well, otherwise their old device allocations
        // are leaked on each regrow.
        cuda_assert("cudaDeviceFree", cudaFree(cuda_iclusters));
        cuda_assert("cudaDeviceFree", cudaFree(cuda_nclusters));
        cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
        cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
        cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
        cuda_max_scl = atom->Nsclusters_max;

        // NOTE(review): the fresh index/count buffers are not filled here;
        // confirm the caller uploads them again before the next kernel launch.
        cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
        cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));

        cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
        cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
        cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    }

    // Upload the super-cluster position, velocity and force arrays.
    memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
|
||||
|
||||
}
|
||||
|
||||
// Transfers the cluster position, velocity and force arrays (and, when
// USE_SUPER_CLUSTERS is defined, their super-cluster counterparts) between the
// cuda_* buffers and the arrays in `atom` via memcpyFromGPU — presumably a
// device-to-host copy with (dst, src, bytes) ordering; confirm against its
// definition.
extern "C"
void copyDataFromCUDADevice(Atom *atom) {
    DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");

    const size_t cluster_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
    memcpyFromGPU(atom->cl_x, cuda_cl_x, cluster_bytes);
    memcpyFromGPU(atom->cl_v, cuda_cl_v, cluster_bytes);
    memcpyFromGPU(atom->cl_f, cuda_cl_f, cluster_bytes);

#ifdef USE_SUPER_CLUSTERS
    const size_t scluster_bytes = atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT);
    memcpyFromGPU(atom->scl_x, cuda_scl_x, scluster_bytes);
    memcpyFromGPU(atom->scl_v, cuda_scl_v, scluster_bytes);
    memcpyFromGPU(atom->scl_f, cuda_scl_f, scluster_bytes);

    //alignDataFromSuperclusters(atom);
#endif //USE_SUPER_CLUSTERS

    DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
}
|
||||
|
||||
extern "C"
|
||||
@ -109,6 +179,12 @@ void cudaDeviceFree() {
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
|
||||
free(natoms);
|
||||
free(ngatoms);
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
    // Release all device buffers allocated for super clusters, including the
    // cluster index and cluster count arrays, which were previously never freed.
    cuda_assert("cudaDeviceFree", cudaFree(cuda_iclusters));
    cuda_assert("cudaDeviceFree", cudaFree(cuda_nclusters));
    cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
    cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
    cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
#endif //USE_SUPER_CLUSTERS
|
||||
}
|
||||
|
||||
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
@ -165,6 +241,39 @@ __global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
|
||||
}
|
||||
}
|
||||
|
||||
// Kernel: each thread handles one ghost cluster `g` (threads with
// g >= Nclusters_ghost exit). The ghost's coordinates are rebuilt from the
// cluster referenced by cuda_border_map[g], shifted by the per-ghost PBC
// multipliers times the box extents.
__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
                                      int *cuda_jclusters_natoms,
                                      int *cuda_PBCx,
                                      int *cuda_PBCy,
                                      int *cuda_PBCz,
                                      int Nsclusters_local,
                                      int Nclusters_ghost,
                                      MD_FLOAT param_xprd,
                                      MD_FLOAT param_yprd,
                                      MD_FLOAT param_zprd) {
    const unsigned int g = blockDim.x * blockIdx.x + threadIdx.x;
    if (g >= Nclusters_ghost) {
        return;
    }

    // Ghost clusters are addressed after the first (Nsclusters_local / jfac) clusters.
    const int jfac = SCLUSTER_SIZE / CLUSTER_M;
    const int first_ghost = Nsclusters_local / jfac;
    const int cj = first_ghost + g;

    const int dst_base = CJ_VECTOR_BASE_INDEX(cj);
    const int src_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[g]);
    MD_FLOAT *dst = &cuda_cl_x[dst_base];
    MD_FLOAT *src = &cuda_cl_x[src_base];

    // Per-axis displacement applied to every atom of this ghost cluster.
    const MD_FLOAT shift_x = cuda_PBCx[g] * param_xprd;
    const MD_FLOAT shift_y = cuda_PBCy[g] * param_yprd;
    const MD_FLOAT shift_z = cuda_PBCz[g] * param_zprd;

    const int n_atoms = cuda_jclusters_natoms[g];
    for (int a = 0; a < n_atoms; a++) {
        dst[CL_X_OFFSET + a] = src[CL_X_OFFSET + a] + shift_x;
        dst[CL_Y_OFFSET + a] = src[CL_Y_OFFSET + a] + shift_y;
        dst[CL_Z_OFFSET + a] = src[CL_Z_OFFSET + a] + shift_z;
    }
}
|
||||
|
||||
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
||||
int Nclusters_local, int Nclusters_max,
|
||||
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
||||
@ -251,9 +360,17 @@ extern "C"
|
||||
// Host launcher for the initial integration kernel. With USE_SUPER_CLUSTERS
// the super-cluster kernel runs on the cuda_scl_* buffers, otherwise the
// plain cluster kernel runs on cuda_cl_*. Blocks until the kernel finished.
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
    const int threads_num = 16;

#ifdef USE_SUPER_CLUSTERS
    const int work_items = atom->Nsclusters_local;
#else
    const int work_items = atom->Nclusters_local;
#endif //USE_SUPER_CLUSTERS

    // One extra block covers the remainder; the kernels bounds-check their index.
    dim3 block_size = dim3(threads_num, 1, 1);
    dim3 grid_size = dim3(work_items / (threads_num) + 1, 1, 1);

#ifdef USE_SUPER_CLUSTERS
    cudaInitialIntegrateSup_warp<<<grid_size, block_size>>>(
        cuda_scl_x, cuda_scl_v, cuda_scl_f,
        cuda_nclusters, cuda_natoms,
        atom->Nsclusters_local, param->dtforce, param->dt);
#else
    cudaInitialIntegrate_warp<<<grid_size, block_size>>>(
        cuda_cl_x, cuda_cl_v, cuda_cl_f,
        cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
#endif //USE_SUPER_CLUSTERS

    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
}
|
||||
@ -264,11 +381,19 @@ extern "C"
|
||||
// Host launcher for the ghost-cluster PBC update. Launches one thread per
// ghost cluster (rounded up by one block; the kernels bounds-check) and
// blocks until the kernel has finished.
void cudaUpdatePbc(Atom *atom, Parameter *param) {
    const int threads_num = 512;
    dim3 block_size = dim3(threads_num, 1, 1);
    // Declared exactly once: a second declaration in the same scope is a
    // redefinition error.
    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);

#ifdef USE_SUPER_CLUSTERS
    // NOTE(review): the kernel parameter receiving atom->Nclusters_local is
    // named Nsclusters_local — confirm which count is intended here.
    cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
                                                     cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
                                                     atom->Nclusters_local, atom->Nclusters_ghost,
                                                     param->xprd, param->yprd, param->zprd);
#else
    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
                                                  cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
                                                  atom->Nclusters_local, atom->Nclusters_ghost,
                                                  param->xprd, param->yprd, param->zprd);
#endif //USE_SUPER_CLUSTERS

    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
}
|
||||
@ -310,8 +435,17 @@ extern "C"
|
||||
// Host launcher for the final integration step (velocity update from forces).
// With USE_SUPER_CLUSTERS the super-cluster kernel runs on cuda_scl_*,
// otherwise the plain cluster kernel runs on cuda_cl_*. Blocks until done.
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
    const int threads_num = 16;
    dim3 block_size = dim3(threads_num, 1, 1);

    // Both kernels take the force time step (their last parameter is named
    // dtforce), matching what cudaInitialIntegrate passes for the same
    // velocity update; passing param->dt here would scale the kick differently.
#ifdef USE_SUPER_CLUSTERS
    dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
    cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
                                                          cuda_nclusters, cuda_natoms,
                                                          atom->Nsclusters_local, param->dtforce);
#else
    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
    // Launched exactly once; a second launch would apply the velocity update twice.
    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
                                                       atom->Nclusters_local, param->dtforce);
#endif //USE_SUPER_CLUSTERS

    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
}
|
288
gromacs/cuda/force_lj_sup.cu
Normal file
288
gromacs/cuda/force_lj_sup.cu
Normal file
@ -0,0 +1,288 @@
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <stdio.h>
|
||||
//---
|
||||
#include <cuda.h>
|
||||
#include <driver_types.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
extern MD_FLOAT *cuda_cl_x;
|
||||
extern MD_FLOAT *cuda_cl_v;
|
||||
extern MD_FLOAT *cuda_cl_f;
|
||||
extern int *cuda_neighbors;
|
||||
extern int *cuda_numneigh;
|
||||
extern int *cuda_natoms;
|
||||
extern int *natoms;
|
||||
extern int *ngatoms;
|
||||
extern int *cuda_border_map;
|
||||
extern int *cuda_jclusters_natoms;
|
||||
extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
|
||||
extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
|
||||
extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
||||
extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
||||
extern int isReneighboured;
|
||||
|
||||
extern int *cuda_iclusters;
|
||||
extern int *cuda_nclusters;
|
||||
|
||||
extern MD_FLOAT *cuda_scl_x;
|
||||
extern MD_FLOAT *cuda_scl_v;
|
||||
extern MD_FLOAT *cuda_scl_f;
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
// Copies the three CLUSTER_M-wide component rows of one cluster vector into
// its slot of a super-cluster vector, where consecutive components are
// SCLUSTER_SIZE * CLUSTER_M elements apart.
static void packClusterIntoSupercluster(MD_FLOAT *scl_slot, const MD_FLOAT *cl_vec) {
    for (int comp = 0; comp < 3; comp++) {
        memcpy(&scl_slot[comp * SCLUSTER_SIZE * CLUSTER_M],
               &cl_vec[comp * CLUSTER_M],
               CLUSTER_M * sizeof(MD_FLOAT));
    }
}

// Gathers positions, velocities and forces of every cluster that belongs to a
// local super cluster from the cl_* arrays into the scl_* arrays. The cluster
// for slot `ci` of super cluster `sci` is atom->icluster_idx[SCLUSTER_SIZE * sci + ci].
extern "C"
void alignDataToSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const int sc_base = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
        const int n_members = atom->siclusters[sci].nclusters;

        for (int ci = 0; ci < n_members; ci++) {
            const int slot = sc_base + ci * CLUSTER_M;
            const int cluster = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
            const int cl_base = CI_VECTOR_BASE_INDEX(cluster);

            packClusterIntoSupercluster(&atom->scl_x[slot], &atom->cl_x[cl_base]);
            packClusterIntoSupercluster(&atom->scl_v[slot], &atom->cl_v[cl_base]);
            packClusterIntoSupercluster(&atom->scl_f[slot], &atom->cl_f[cl_base]);
        }
    }
}
|
||||
|
||||
// Copies the three component rows of one super-cluster slot back into a
// cluster vector; consecutive components are SCLUSTER_SIZE * CLUSTER_M
// elements apart in the source and CLUSTER_M elements apart in the target.
static void unpackClusterFromSupercluster(MD_FLOAT *cl_vec, const MD_FLOAT *scl_slot) {
    for (int comp = 0; comp < 3; comp++) {
        memcpy(&cl_vec[comp * CLUSTER_M],
               &scl_slot[comp * SCLUSTER_SIZE * CLUSTER_M],
               CLUSTER_M * sizeof(MD_FLOAT));
    }
}

// Scatters positions, velocities and forces of every local super cluster from
// the scl_* arrays back into the cl_* arrays. The cluster for slot `ci` of
// super cluster `sci` is atom->icluster_idx[SCLUSTER_SIZE * sci + ci].
extern "C"
void alignDataFromSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const int sc_base = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
        const int n_members = atom->siclusters[sci].nclusters;

        for (int ci = 0; ci < n_members; ci++) {
            const int slot = sc_base + ci * CLUSTER_M;
            const int cluster = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
            const int cl_base = CI_VECTOR_BASE_INDEX(cluster);

            unpackClusterFromSupercluster(&atom->cl_x[cl_base], &atom->scl_x[slot]);
            unpackClusterFromSupercluster(&atom->cl_v[cl_base], &atom->scl_v[slot]);
            unpackClusterFromSupercluster(&atom->cl_f[cl_base], &atom->scl_f[slot]);
        }
    }
}
|
||||
|
||||
// Kernel: one thread per local super cluster. For each of the SCLUSTER_M
// slots the velocity is advanced by dtforce * force, then the position by
// dt * (updated) velocity. cuda_nclusters and cuda_natoms are currently
// unused, so every slot of the super cluster is processed.
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                             int *cuda_nclusters,
                                             int *cuda_natoms,
                                             int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
    const unsigned int sci = blockDim.x * blockIdx.x + threadIdx.x;
    if (sci >= Nsclusters_local) {
        return;
    }

    const int base = SCI_VECTOR_BASE_INDEX(sci);
    MD_FLOAT *pos = &cuda_cl_x[base];
    MD_FLOAT *vel = &cuda_cl_v[base];
    MD_FLOAT *frc = &cuda_cl_f[base];

    for (int slot = 0; slot < SCLUSTER_M; slot++) {
        const int ix = SCL_X_OFFSET + slot;
        const int iy = SCL_Y_OFFSET + slot;
        const int iz = SCL_Z_OFFSET + slot;

        // Velocity kick first, so the position drift uses the new velocity.
        vel[ix] += dtforce * frc[ix];
        vel[iy] += dtforce * frc[iy];
        vel[iz] += dtforce * frc[iz];

        pos[ix] += dt * vel[ix];
        pos[iy] += dt * vel[iy];
        pos[iz] += dt * vel[iz];
    }
}
|
||||
|
||||
// Kernel: one thread per local super cluster. Adds dtforce * force to the
// velocity of each of the SCLUSTER_M slots. cuda_nclusters and cuda_natoms
// are currently unused, so every slot of the super cluster is processed.
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                           int *cuda_nclusters, int *cuda_natoms,
                                           int Nsclusters_local, MD_FLOAT dtforce) {
    const unsigned int sci = blockDim.x * blockIdx.x + threadIdx.x;
    if (sci >= Nsclusters_local) {
        return;
    }

    const int base = SCI_VECTOR_BASE_INDEX(sci);
    MD_FLOAT *vel = &cuda_cl_v[base];
    MD_FLOAT *frc = &cuda_cl_f[base];

    for (int slot = 0; slot < SCLUSTER_M; slot++) {
        const int ix = SCL_X_OFFSET + slot;
        const int iy = SCL_Y_OFFSET + slot;
        const int iz = SCL_Z_OFFSET + slot;

        vel[ix] += dtforce * frc[ix];
        vel[iy] += dtforce * frc[iy];
        vel[iz] += dtforce * frc[iz];
    }
}
|
||||
|
||||
// Kernel: Lennard-Jones forces on the super-cluster layout.
//
// Thread mapping: x -> super cluster, y -> atom slot inside the super cluster
// (cluster = slot / CLUSTER_M, lane = slot % CLUSTER_M), z -> atom lane inside
// the neighbouring j cluster. Each thread walks the neighbour list of its i
// cluster and accumulates pair forces with atomicAdd, because all z-threads
// of a slot update the same i-atom force entry.
//
// cuda_iclusters maps (super cluster, slot) to the cluster index used for the
// neighbour lists; cuda_nclusters holds the number of clusters per super cluster.
// With half_neigh != 0 the opposite force is also added to the j atom.
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
                                            int *cuda_nclusters, int *cuda_iclusters,
                                            int Nsclusters_local,
                                            int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
                                            MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
    if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;

    unsigned int ci_pos = scii_pos / CLUSTER_M;
    unsigned int cii_pos = scii_pos % CLUSTER_M;

    // Slots beyond the populated clusters of this super cluster carry no atoms.
    if (ci_pos >= cuda_nclusters[sci_pos]) return;

    // Cluster index of this thread's i cluster (indexes the neighbour lists).
    const int ci_glob = cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos];

    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    const int numneighs = cuda_numneigh[ci_glob];
    const int *neighs = &cuda_neighs[ci_glob * maxneighs];

    // The i-atom position does not change while forces are computed.
    const MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
    const MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
    const MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];

    for(int k = 0; k < numneighs; k++) {
        int glob_j = neighs[k];
        int scj = glob_j / SCLUSTER_SIZE;
        // TODO Make cj accessible from super cluster data alignment (not reachable right now)
        int cj_vec_base = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);
        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];

        // Skip only the self pair: same cluster AND same lane. The previous
        // '&&' between the two inequalities also discarded every pair that
        // merely shared the lane index or the cluster.
        int cond = (glob_j != ci_glob) || (cii_pos != cjj_pos);
        if(!cond) continue;

        // NOTE(review): the j offsets below reuse ci_pos although cj_vec_base
        // already contains a per-cluster offset — confirm against the
        // SCL_CL_*_OFFSET definitions (see the TODO above).
        MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
        MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
        MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

        if(rsq < cutforcesq) {
            MD_FLOAT sr2 = 1.0 / rsq;
            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;

            if(half_neigh) {
                atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
                atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
                atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
            }

            atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], delx * force);
            atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], dely * force);
            atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], delz * force);
        }
    }
}
|
||||
|
||||
// Host driver for the super-cluster Lennard-Jones force kernel.
//
// Clears the device force buffers, uploads neighbour lists and super-cluster
// metadata when isReneighboured is set, launches the kernel and waits for it.
// Returns the elapsed time between the two getTimeStamp() calls around the
// kernel launch and synchronisation. `stats` is not used here.
extern "C"
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");

    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    // The kernel accumulates into cuda_scl_f with atomicAdd, so that buffer
    // must start from zero as well; otherwise forces pile up across calls.
    memsetGPU(cuda_scl_f, 0, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));

    if (isReneighboured) {
        // Neighbour counts and lists, one cluster at a time (only the used
        // part of each list row is transferred).
        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
        }

        // Number of clusters in each local super cluster.
        for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
            memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
        }

        // (super cluster, slot) -> cluster index table.
        memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));

        isReneighboured = 0;
    }

    // One super cluster per block in x; y spans its atom slots and z the
    // atoms of a j cluster. The extra block is rejected by the kernel's bounds check.
    const int threads_num = 1;
    dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
    dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);

    double S = getTimeStamp();
    LIKWID_MARKER_START("force");
    computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
                                                           cuda_nclusters, cuda_iclusters,
                                                           atom->Nsclusters_local,
                                                           cuda_numneigh, cuda_neighbors,
                                                           neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
                                                           sigma6, epsilon);
    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();

    DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
    return E-S;
}
|
||||
#endif //USE_SUPER_CLUSTERS
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -16,32 +16,6 @@
|
||||
#include <simd.h>
|
||||
|
||||
|
||||
/*
|
||||
static inline void gmx_load_simd_2xnn_interactions(
|
||||
int excl,
|
||||
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
|
||||
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
|
||||
|
||||
//SimdInt32 mask_pr_S(excl);
|
||||
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||
}
|
||||
|
||||
static inline void gmx_load_simd_4xn_interactions(
|
||||
int excl,
|
||||
MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
|
||||
MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
|
||||
|
||||
//SimdInt32 mask_pr_S(excl);
|
||||
MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
|
||||
*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
|
||||
*interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
|
||||
*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
|
||||
*interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
|
||||
}
|
||||
*/
|
||||
|
||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||
int Nlocal = atom->Nlocal;
|
||||
@ -61,12 +35,9 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
#pragma omp parallel for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
@ -148,8 +119,6 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ end\n");
|
||||
return E-S;
|
||||
@ -167,6 +136,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@ -179,41 +149,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
/*
|
||||
MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||
MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
|
||||
|
||||
MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i);
|
||||
MD_SIMD_FLOAT zero_S = simd_broadcast(0.0);
|
||||
MD_SIMD_FLOAT one_S = simd_broadcast(1.0);
|
||||
|
||||
#if CLUSTER_M <= CLUSTER_N
|
||||
MD_SIMD_MASK diagonal_mask0, diagonal_mask2;
|
||||
diagonal_mask0 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask2 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
#else
|
||||
MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12;
|
||||
diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_jmi_S = diagonal_jmi_S - one_S;
|
||||
diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
|
||||
#endif
|
||||
*/
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
#pragma omp parallel for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@ -224,7 +162,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||
@ -239,138 +176,76 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
//int imask = neighs_imask[k];
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
//MD_SIMD_MASK interact0;
|
||||
//MD_SIMD_MASK interact2;
|
||||
|
||||
//gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2);
|
||||
unsigned int mask0, mask1, mask2, mask3;
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 1]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x7 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0xf * cond0);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1]);
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
|
||||
cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||
|
||||
/*
|
||||
#if CLUSTER_M <= CLUSTER_N
|
||||
if(ci == ci_cj0) {
|
||||
cutoff_mask0 = simd_mask_and(cutoff_mask0, diagonal_mask0);
|
||||
cutoff_mask2 = simd_mask_and(cutoff_mask2, diagonal_mask2);
|
||||
}
|
||||
#else
|
||||
if(ci == ci_cj0) {
|
||||
cutoff_mask0 = cutoff_mask0 && diagonal_mask00;
|
||||
cutoff_mask2 = cutoff_mask2 && diagonal_mask02;
|
||||
} else if(ci == ci_cj1) {
|
||||
cutoff_mask0 = cutoff_mask0 && diagonal_mask10;
|
||||
cutoff_mask2 = cutoff_mask2 && diagonal_mask12;
|
||||
}
|
||||
#endif
|
||||
*/
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
|
||||
fix0 += tx0;
|
||||
fiy0 += ty0;
|
||||
fiz0 += tz0;
|
||||
fix2 += tx2;
|
||||
fiy2 += ty2;
|
||||
fiz2 += tz2;
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
}
|
||||
#else
|
||||
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
|
||||
#endif
|
||||
}
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
|
||||
fix0 += tx0;
|
||||
fiy0 += ty0;
|
||||
fiz0 += tz0;
|
||||
fix2 += tx2;
|
||||
fiy2 += ty2;
|
||||
fiz2 += tz2;
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
fiz0 = simd_add(fiz0, tz0);
|
||||
fix2 = simd_add(fix2, tx2);
|
||||
fiy2 = simd_add(fiy2, ty2);
|
||||
fiz2 = simd_add(fiz2, tz2);
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
@ -391,8 +266,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||
return E-S;
|
||||
@ -410,6 +283,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@ -422,12 +296,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
#pragma omp parallel for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@ -438,7 +309,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
|
||||
@ -453,7 +323,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
@ -462,75 +332,52 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 1]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
mask0 = (unsigned int)(0xf - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0xf - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0xf - 0x4 * cond0);
|
||||
mask3 = (unsigned int)(0xf - 0x8 * cond0);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
|
||||
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
|
||||
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
|
||||
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
|
||||
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
|
||||
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
|
||||
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1]);
|
||||
#endif
|
||||
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||
}
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
@ -551,8 +398,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
|
||||
return E-S;
|
||||
@ -578,6 +423,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@ -589,13 +436,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
#pragma omp parallel for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@ -606,7 +447,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||
@ -633,7 +473,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
@ -641,43 +481,45 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 3]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3]);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||
@ -689,113 +531,28 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
fiz0 = simd_add(fiz0, tz0);
|
||||
fix1 = simd_add(fix1, tx1);
|
||||
fiy1 = simd_add(fiy1, ty1);
|
||||
fiz1 = simd_add(fiz1, tz1);
|
||||
fix2 = simd_add(fix2, tx2);
|
||||
fiy2 = simd_add(fiy2, ty2);
|
||||
fiz2 = simd_add(fiz2, tz2);
|
||||
fix3 = simd_add(fix3, tx3);
|
||||
fiy3 = simd_add(fiy3, ty3);
|
||||
fiz3 = simd_add(fiz3, tz3);
|
||||
|
||||
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
|
||||
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
}
|
||||
#else
|
||||
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
|
||||
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
|
||||
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
|
||||
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
|
||||
MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
|
||||
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
|
||||
MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
|
||||
MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
|
||||
|
||||
fix0 = simd_add(fix0, tx0);
|
||||
fiy0 = simd_add(fiy0, ty0);
|
||||
@ -833,8 +590,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||
return E-S;
|
||||
@ -852,6 +607,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
@ -863,13 +620,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
}
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp for schedule(runtime)
|
||||
#pragma omp parallel for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
@ -880,7 +631,6 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
|
||||
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
|
||||
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
|
||||
@ -907,50 +657,52 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs_masked; k++) {
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 3]);
|
||||
#else
|
||||
#if CLUSTER_M < CLUSTER_N
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
|
||||
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
|
||||
#else
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
#endif
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0]);
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1]);
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2]);
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3]);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||
@ -962,87 +714,28 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
|
||||
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
|
||||
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
|
||||
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
|
||||
|
||||
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||
}
|
||||
|
||||
for(int k = numneighs_masked; k < numneighs; k++) {
|
||||
int cj = neighs[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
|
||||
MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
|
||||
MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
|
||||
MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
|
||||
MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
|
||||
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
|
||||
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
|
||||
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
|
||||
|
||||
MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
|
||||
MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
|
||||
|
||||
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||
|
||||
MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
|
||||
MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
|
||||
|
||||
MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
|
||||
MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
|
||||
MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
|
||||
MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
|
||||
|
||||
fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
|
||||
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
|
||||
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
|
||||
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
|
||||
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
|
||||
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
|
||||
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
|
||||
}
|
||||
|
||||
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
|
||||
@ -1051,13 +744,10 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
|
||||
|
||||
addStat(stats->calculated_forces, 1);
|
||||
addStat(stats->num_neighs, numneighs);
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs));
|
||||
//addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||
return E-S;
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -22,8 +22,25 @@
|
||||
# define KERNEL_NAME "CUDA"
|
||||
# define CLUSTER_M 8
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_J 1
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
# define XX 0
|
||||
# define YY 1
|
||||
# define ZZ 2
|
||||
# define SCLUSTER_SIZE_X 2
|
||||
# define SCLUSTER_SIZE_Y 2
|
||||
# define SCLUSTER_SIZE_Z 2
|
||||
# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
|
||||
# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord)))
|
||||
# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
|
||||
# define SCLUSTER_M CLUSTER_M * SCLUSTER_SIZE
|
||||
|
||||
# define computeForceLJ computeForceLJSup_cuda
|
||||
#else
|
||||
# define computeForceLJ computeForceLJ_cuda
|
||||
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
# define initialIntegrate cudaInitialIntegrate
|
||||
# define finalIntegrate cudaFinalIntegrate
|
||||
# define updatePbc cudaUpdatePbc
|
||||
@ -33,15 +50,11 @@
|
||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 2
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
# else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define UNROLL_I 4
|
||||
# define UNROLL_J 1
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
# endif
|
||||
# ifdef USE_REFERENCE_VERSION
|
||||
@ -60,16 +73,29 @@
|
||||
# define CJ1_FROM_CI(a) (a)
|
||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
# define CJ1_FROM_SCI(a) (a)
|
||||
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
|
||||
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
|
||||
# define CJ0_FROM_CI(a) ((a) << 1)
|
||||
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
|
||||
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
|
||||
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
|
||||
# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
|
||||
# define CJ0_FROM_CI(a) ((a) >> 1)
|
||||
# define CJ1_FROM_CI(a) ((a) >> 1)
|
||||
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
|
||||
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
|
||||
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
#else
|
||||
# error "Invalid cluster configuration!"
|
||||
#endif
|
||||
@ -83,14 +109,37 @@
|
||||
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
|
||||
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1))
|
||||
#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3))
|
||||
#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1))
|
||||
#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3))
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
#if CLUSTER_M >= CLUSTER_N
|
||||
# define CL_X_OFFSET (0 * CLUSTER_M)
|
||||
# define CL_Y_OFFSET (1 * CLUSTER_M)
|
||||
# define CL_Z_OFFSET (2 * CLUSTER_M)
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M)
|
||||
# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M)
|
||||
# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M)
|
||||
|
||||
# define SCL_X_OFFSET (0 * SCLUSTER_M)
|
||||
# define SCL_Y_OFFSET (1 * SCLUSTER_M)
|
||||
# define SCL_Z_OFFSET (2 * SCLUSTER_M)
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
#else
|
||||
# define CL_X_OFFSET (0 * CLUSTER_N)
|
||||
# define CL_Y_OFFSET (1 * CLUSTER_N)
|
||||
# define CL_Z_OFFSET (2 * CLUSTER_N)
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N)
|
||||
# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N)
|
||||
# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N)
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
@ -100,6 +149,13 @@ typedef struct {
|
||||
MD_FLOAT bbminz, bbmaxz;
|
||||
} Cluster;
|
||||
|
||||
typedef struct { // Per-super-cluster record; Atom keeps an array of these in `siclusters` when USE_SUPER_CLUSTERS is defined
|
||||
int nclusters; // NOTE(review): presumably the number of clusters grouped into this super-cluster — confirm against the builder
|
||||
MD_FLOAT bbminx, bbmaxx; // min/max pair along x; same field names as in Cluster — presumably a bounding box, confirm
|
||||
MD_FLOAT bbminy, bbmaxy; // min/max pair along y
|
||||
MD_FLOAT bbminz, bbmaxz; // min/max pair along z
|
||||
} SuperCluster;
|
||||
|
||||
typedef struct {
|
||||
int Natoms, Nlocal, Nghost, Nmax;
|
||||
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
|
||||
@ -121,17 +177,20 @@ typedef struct {
|
||||
Cluster *iclusters, *jclusters;
|
||||
int *icluster_bin;
|
||||
int dummy_cj;
|
||||
MD_UINT *exclusion_filter;
|
||||
MD_FLOAT *diagonal_4xn_j_minus_i;
|
||||
MD_FLOAT *diagonal_2xnn_j_minus_i;
|
||||
unsigned int masks_2xnn_hn[8];
|
||||
unsigned int masks_2xnn_fn[8];
|
||||
unsigned int masks_4xn_hn[16];
|
||||
unsigned int masks_4xn_fn[16];
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
|
||||
MD_FLOAT *scl_x;
|
||||
MD_FLOAT *scl_v;
|
||||
MD_FLOAT *scl_f;
|
||||
int *scl_type;
|
||||
int *icluster_idx;
|
||||
SuperCluster *siclusters;
|
||||
int *sicluster_bin;
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void initMasks(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
@ -139,6 +198,7 @@ extern int readAtom_gro(Atom*, Parameter*);
|
||||
extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
extern void growClusters(Atom*);
|
||||
extern void growSuperClusters(Atom*);
|
||||
|
||||
#ifdef AOS
|
||||
# define POS_DATA_LAYOUT "AoS"
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
32
gromacs/includes/neighbor.h
Normal file
32
gromacs/includes/neighbor.h
Normal file
@ -0,0 +1,32 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __NEIGHBOR_H_
|
||||
#define __NEIGHBOR_H_
|
||||
typedef struct { // Neighbor-list state handed to the build, prune and force routines
|
||||
int every; // NOTE(review): presumably the rebuild interval in steps — not used in the visible code, confirm
|
||||
int ncalls; // NOTE(review): presumably a counter of neighbor-list builds — confirm
|
||||
int* neighbors; // flat array of neighbor j-cluster indices; the row of i-cluster ci starts at neighbors[ci * maxneighs]
|
||||
int maxneighs; // row stride of `neighbors`, i.e. slots reserved per i-cluster
|
||||
int* numneigh; // numneigh[ci] is the number of entries the force kernels iterate over in the row of i-cluster ci
|
||||
int half_neigh; // NOTE(review): presumably non-zero when half neighbor lists are in use — confirm
|
||||
} Neighbor;
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*); // called by the benchmark driver before any neighbor list is created
|
||||
extern void setupNeighbor(Parameter*, Atom*); // called from setup() before the clusters are built
|
||||
extern void binatoms(Atom*);
|
||||
extern void buildNeighbor(Atom*, Neighbor*); // called after binClusters() in setup()/reneighbour(); replaced by buildNeighborGPU() when CUDA_TARGET and USE_SUPER_CLUSTERS are both defined
|
||||
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*); // called from the main loop on non-reneighbor steps where (n + 1) is a multiple of prune_every
|
||||
extern void sortAtom(Atom*);
|
||||
extern void buildClusters(Atom*); // default cluster builder used by setup()/reneighbour()
|
||||
extern void buildClustersGPU(Atom*); // used instead of buildClusters() when CUDA_TARGET and USE_SUPER_CLUSTERS are both defined
|
||||
extern void defineJClusters(Atom*); // called right after the clusters are built
|
||||
extern void binClusters(Atom*); // called after the PBC setup and before the neighbor list is built
|
||||
extern void updateSingleAtoms(Atom*); // first call in reneighbour(), before updateAtomsPbc()
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -16,5 +16,8 @@ extern void setupPbc(Atom*, Parameter*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern void cudaUpdatePbc(Atom*, Parameter*, int);
|
||||
#if defined(USE_SUPER_CLUSTERS)
|
||||
extern void setupPbcGPU(Atom*, Parameter*);
|
||||
#endif //defined(USE_SUPER_CLUSTERS)
|
||||
#endif
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
19
gromacs/includes/utils.h
Normal file
19
gromacs/includes/utils.h
Normal file
@ -0,0 +1,19 @@
|
||||
/*
|
||||
* Temporary functions for debugging; remove before proceeding with the pull request
|
||||
*/
|
||||
|
||||
#ifndef MD_BENCH_UTILS_H
|
||||
#define MD_BENCH_UTILS_H
|
||||
|
||||
#include <atom.h>
|
||||
#include <neighbor.h>
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
void verifyClusters(Atom *atom);
|
||||
void verifyLayout(Atom *atom);
|
||||
void checkAlignment(Atom *atom);
|
||||
void showSuperclusters(Atom *atom);
|
||||
void printNeighs(Atom *atom, Neighbor *neighbor);
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
#endif //MD_BENCH_UTILS_H
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -9,6 +9,7 @@
|
||||
#ifndef __VTK_H_
|
||||
#define __VTK_H_
|
||||
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
|
||||
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
||||
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -60,15 +60,18 @@ void init(Parameter *param) {
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
|
||||
// Show debug messages
|
||||
#define DEBUG(msg) printf(msg)
|
||||
// Do not show debug messages
|
||||
//#define DEBUG(msg)
|
||||
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
const int ncj = atom->Nclusters_local / jfac;
|
||||
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
|
||||
|
||||
if(pattern == P_RAND && ncj <= nneighs) {
|
||||
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
|
||||
@ -77,7 +80,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
|
||||
int m = (pattern == P_SEQ) ? ncj : nneighs;
|
||||
int k = 0;
|
||||
@ -88,7 +90,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
do {
|
||||
int cj = rand() % ncj;
|
||||
neighptr[k] = cj;
|
||||
neighptr_imask[k] = imask;
|
||||
found = 0;
|
||||
for(int l = 0; l < k; l++) {
|
||||
if(neighptr[l] == cj) {
|
||||
@ -98,7 +99,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
} while(found == 1);
|
||||
} else {
|
||||
neighptr[k] = j;
|
||||
neighptr_imask[k] = imask;
|
||||
j = (j + 1) % m;
|
||||
}
|
||||
}
|
||||
@ -106,12 +106,10 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
|
||||
for(int r = 1; r < nreps; r++) {
|
||||
for(int k = 0; k < nneighs; k++) {
|
||||
neighptr[r * nneighs + k] = neighptr[k];
|
||||
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = nneighs * nreps;
|
||||
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -127,13 +125,12 @@ int main(int argc, const char *argv[]) {
|
||||
int niclusters = 256; // Number of local i-clusters
|
||||
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
|
||||
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
|
||||
int masked = 0; // Use masked loop
|
||||
int nreps = 1;
|
||||
int csv = 0;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
DEBUG("Initializing parameters...\n");
|
||||
init(&param);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
@ -159,10 +156,6 @@ int main(int argc, const char *argv[]) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-m") == 0)) {
|
||||
masked = 1;
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
@ -213,11 +206,11 @@ int main(int argc, const char *argv[]) {
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
DEBUG("Initializing EAM parameters...\n");
|
||||
initEam(&eam, &param);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
DEBUG("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
@ -233,7 +226,7 @@ int main(int argc, const char *argv[]) {
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
DEBUG("Creating atoms...\n");
|
||||
while(atom->Nmax < niclusters * iclusters_natoms) {
|
||||
growAtom(atom);
|
||||
}
|
||||
@ -288,13 +281,13 @@ int main(int argc, const char *argv[]) {
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Defining j-clusters...\n");
|
||||
DEBUG("Defining j-clusters...\n");
|
||||
defineJClusters(atom);
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
DEBUG("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, &param);
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
DEBUG("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
|
||||
DEBUG("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
@ -1,13 +1,11 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <omp.h>
|
||||
//--
|
||||
#include <likwid-marker.h>
|
||||
//--
|
||||
@ -40,7 +38,16 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb
|
||||
extern void copyDataToCUDADevice(Atom *atom);
|
||||
extern void copyDataFromCUDADevice(Atom *atom);
|
||||
extern void cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
#include <utils.h>
|
||||
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
|
||||
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
|
||||
extern void alignDataToSuperclusters(Atom *atom);
|
||||
extern void alignDataFromSuperclusters(Atom *atom);
|
||||
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
#endif //CUDA_TARGET
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
@ -64,11 +71,24 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
||||
setupNeighbor(param, atom);
|
||||
setupThermo(param, atom->Natoms);
|
||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
buildClustersGPU(atom);
|
||||
#else
|
||||
buildClusters(atom);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
defineJClusters(atom);
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
setupPbcGPU(atom, param);
|
||||
//setupPbc(atom, param);
|
||||
#else
|
||||
setupPbc(atom, param);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
binClusters(atom);
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
buildNeighborGPU(atom, neighbor);
|
||||
#else
|
||||
buildNeighbor(atom, neighbor);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
initDevice(atom, neighbor);
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
@ -80,11 +100,24 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
LIKWID_MARKER_START("reneighbour");
|
||||
updateSingleAtoms(atom);
|
||||
updateAtomsPbc(atom, param);
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
buildClustersGPU(atom);
|
||||
#else
|
||||
buildClusters(atom);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
defineJClusters(atom);
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
//setupPbcGPU(atom, param);
|
||||
setupPbc(atom, param);
|
||||
#else
|
||||
setupPbc(atom, param);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
binClusters(atom);
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
buildNeighborGPU(atom, neighbor);
|
||||
#else
|
||||
buildNeighbor(atom, neighbor);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
LIKWID_MARKER_STOP("reneighbour");
|
||||
E = getTimeStamp();
|
||||
return E-S;
|
||||
@ -119,7 +152,7 @@ int main(int argc, char** argv) {
|
||||
|
||||
initParameter(¶m);
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
readParameter(¶m, argv[++i]);
|
||||
continue;
|
||||
}
|
||||
@ -211,6 +244,8 @@ int main(int argc, char** argv) {
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
//verifyNeigh(&atom, &neighbor);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
@ -239,14 +274,24 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
|
||||
//printf("Step:\t%d\r\n", n);
|
||||
|
||||
initialIntegrate(¶m, &atom);
|
||||
|
||||
if((n + 1) % param.reneigh_every) {
|
||||
if(!((n + 1) % param.prune_every)) {
|
||||
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
pruneNeighborGPU(¶m, &atom, &neighbor);
|
||||
#else
|
||||
pruneNeighbor(¶m, &atom, &neighbor);
|
||||
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
|
||||
}
|
||||
|
||||
|
||||
copyDataFromCUDADevice(&atom);
|
||||
updatePbc(&atom, ¶m, 0);
|
||||
copyDataToCUDADevice(&atom);
|
||||
} else {
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
@ -264,14 +309,34 @@ int main(int argc, char** argv) {
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
|
||||
copyDataToCUDADevice(&atom);
|
||||
verifyLayout(&atom);
|
||||
|
||||
//printClusterIndices(&atom);
|
||||
|
||||
*/
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] += computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
timer[FORCE] += computeForceLJ(¶m, &atom, &neighbor, &stats);
|
||||
}
|
||||
|
||||
/*
|
||||
copyDataFromCUDADevice(&atom);
|
||||
verifyLayout(&atom);
|
||||
|
||||
getchar();
|
||||
*/
|
||||
|
||||
finalIntegrate(¶m, &atom);
|
||||
|
||||
|
||||
|
||||
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
@ -310,30 +375,6 @@ int main(int argc, char** argv) {
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||
printf(HLINE);
|
||||
|
||||
int nthreads = 0;
|
||||
int chunkSize = 0;
|
||||
omp_sched_t schedKind;
|
||||
char schedType[10];
|
||||
#pragma omp parallel
|
||||
#pragma omp master
|
||||
{
|
||||
omp_get_schedule(&schedKind, &chunkSize);
|
||||
|
||||
switch (schedKind)
|
||||
{
|
||||
case omp_sched_static: strcpy(schedType, "static"); break;
|
||||
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
|
||||
case omp_sched_guided: strcpy(schedType, "guided"); break;
|
||||
case omp_sched_auto: strcpy(schedType, "auto"); break;
|
||||
}
|
||||
|
||||
nthreads = omp_get_max_threads();
|
||||
}
|
||||
|
||||
printf("Num threads: %d\n", nthreads);
|
||||
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
|
||||
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
#ifdef COMPUTE_STATS
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -56,9 +56,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
neighbor->half_neigh = param->half_neigh;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->numneigh_masked = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
neighbor->neighbors_imask = NULL;
|
||||
}
|
||||
|
||||
void setupNeighbor(Parameter *param, Atom *atom) {
|
||||
@ -79,8 +77,13 @@ void setupNeighbor(Parameter *param, Atom *atom) {
|
||||
|
||||
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
||||
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_X;
|
||||
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_Y;
|
||||
#else
|
||||
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
|
||||
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
|
||||
#endif
|
||||
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
|
||||
nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
|
||||
binsizex = (xhi - xlo) / nbinx;
|
||||
@ -186,43 +189,30 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
|
||||
static unsigned int get_imask(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
int atomDistanceInRangeGPU(Atom *atom, int sci, int cj, MD_FLOAT rsq) {
|
||||
for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
|
||||
const int icluster_idx = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(icluster_idx);
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[icluster_idx].natoms; cii++) {
|
||||
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
|
||||
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
|
||||
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
|
||||
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
|
||||
if(delx * delx + dely * dely + delz * delz < rsq) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
|
||||
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
|
||||
: (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
|
||||
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
|
||||
}
|
||||
|
||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
|
||||
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
|
||||
return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
|
||||
: (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
|
||||
: NBNXN_INTERACTION_MASK_ALL));
|
||||
}
|
||||
|
||||
#if VECTOR_WIDTH == 2
|
||||
# define get_imask_simd_4xn get_imask_simd_j2
|
||||
#elif VECTOR_WIDTH== 4
|
||||
# define get_imask_simd_4xn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 8
|
||||
# define get_imask_simd_4xn get_imask_simd_j8
|
||||
# define get_imask_simd_2xnn get_imask_simd_j4
|
||||
#elif VECTOR_WIDTH == 16
|
||||
# define get_imask_simd_2xnn get_imask_simd_j8
|
||||
#else
|
||||
# error "Invalid cluster configuration"
|
||||
#endif
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("buildNeighbor start\n");
|
||||
|
||||
@ -230,13 +220,9 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
if(atom->Nclusters_local > nmax) {
|
||||
nmax = atom->Nclusters_local;
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
}
|
||||
|
||||
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
|
||||
@ -253,8 +239,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
|
||||
int n = 0, nmasked = 0;
|
||||
int n = 0;
|
||||
int ibin = atom->icluster_bin[ci];
|
||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
||||
@ -319,30 +304,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
if(d_bb_sq < cutneighsq) {
|
||||
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
|
||||
// We use true (1) for rdiag because we only care if there are masks
|
||||
// at all, and when this is set to false (0) the self-exclusions are
|
||||
// not accounted for, which makes the optimized version to not work!
|
||||
unsigned int imask;
|
||||
#if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
|
||||
imask = get_imask_simd_2xnn(1, ci, cj);
|
||||
#else // 4xn
|
||||
imask = get_imask_simd_4xn(1, ci, cj);
|
||||
#endif
|
||||
|
||||
if(n < neighbor->maxneighs) {
|
||||
if(imask == NBNXN_INTERACTION_MASK_ALL) {
|
||||
neighptr[n] = cj;
|
||||
neighptr_imask[n] = imask;
|
||||
} else {
|
||||
neighptr[n] = neighptr[nmasked];
|
||||
neighptr_imask[n] = neighptr_imask[nmasked];
|
||||
neighptr[nmasked] = cj;
|
||||
neighptr_imask[nmasked] = imask;
|
||||
nmasked++;
|
||||
}
|
||||
}
|
||||
|
||||
n++;
|
||||
neighptr[n++] = cj;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -364,14 +326,11 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
// Fill neighbor list with dummy values to fit vector width
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighptr_imask[n] = 0;
|
||||
n++;
|
||||
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = n;
|
||||
neighbor->numneigh_masked[ci] = nmasked;
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
@ -382,12 +341,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
free(neighbor->neighbors_imask);
|
||||
neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
|
||||
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
}
|
||||
|
||||
@ -436,6 +393,198 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("buildNeighbor end\n");
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
// TODO For future parallelization on GPU
|
||||
/*
 * Builds the neighbor list for super-clusters: for every local super-cluster
 * `sci`, collects the j-clusters found in the stencil bins around it whose
 * bounding box lies within the neighbor cutoff of the super-cluster's
 * bounding box (refined by atomDistanceInRangeGPU() for borderline pairs).
 *
 * atom     - provides super-cluster / j-cluster bounding boxes and bins.
 * neighbor - receives `numneigh[sci]` and the entries stored at
 *            `neighbors[sci * maxneighs ...]`; `maxneighs` is enlarged and
 *            the whole build is repeated when any list does not fit.
 *
 * Fixes compared to the previous version:
 *  - entries are only written while `n < maxneighs`; `n` keeps counting so the
 *    resize pass still learns the required capacity (previously the writes
 *    ran past the end of the row / allocation before the resize happened);
 *  - `d_bb_sq` is initialized, it used to be read uninitialized when the
 *    half-neighbor `continue` fired on the first candidate of a bin;
 *  - the element size of the `neighbors` allocation is `sizeof(int)`;
 *  - an empty loop over the sub-clusters and a dead debug dump were dropped.
 */
void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("buildNeighborGPU start\n");

    /* extend neighbor arrays if necessary */
    if(atom->Nsclusters_local > nmax) {
        nmax = atom->Nsclusters_local;
        free(neighbor->numneigh);
        free(neighbor->neighbors);
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
    }

    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
    MD_FLOAT bby = 0.5 * (binsizey + binsizey);
    MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
    rbb_sq = rbb_sq * rbb_sq;
    int resize = 1;

    /* loop over each super-cluster, storing neighbors; repeat after a resize */
    while(resize) {
        int new_maxneighs = neighbor->maxneighs;
        resize = 0;

        for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
            int ci_cj1 = CJ1_FROM_SCI(sci);
            int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
            int n = 0;
            int ibin = atom->sicluster_bin[sci];
            MD_FLOAT ibb_xmin = atom->siclusters[sci].bbminx;
            MD_FLOAT ibb_xmax = atom->siclusters[sci].bbmaxx;
            MD_FLOAT ibb_ymin = atom->siclusters[sci].bbminy;
            MD_FLOAT ibb_ymax = atom->siclusters[sci].bbmaxy;
            MD_FLOAT ibb_zmin = atom->siclusters[sci].bbminz;
            MD_FLOAT ibb_zmax = atom->siclusters[sci].bbmaxz;

            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
                int cj, m = -1;
                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
                const int c = bin_nclusters[jbin];

                if(c > 0) {
                    MD_FLOAT dl, dh, dm, dm0;
                    /* Initialized so the loop condition below never reads an
                     * indeterminate value when `continue` skips the first
                     * candidate. Stopping early is harmless: the second loop
                     * re-checks every remaining candidate completely. */
                    MD_FLOAT d_bb_sq = 0.0;

                    /* Skip leading j-clusters that are too far away in z */
                    do {
                        m++;
                        cj = loc_bin[m];
                        if(neighbor->half_neigh && ci_cj1 > cj) {
                            continue;
                        }
                        jbb_zmin = atom->jclusters[cj].bbminz;
                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                        dl = ibb_zmin - jbb_zmax;
                        dh = jbb_zmin - ibb_zmax;
                        dm = MAX(dl, dh);
                        dm0 = MAX(dm, 0.0);
                        d_bb_sq = dm0 * dm0;
                    } while(m + 1 < c && d_bb_sq > cutneighsq);

                    jbb_xmin = atom->jclusters[cj].bbminx;
                    jbb_xmax = atom->jclusters[cj].bbmaxx;
                    jbb_ymin = atom->jclusters[cj].bbminy;
                    jbb_ymax = atom->jclusters[cj].bbmaxy;

                    while(m < c) {
                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
                            /* squared bounding-box distance, accumulated over z, y, x */
                            dl = ibb_zmin - jbb_zmax;
                            dh = jbb_zmin - ibb_zmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq = dm0 * dm0;

                            dl = ibb_ymin - jbb_ymax;
                            dh = jbb_ymin - ibb_ymax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            dl = ibb_xmin - jbb_xmax;
                            dh = jbb_xmin - ibb_xmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            if(d_bb_sq < cutneighsq) {
                                if(d_bb_sq < rbb_sq || atomDistanceInRangeGPU(atom, sci, cj, cutneighsq)) {
                                    /* Only store while the row has room; keep
                                     * counting so the resize knows the demand. */
                                    if(n < neighbor->maxneighs) {
                                        neighptr[n] = cj;
                                    }
                                    n++;
                                }
                            }
                        }

                        m++;
                        if(m < c) {
                            cj = loc_bin[m];
                            jbb_xmin = atom->jclusters[cj].bbminx;
                            jbb_xmax = atom->jclusters[cj].bbmaxx;
                            jbb_ymin = atom->jclusters[cj].bbminy;
                            jbb_ymax = atom->jclusters[cj].bbmaxy;
                            jbb_zmin = atom->jclusters[cj].bbminz;
                            jbb_zmax = atom->jclusters[cj].bbmaxz;
                        }
                    }
                }
            }

            // Fill neighbor list with dummy values to fit vector width
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
                    if(n < neighbor->maxneighs) {
                        neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
                    }
                    n++;
                }
            }

            neighbor->numneigh[sci] = n;
            if(n >= neighbor->maxneighs) {
                resize = 1;

                if(n >= new_maxneighs) {
                    new_maxneighs = n;
                }
            }
        }

        if(resize) {
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
            neighbor->maxneighs = new_maxneighs * 1.2;
            free(neighbor->neighbors);
            /* NOTE(review): sized by atom->Nmax while the initial allocation
             * uses nmax; kept as-is - confirm which bound is intended. */
            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
        }
    }

    DEBUG_MESSAGE("buildNeighborGPU end\n");
}
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
DEBUG_MESSAGE("pruneNeighbor start\n");
|
||||
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
|
||||
@ -443,9 +592,7 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||
unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[ci];
|
||||
int numneighs_masked = neighbor->numneigh_masked[ci];
|
||||
int k = 0;
|
||||
|
||||
// Remove dummy clusters if necessary
|
||||
@ -461,9 +608,6 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
k++;
|
||||
} else {
|
||||
numneighs--;
|
||||
if(k < numneighs_masked) {
|
||||
numneighs_masked--;
|
||||
}
|
||||
neighs[k] = neighs[numneighs];
|
||||
}
|
||||
}
|
||||
@ -471,19 +615,63 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
// Readd dummy clusters if necessary
|
||||
if(CLUSTER_N < VECTOR_WIDTH) {
|
||||
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
|
||||
neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
neighs_imask[numneighs] = 0;
|
||||
numneighs++;
|
||||
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[ci] = numneighs;
|
||||
neighbor->numneigh_masked[ci] = numneighs_masked;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("pruneNeighbor end\n");
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
/*
 * Prunes the neighbor list of every local super-cluster: for each i-cluster
 * contained in super-cluster `sci`, j-clusters that have no atom within the
 * neighbor cutoff of that i-cluster are removed from the list of `sci`, and
 * the list is re-padded with the dummy cluster to a multiple of
 * VECTOR_WIDTH / CLUSTER_N.
 *
 * param    - currently unused (the cutoff is taken from `cutneighsq`).
 * atom     - cluster data and the super-cluster -> i-cluster index table.
 * neighbor - `neighbors` rows and `numneigh[sci]` are updated in place.
 *
 * Fixes compared to the previous version:
 *  - the i-cluster index was read as `icluster_idx[SCLUSTER_SIZE * sci + ci]`
 *    inside the initializer of `ci` itself, i.e. with an indeterminate index;
 *    the loop counter `scii` is the intended offset;
 *  - stripping trailing dummy entries no longer reads `neighs[-1]` when the
 *    list is (or becomes) empty.
 */
void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("pruneNeighbor start\n");
    //MD_FLOAT cutsq = param->cutforce * param->cutforce;
    MD_FLOAT cutsq = cutneighsq;

    for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
        int *neighs = &neighbor->neighbors[sci * neighbor->maxneighs];

        /* NOTE(review): the list belongs to the whole super-cluster but is
         * filtered once per contained i-cluster, so only j-clusters in range
         * of every i-cluster survive. buildNeighborGPU() keeps a j-cluster if
         * any i-cluster is in range (atomDistanceInRangeGPU) - confirm that
         * this stricter criterion is intended. */
        for(int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
            const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
            int numneighs = neighbor->numneigh[sci];
            int k = 0;

            // Remove dummy clusters if necessary
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(numneighs > 0 && neighs[numneighs - 1] == atom->dummy_cj) {
                    numneighs--;
                }
            }

            /* Unordered removal: overwrite a rejected entry with the last one */
            while(k < numneighs) {
                int cj = neighs[k];
                if(atomDistanceInRange(atom, ci, cj, cutsq)) {
                    k++;
                } else {
                    numneighs--;
                    neighs[k] = neighs[numneighs];
                }
            }

            // Readd dummy clusters if necessary
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
                    neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
                }
            }

            neighbor->numneigh[sci] = numneighs;
        }
    }

    DEBUG_MESSAGE("pruneNeighbor end\n");
}
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
/* internal subroutines */
|
||||
MD_FLOAT bindist(int i, int j) {
|
||||
MD_FLOAT delx, dely, delz;
|
||||
@ -609,6 +797,36 @@ void sortAtomsByZCoord(Atom *atom) {
|
||||
DEBUG_MESSAGE("sortAtomsByZCoord end\n");
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
// TODO: Use pigeonhole sorting
|
||||
/*
 * Sorts the atom indices stored in slots [start_index, end_index] (inclusive)
 * of bin `bin` in ascending order of the coordinate selected by `dim`.
 *
 * Plain selection sort on the bin's index array: for every slot the smallest
 * remaining coordinate is located and swapped into place. Only the index
 * array is permuted, atom data itself is not moved. An empty range
 * (start_index > end_index) is a no-op.
 */
void sortAtomsByCoord(Atom *atom, int dim, int bin, int start_index, int end_index) {
    int *slots = &bins[bin * atoms_per_bin];

    for(int pos = start_index; pos <= end_index; pos++) {
        const int current = slots[pos];
        int best_pos = pos;
        int best_atom = current;
        MD_FLOAT best_coord = DIM_COORD(dim, current);

        /* scan the unsorted tail for a strictly smaller coordinate */
        int probe = pos + 1;
        while(probe <= end_index) {
            const int candidate = slots[probe];
            const MD_FLOAT candidate_coord = DIM_COORD(dim, candidate);

            if(candidate_coord < best_coord) {
                best_coord = candidate_coord;
                best_atom = candidate;
                best_pos = probe;
            }

            probe++;
        }

        /* swap the minimum into the current slot */
        slots[best_pos] = current;
        slots[pos] = best_atom;
    }
}
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
void buildClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("buildClusters start\n");
|
||||
atom->Nclusters_local = 0;
|
||||
@ -685,6 +903,175 @@ void buildClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("buildClusters end\n");
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
/*
 * Groups the local atoms into i-clusters of CLUSTER_M atoms and those
 * i-clusters into super-clusters of up to SCLUSTER_SIZE clusters.
 *
 * Per bin: atoms are sorted by z, each z-slab of a super-cluster is sorted by
 * y and each y-row by x; clusters are then filled in that order. For every
 * cluster the coordinates/velocities are copied into cl_x/cl_v and into the
 * super-cluster arrays scl_x/scl_v, and bounding boxes are recorded for both
 * the cluster and its super-cluster.
 *
 * Fixes compared to the previous version:
 *  - Nsclusters_local and the per-super-cluster `nclusters` counter are reset
 *    before being rebuilt; `nclusters` is used as the write offset into
 *    scl_x/scl_v, so stale values from an earlier build corrupt the layout;
 *  - the atom offset of a y-row uses the row length SCLUSTER_SIZE_X (it used
 *    SCLUSTER_SIZE_Y) and the z-stride of `cluster_sup_idx` uses
 *    SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y (it used SIZE_Z * SIZE_Y); both are
 *    unchanged for cubic super-clusters and now match the strides used by the
 *    neighbouring expressions.
 */
void buildClustersGPU(Atom *atom) {
    DEBUG_MESSAGE("buildClustersGPU start\n");
    atom->Nclusters_local = 0;
    atom->Nsclusters_local = 0;

    /* bin local atoms */
    binAtoms(atom);

    for(int bin = 0; bin < mbins; bin++) {
        int c = bincount[bin];
        sortAtomsByCoord(atom, ZZ, bin, 0, c - 1);
        int ac = 0;
        int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
        if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }

        /* number of super-clusters needed for this bin (rounded up) */
        int n_super_clusters_xy = nclusters / (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y);
        if(nclusters % (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y)) { n_super_clusters_xy++; }
        int n_super_clusters = n_super_clusters_xy / SCLUSTER_SIZE_Z;
        if(n_super_clusters_xy % SCLUSTER_SIZE_Z) { n_super_clusters++; }

        int cl_count = 0;
        for(int scl = 0; scl < n_super_clusters; scl++) {
            const int sci = atom->Nsclusters_local;
            if(sci >= atom->Nsclusters_max) {
                growSuperClusters(atom);
            }

            if(cl_count >= nclusters) { break; }

            /* start this super-cluster empty; nclusters doubles as write offset */
            atom->siclusters[sci].nclusters = 0;

            int scl_offset = scl * SCLUSTER_SIZE * CLUSTER_M;

            MD_FLOAT sc_bbminx = INFINITY, sc_bbmaxx = -INFINITY;
            MD_FLOAT sc_bbminy = INFINITY, sc_bbmaxy = -INFINITY;
            MD_FLOAT sc_bbminz = INFINITY, sc_bbmaxz = -INFINITY;

            for(int scl_z = 0; scl_z < SCLUSTER_SIZE_Z; scl_z++) {
                if(cl_count >= nclusters) { break; }

                /* atoms of this z-slab, sorted by y */
                const int atom_scl_z_offset = scl_offset +
                    scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M;
                const int atom_scl_z_end_idx = MIN(atom_scl_z_offset +
                    SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);

                sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx);

                for(int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) {
                    if(cl_count >= nclusters) { break; }

                    /* atoms of this y-row (SCLUSTER_SIZE_X clusters), sorted by x */
                    const int atom_scl_y_offset = scl_offset +
                        scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M +
                        scl_y * SCLUSTER_SIZE_X * CLUSTER_M;
                    const int atom_scl_y_end_idx = MIN(atom_scl_y_offset +
                        SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);

                    sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx);

                    for(int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) {
                        if(cl_count >= nclusters) { break; }
                        cl_count++;

                        /* slot of this cluster inside its super-cluster, x fastest */
                        const int cluster_sup_idx = scl_z * SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y +
                            scl_y * SCLUSTER_SIZE_X + scl_x;

                        const int ci = atom->Nclusters_local;
                        if(ci >= atom->Nclusters_max) {
                            growClusters(atom);
                        }

                        int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
                        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
                        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
                        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];

                        int sci_sca_base = SCI_SCALAR_BASE_INDEX(sci);
                        int sci_vec_base = SCI_VECTOR_BASE_INDEX(sci);
                        MD_FLOAT *sci_x = &atom->scl_x[sci_vec_base];
                        MD_FLOAT *sci_v = &atom->scl_v[sci_vec_base];

                        int *ci_type = &atom->cl_type[ci_sca_base];
                        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
                        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
                        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

                        atom->iclusters[ci].natoms = 0;
                        for(int cii = 0; cii < CLUSTER_M; cii++) {
                            if(ac < c) {
                                int i = bins[bin * atoms_per_bin + ac];
                                MD_FLOAT xtmp = atom_x(i);
                                MD_FLOAT ytmp = atom_y(i);
                                MD_FLOAT ztmp = atom_z(i);

                                ci_x[CL_X_OFFSET + cii] = xtmp;
                                ci_x[CL_Y_OFFSET + cii] = ytmp;
                                ci_x[CL_Z_OFFSET + cii] = ztmp;
                                ci_v[CL_X_OFFSET + cii] = atom->vx[i];
                                ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
                                ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

                                sci_x[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = xtmp;
                                sci_x[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = ytmp;
                                sci_x[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = ztmp;

                                sci_v[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vx[i];
                                sci_v[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vy[i];
                                sci_v[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vz[i];

                                // TODO: To create the bounding boxes faster, we can use SIMD operations
                                if(bbminx > xtmp) { bbminx = xtmp; }
                                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                                if(bbminy > ytmp) { bbminy = ytmp; }
                                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                                if(bbminz > ztmp) { bbminz = ztmp; }
                                if(bbmaxz < ztmp) { bbmaxz = ztmp; }

                                ci_type[cii] = atom->type[i];
                                atom->iclusters[ci].natoms++;
                            } else {
                                /* pad incomplete clusters with far-away positions */
                                ci_x[CL_X_OFFSET + cii] = INFINITY;
                                ci_x[CL_Y_OFFSET + cii] = INFINITY;
                                ci_x[CL_Z_OFFSET + cii] = INFINITY;
                            }

                            ac++;
                        }

                        atom->icluster_bin[ci] = bin;
                        atom->iclusters[ci].bbminx = bbminx;
                        atom->iclusters[ci].bbmaxx = bbmaxx;
                        atom->iclusters[ci].bbminy = bbminy;
                        atom->iclusters[ci].bbmaxy = bbmaxy;
                        atom->iclusters[ci].bbminz = bbminz;
                        atom->iclusters[ci].bbmaxz = bbmaxz;
                        atom->Nclusters_local++;

                        /* grow the super-cluster bounding box by this cluster's box */
                        // TODO: To create the bounding boxes faster, we can use SIMD operations
                        if(sc_bbminx > bbminx) { sc_bbminx = bbminx; }
                        if(sc_bbmaxx < bbmaxx) { sc_bbmaxx = bbmaxx; }
                        if(sc_bbminy > bbminy) { sc_bbminy = bbminy; }
                        if(sc_bbmaxy < bbmaxy) { sc_bbmaxy = bbmaxy; }
                        if(sc_bbminz > bbminz) { sc_bbminz = bbminz; }
                        if(sc_bbmaxz < bbmaxz) { sc_bbmaxz = bbmaxz; }

                        atom->siclusters[sci].nclusters++;
                        atom->icluster_idx[SCLUSTER_SIZE * sci + cluster_sup_idx] = ci;
                    }
                }
            }

            atom->sicluster_bin[sci] = bin;
            atom->siclusters[sci].bbminx = sc_bbminx;
            atom->siclusters[sci].bbmaxx = sc_bbmaxx;
            atom->siclusters[sci].bbminy = sc_bbminy;
            atom->siclusters[sci].bbmaxy = sc_bbmaxy;
            atom->siclusters[sci].bbminz = sc_bbminz;
            atom->siclusters[sci].bbmaxz = sc_bbmaxz;
            atom->Nsclusters_local++;
        }
    }

    DEBUG_MESSAGE("buildClustersGPU end\n");
}
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
void defineJClusters(Atom *atom) {
|
||||
DEBUG_MESSAGE("defineJClusters start\n");
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -86,6 +86,98 @@ void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
DEBUG_MESSAGE("updatePbc end\n");
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
/*
 * Refreshes the coordinates of all ghost j-clusters from the clusters they
 * mirror (atom->border_map), shifted by the periodic image offsets
 * PBCx/PBCy/PBCz times the box lengths. Both the per-cluster array cl_x and
 * the super-cluster array scl_x are written.
 *
 * atom        - cluster data; ghost clusters follow the local j-clusters.
 * param       - supplies the box lengths xprd / yprd / zprd.
 * firstUpdate - when non-zero, unused lanes of each ghost cluster are padded
 *               with INFINITY and the ghost cluster's bounding box is stored.
 *
 * The previous version additionally tracked a second bounding box over the
 * super-cluster coordinates that was never stored anywhere; that dead
 * computation has been removed.
 */
void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
    DEBUG_MESSAGE("gpuUpdatePbc start\n");
    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    int ncj = atom->Nclusters_local / jfac;
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;

    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
        const int cj = ncj + cg;
        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
        int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
        int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
        int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);

        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
        MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
        MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
        MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];

        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
            MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

            /* NOTE(review): the super-cluster source is read with the CL_*
             * offsets while the destination below uses SCL_* offsets -
             * confirm that both layouts agree. */
            MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

            cj_x[CL_X_OFFSET + cjj] = xtmp;
            cj_x[CL_Y_OFFSET + cjj] = ytmp;
            cj_x[CL_Z_OFFSET + cjj] = ztmp;

            scj_x[SCL_X_OFFSET + cjj] = sxtmp;
            scj_x[SCL_Y_OFFSET + cjj] = sytmp;
            scj_x[SCL_Z_OFFSET + cjj] = sztmp;

            if(firstUpdate) {
                // TODO: To create the bounding boxes faster, we can use SIMD operations
                if(bbminx > xtmp) { bbminx = xtmp; }
                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                if(bbminy > ytmp) { bbminy = ytmp; }
                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                if(bbminz > ztmp) { bbminz = ztmp; }
                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
            }
        }

        if(firstUpdate) {
            /* pad the unused lanes of the ghost cluster */
            for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
                cj_x[CL_X_OFFSET + cjj] = INFINITY;
                cj_x[CL_Y_OFFSET + cjj] = INFINITY;
                cj_x[CL_Z_OFFSET + cjj] = INFINITY;

                scj_x[SCL_X_OFFSET + cjj] = INFINITY;
                scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
                scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
            }

            atom->jclusters[cj].bbminx = bbminx;
            atom->jclusters[cj].bbmaxx = bbmaxx;
            atom->jclusters[cj].bbminy = bbminy;
            atom->jclusters[cj].bbmaxy = bbmaxy;
            atom->jclusters[cj].bbminz = bbminz;
            atom->jclusters[cj].bbmaxz = bbmaxz;
        }
    }

    DEBUG_MESSAGE("gpuUpdatePbc end\n");
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
||||
@ -229,3 +321,91 @@ void setupPbc(Atom *atom, Parameter *param) {
|
||||
cpuUpdatePbc(atom, param, 1);
|
||||
DEBUG_MESSAGE("setupPbc end\n");
|
||||
}
|
||||
|
||||
/*
 * Creates the ghost j-clusters for the super-cluster (GPU) data layout.
 *
 * For every non-empty local j-cluster the stored bounding box is tested
 * against param->cutneigh on the low side (bbmin < cutneigh) and on the high
 * side (bbmax >= prd - cutneigh) of each dimension, and ADDGHOST is invoked
 * with the corresponding shift for each matching plane, corner and edge.
 * NOTE(review): ADDGHOST is a macro defined outside this view; it presumably
 * uses the locals Nghost, Nghost_atoms, cj and ncj by name - confirm before
 * renaming anything in here.
 *
 * After the loop a dummy cluster whose coordinates are all INFINITY is written
 * at j-cluster index ncj + Nghost + 1, the ghost counters in atom are updated
 * and gpuUpdatePbc(atom, param, 1) is called.
 */
void setupPbcGPU(Atom *atom, Parameter *param) {
|
||||
DEBUG_MESSAGE("setupPbcGPU start\n");
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT Cutneigh = param->cutneigh;
|
||||
//int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
/* jfac: number of CLUSTER_M-sized clusters per super-cluster */
int jfac = SCLUSTER_M / CLUSTER_M;
|
||||
/* ncj: number of local j-clusters that are scanned below */
int ncj = atom->Nsclusters_local * jfac;
|
||||
/* Nghost starts at -1, so Nghost + 1 is used as the ghost cluster count at the end */
int Nghost = -1;
|
||||
int Nghost_atoms = 0;
|
||||
|
||||
for(int cj = 0; cj < ncj; cj++) {
|
||||
if(atom->jclusters[cj].natoms > 0) {
|
||||
/* Grow the cluster storage ahead of time; the +7 leaves headroom for the ghosts
 * added in this iteration (NOTE(review): presumably 3 planes + 3 edges + 1 corner - confirm) */
if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
//growSuperClusters(atom);
|
||||
}
|
||||
|
||||
/* Same headroom check for the ghost bookkeeping, measured against NmaxGhost */
if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
|
||||
growPbc(atom);
|
||||
}
|
||||
|
||||
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
|
||||
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
|
||||
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
|
||||
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
|
||||
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
|
||||
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
|
||||
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
|
||||
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
|
||||
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
|
||||
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
|
||||
/* 8 corners */
|
||||
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
|
||||
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
|
||||
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
|
||||
/* 12 edges */
|
||||
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
|
||||
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
|
||||
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
|
||||
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
|
||||
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
|
||||
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
|
||||
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
|
||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
||||
}
|
||||
}
|
||||
|
||||
/* Make sure there is room for the dummy cluster written below */
if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
|
||||
growClusters(atom);
|
||||
//growSuperClusters(atom);
|
||||
}
|
||||
|
||||
// Add dummy cluster at the end
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
|
||||
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
|
||||
}
|
||||
|
||||
// Nghost + 1 is stored as the ghost cluster count; Nghost_atoms as the ghost atom count
|
||||
atom->dummy_cj = ncj + Nghost + 1;
|
||||
atom->Nghost = Nghost_atoms;
|
||||
atom->Nclusters_ghost = Nghost + 1;
|
||||
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
|
||||
|
||||
// Update created ghost clusters positions
|
||||
gpuUpdatePbc(atom, param, 1);
|
||||
DEBUG_MESSAGE("setupPbcGPU end\n");
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -14,7 +14,6 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
INDEX_TRACER_INIT;
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
unsigned int *neighs_imask;
|
||||
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
|
||||
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
|
||||
@ -35,8 +34,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
|
||||
DIST_TRACE(neighs, numneighs);
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MEM_TRACE(j, 'R');
|
||||
MEM_TRACE(neighs[k], 'R');
|
||||
MEM_TRACE(atom_x(j), 'R');
|
||||
MEM_TRACE(atom_y(j), 'R');
|
||||
MEM_TRACE(atom_z(j), 'R');
|
332
gromacs/utils.c
Normal file
332
gromacs/utils.c
Normal file
@ -0,0 +1,332 @@
|
||||
|
||||
/*
|
||||
* Temporary functions for debugging; remove before proceeding with the pull request
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <utils.h>
|
||||
|
||||
extern void alignDataToSuperclusters(Atom *atom);
|
||||
extern void alignDataFromSuperclusters(Atom *atom);
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
/*
|
||||
void verifyClusters(Atom *atom) {
|
||||
unsigned int count = 0;
|
||||
|
||||
for (int i = 0; i < atom->Nsclusters_local; i++) {
|
||||
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++, count++);
|
||||
}
|
||||
}
|
||||
|
||||
MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
|
||||
MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
|
||||
MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
|
||||
|
||||
count = 0;
|
||||
unsigned int diffs = 0;
|
||||
|
||||
printf("######### %d #########\r\n", atom->Nsclusters_local);
|
||||
for (int i = 0; i < atom->Nsclusters_local; i++) {
|
||||
printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
|
||||
|
||||
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
|
||||
//printf("%d\t", atom.siclusters[i].iclusters[j]);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
|
||||
|
||||
if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
|
||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
|
||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
|
||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
|
||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
|
||||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
|
||||
|
||||
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
|
||||
x[count] = ci_x[CL_X_OFFSET + cii];
|
||||
y[count] = ci_x[CL_Y_OFFSET + cii];
|
||||
z[count] = ci_x[CL_Z_OFFSET + cii];
|
||||
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
|
||||
}
|
||||
}
|
||||
printf("######### \t #########\r\n");
|
||||
}
|
||||
|
||||
printf("######### Diffs: %d\t #########\r\n", diffs);
|
||||
|
||||
printf("\r\n");
|
||||
|
||||
count = 0;
|
||||
diffs = 0;
|
||||
|
||||
for (int i = 0; i < atom->Nclusters_local; i++) {
|
||||
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
|
||||
|
||||
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
|
||||
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
|
||||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
|
||||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
|
||||
}
|
||||
}
|
||||
|
||||
printf("######### Diffs: %d\t #########\r\n", diffs);
|
||||
}
|
||||
*/
|
||||
|
||||
/*
 * Debug helper: prints a small, fixed window of the force arrays in both
 * memory layouts so they can be compared by eye.
 *
 *  - super-clusters 4 and 5 are read from atom->scl_f, one line per atom slot,
 *    prefixed by the i-cluster index stored in atom->icluster_idx and the
 *    cluster position inside the super-cluster;
 *  - clusters 35 and 36 are read from atom->cl_f, prefixed by a header line
 *    holding the cluster index and atom->icluster_bin[ci].
 *
 * The windows are clamped to Nsclusters_local / Nclusters_local so that small
 * systems do not read past the populated part of the arrays.
 */
void verifyLayout(Atom *atom) {
    /* Fixed inspection windows, half-open: [first, end). */
    const int sci_first = 4;
    const int sci_end   = 6;
    const int ci_first  = 35;
    const int ci_end    = 37;

    printf("verifyLayout start\r\n");

    for (int sci = sci_first; sci < sci_end && sci < atom->Nsclusters_local; sci++) {
        MD_FLOAT *sci_f = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* cl_idx: cluster inside the super-cluster, ciii: atom slot inside that cluster */
            const unsigned int cl_idx = cii / CLUSTER_M;
            const unsigned int ciii   = cii % CLUSTER_M;

            printf("%d\t%u\t%f\t%f\t%f\r\n",
                atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx],
                cl_idx,
                sci_f[SCL_CL_X_OFFSET(cl_idx) + ciii],
                sci_f[SCL_CL_Y_OFFSET(cl_idx) + ciii],
                sci_f[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }

        printf("##########\t##########\r\n");
    }

    printf("\r\n");

    for (int ci = ci_first; ci < ci_end && ci < atom->Nclusters_local; ci++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
        MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];

        for (int cii = 0; cii < CLUSTER_M; cii++) {
            printf("%f\t%f\t%f\r\n",
                ci_f[CL_X_OFFSET + cii],
                ci_f[CL_Y_OFFSET + cii],
                ci_f[CL_Z_OFFSET + cii]);
        }

        printf("##########\t##########\r\n");
    }

    printf("verifyLayout end\r\n");
}
|
||||
|
||||
/*
 * Debug helper: calls alignDataToSuperclusters(atom) and then prints a small,
 * fixed window of the position arrays in both layouts for visual comparison.
 *
 *  - super-clusters 4 and 5 are read from atom->scl_x, one line per atom slot,
 *    prefixed by the cluster position inside the super-cluster;
 *  - clusters 35 and 36 are read from atom->cl_x, prefixed by a header line
 *    holding the cluster index and atom->icluster_bin[ci].
 *
 * The windows are clamped to Nsclusters_local / Nclusters_local so that small
 * systems do not read past the populated part of the arrays.
 */
void checkAlignment(Atom *atom) {
    /* Fixed inspection windows, half-open: [first, end). */
    const int sci_first = 4;
    const int sci_end   = 6;
    const int ci_first  = 35;
    const int ci_end    = 37;

    alignDataToSuperclusters(atom);

    for (int sci = sci_first; sci < sci_end && sci < atom->Nsclusters_local; sci++) {
        MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* cl_idx: cluster inside the super-cluster, ciii: atom slot inside that cluster */
            const unsigned int cl_idx = cii / CLUSTER_M;
            const unsigned int ciii   = cii % CLUSTER_M;

            printf("%u\t%f\t%f\t%f\r\n",
                cl_idx,
                sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
                sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii],
                sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }
    }

    for (int ci = ci_first; ci < ci_end && ci < atom->Nclusters_local; ci++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
        MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];

        for (int cii = 0; cii < CLUSTER_M; cii++) {
            printf("%f\t%f\t%f\r\n",
                ci_x[CL_X_OFFSET + cii],
                ci_x[CL_Y_OFFSET + cii],
                ci_x[CL_Z_OFFSET + cii]);
        }

        printf("##########\t##########\r\n");
    }
}
|
||||
|
||||
/*
 * Debug helper: prints the positions stored in atom->scl_x for super-clusters
 * 4 and 5, one line per atom slot, prefixed by the cluster position inside the
 * super-cluster.
 *
 * The window is clamped to Nsclusters_local so that small systems do not read
 * past the populated part of the array.
 */
void showSuperclusters(Atom *atom) {
    /* Fixed inspection window, half-open: [first, end). */
    const int sci_first = 4;
    const int sci_end   = 6;

    for (int sci = sci_first; sci < sci_end && sci < atom->Nsclusters_local; sci++) {
        MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* cl_idx: cluster inside the super-cluster, ciii: atom slot inside that cluster */
            const unsigned int cl_idx = cii / CLUSTER_M;
            const unsigned int ciii   = cii % CLUSTER_M;

            printf("%u\t%f\t%f\t%f\r\n",
                cl_idx,
                sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
                sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii],
                sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }
    }
}
|
||||
|
||||
/*
 * Debug helper: prints the neighbor list of every local cluster, one cluster
 * per output line, entries separated by a single space.
 */
void printNeighs(Atom *atom, Neighbor *neighbor) {
    int ci = 0;

    while (ci < atom->Nclusters_local) {
        const int count = neighbor->numneigh[ci];
        /* Each cluster owns a row of maxneighs slots in the flat neighbor array. */
        const int row_start = ci * neighbor->maxneighs;

        for (int k = 0; k < count; ++k) {
            printf("%d ", neighbor->neighbors[row_start + k]);
        }

        printf("\r\n");
        ++ci;
    }
}
|
||||
|
||||
/*
 * Debug helper: for every local super-cluster, prints the entries of
 * atom->icluster_idx that belong to it, one super-cluster per output line.
 */
void printClusterIndices(Atom *atom) {
    int sci = 0;

    while (sci < atom->Nsclusters_local) {
        const int members = atom->siclusters[sci].nclusters;
        /* Each super-cluster owns SCLUSTER_SIZE consecutive slots in icluster_idx. */
        const int first_slot = SCLUSTER_SIZE * sci;

        for (int k = 0; k < members; ++k) {
            printf("%d ", atom->icluster_idx[first_slot + k]);
        }

        printf("\r\n");
        ++sci;
    }
}
|
||||
|
||||
/*
 * Debug helper: builds the neighbor lists with buildNeighbor(), keeps a copy,
 * clears the live lists, rebuilds them with buildNeighborGPU() and prints
 * "<num_diff>\t<neigh_diff>":
 *
 *   num_diff   - clusters whose neighbor count differs between the two builds
 *   neigh_diff - entries that differ, compared position by position over the
 *                part of each row that both builds filled
 *
 * On allocation failure a message is written to stderr and nothing is compared.
 */
void verifyNeigh(Atom *atom, Neighbor *neighbor) {

    buildNeighbor(atom, neighbor);

    /* Snapshot the sizes now: the saved copy must keep using the row stride of
     * the reference build even if the second build changes maxneighs. */
    const int nclusters    = atom->Nclusters_local;
    const int saved_stride = neighbor->maxneighs;
    const size_t n_rows    = nclusters > 0 ? (size_t) nclusters : 0;
    const size_t n_cols    = saved_stride > 0 ? (size_t) saved_stride : 0;
    const size_t n_entries = n_rows * n_cols;

    /* Request at least one element so that an empty system is not mistaken
     * for an allocation failure. */
    int *saved_numneigh  = malloc((n_rows ? n_rows : 1) * sizeof *saved_numneigh);
    int *saved_neighbors = malloc((n_entries ? n_entries : 1) * sizeof *saved_neighbors);

    if (saved_numneigh == NULL || saved_neighbors == NULL) {
        fprintf(stderr, "verifyNeigh: could not allocate comparison buffers\n");
        free(saved_numneigh);
        free(saved_neighbors);
        return;
    }

    /* Save the reference lists and zero the live ones, so that stale entries
     * cannot make the second build look correct. */
    for (int i = 0; i < nclusters; ++i) {
        const int neigh_num = neighbor->numneigh[i];
        saved_numneigh[i] = neigh_num;
        neighbor->numneigh[i] = 0;
        for (int j = 0; j < neigh_num; j++) {
            saved_neighbors[i * saved_stride + j] = neighbor->neighbors[i * saved_stride + j];
            neighbor->neighbors[i * saved_stride + j] = 0;
        }
    }

    buildNeighborGPU(atom, neighbor);

    unsigned int num_diff = 0;
    unsigned int neigh_diff = 0;

    for (int i = 0; i < nclusters; ++i) {
        const int neigh_num = neighbor->numneigh[i];
        if (saved_numneigh[i] != neigh_num) { num_diff++; }

        /* Only the common prefix was written in both lists; anything beyond it
         * is uninitialised in the saved copy and must not be read. */
        const int common = neigh_num < saved_numneigh[i] ? neigh_num : saved_numneigh[i];
        for (int j = 0; j < common; j++) {
            if (saved_neighbors[i * saved_stride + j] !=
                neighbor->neighbors[i * neighbor->maxneighs + j]) { neigh_diff++; }
        }
    }

    printf("%u\t%u\r\n", num_diff, neigh_diff);

    free(saved_neighbors);
    free(saved_numneigh);
}
|
||||
|
||||
#endif //USE_SUPER_CLUSTERS
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -15,8 +15,61 @@ void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
|
||||
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
|
||||
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
write_super_clusters_to_vtk_file(filename, atom, timestep);
|
||||
#endif //#ifdef USE_SUPER_CLUSTERS
|
||||
}
|
||||
|
||||
#ifdef USE_SUPER_CLUSTERS
|
||||
/*
 * Writes every atom slot of every local super-cluster as a point of a legacy
 * ASCII VTK unstructured grid, to "<filename>_sup_<timestep>.vtk".
 *
 * All SCLUSTER_M slots of each super-cluster are written, so the file holds
 * Nsclusters_local * SCLUSTER_M points. Each point gets one vertex cell
 * (cell type 1) and a "mass" scalar of 1.0; the CELLS, CELL_TYPES and
 * POINT_DATA sections are sized from that same point count so the data set is
 * self-consistent.
 *
 * Returns 0 on success, -1 if the output file cannot be opened.
 */
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
    FILE* fp = fopen(timestep_filename, "wb");

    if(fp == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    /* Number of points actually emitted below; every later section must agree with it. */
    const int npoints = atom->Nsclusters_local * SCLUSTER_M;

    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
    fprintf(fp, "POINTS %d double\n", npoints);
    for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {
        /* Every super-cluster is scaled by its own random integer in [1, 1000].
         * NOTE(review): presumably meant to pull super-clusters apart in the
         * viewer - confirm this is still wanted. */
        int factor = (rand() % 1000) + 1;

        int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
        for(int cii = 0; cii < SCLUSTER_M; ++cii) {
            fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
        }
    }
    fprintf(fp, "\n\n");

    /* One vertex cell per point: "1 <point index>" uses two integers per cell. */
    fprintf(fp, "CELLS %d %d\n", npoints, npoints * 2);
    for(int i = 0; i < npoints; ++i) {
        fprintf(fp, "1 %d\n", i);
    }
    fprintf(fp, "\n\n");

    fprintf(fp, "CELL_TYPES %d\n", npoints);
    for(int i = 0; i < npoints; ++i) {
        fprintf(fp, "1\n");
    }
    fprintf(fp, "\n\n");

    fprintf(fp, "POINT_DATA %d\n", npoints);
    fprintf(fp, "SCALARS mass double\n");
    fprintf(fp, "LOOKUP_TABLE default\n");
    for(int i = 0; i < npoints; i++) {
        fprintf(fp, "1.0\n");
    }
    fprintf(fp, "\n\n");
    fclose(fp);
    return 0;
}
|
||||
#endif //USE_SUPER_CLUSTERS
|
||||
|
||||
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
|
||||
char timestep_filename[128];
|
||||
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
16
include_CLANG.mk
Normal file
16
include_CLANG.mk
Normal file
@ -0,0 +1,16 @@
|
||||
CC = clang
|
||||
LINKER = $(CC)
|
||||
|
||||
ANSI_CFLAGS = -ansi
|
||||
ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
|
||||
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
|
||||
ASFLAGS = -masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE
|
||||
INCLUDES =
|
||||
LIBS = -lm #-lomp
|
20
include_GCC.mk
Normal file
20
include_GCC.mk
Normal file
@ -0,0 +1,20 @@
|
||||
CC = gcc
|
||||
LINKER = $(CC)
|
||||
|
||||
ANSI_CFLAGS = -ansi
|
||||
ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
|
||||
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
|
||||
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
INCLUDES = $(LIKWID_INC)
|
||||
LIBS = -lm
|
11
include_GROMACS.mk
Normal file
11
include_GROMACS.mk
Normal file
@ -0,0 +1,11 @@
|
||||
GROMACS_PATH=/apps/Gromacs/2018.1-mkl
|
||||
GROMACS_INC ?= -I${GROMACS_PATH}/include
|
||||
GROMACS_DEFINES ?=
|
||||
GROMACS_LIB ?= -L${GROMACS_PATH}/lib64
|
||||
|
||||
ifeq ($(strip $(XTC_OUTPUT)),true)
|
||||
INCLUDES += ${GROMACS_INC}
|
||||
DEFINES += ${GROMACS_DEFINES}
|
||||
LIBS += -lgromacs
|
||||
LFLAGS += ${GROMACS_LIB}
|
||||
endif
|
@ -1,27 +1,13 @@
|
||||
CC = icc
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = -qopenmp
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
|
||||
ifeq ($(ISA),AVX512)
|
||||
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX2)
|
||||
OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -march=core-avx2 $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),AVX)
|
||||
OPTS = -Ofast -xAVX $(PROFILE)
|
||||
endif
|
||||
|
||||
ifeq ($(ISA),SSE)
|
||||
OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
endif
|
||||
|
||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
#OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
|
18
include_ICX.mk
Normal file
18
include_ICX.mk
Normal file
@ -0,0 +1,18 @@
|
||||
CC = icx
|
||||
LINKER = $(CC)
|
||||
|
||||
OPENMP = #-qopenmp
|
||||
PROFILE = #-profile-functions -g -pg
|
||||
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
|
||||
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX $(PROFILE)
|
||||
#OPTS = -Ofast -xAVX2 $(PROFILE)
|
||||
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
|
||||
#OPTS = -Ofast -no-vec $(PROFILE)
|
||||
OPTS = -Ofast -xHost $(PROFILE)
|
||||
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
|
||||
ASFLAGS = #-masm=intel
|
||||
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
|
||||
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
|
||||
INCLUDES =
|
||||
LIBS = -lm
|
@ -9,15 +9,13 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
|
||||
__ISA_AVX_FMA__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX2)
|
||||
#__SIMD_KERNEL__=true
|
||||
__ISA_AVX2__=true
|
||||
#__SIMD_KERNEL__=true
|
||||
__SIMD_WIDTH_DBL__=4
|
||||
else ifeq ($(strip $(ISA)), AVX512)
|
||||
__ISA_AVX512__=true
|
||||
__SIMD_WIDTH_DBL__=8
|
||||
ifeq ($(strip $(DATA_TYPE)), DP)
|
||||
__SIMD_KERNEL__=true
|
||||
endif
|
||||
__SIMD_WIDTH_DBL__=8
|
||||
endif
|
||||
|
||||
# SIMD width is specified in double-precision, hence it may
|
@ -8,7 +8,8 @@ ANSI_CFLAGS += -Wextra
|
||||
|
||||
#
|
||||
# A100 + Native
|
||||
CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
#CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
# A40 + Native
|
||||
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
|
||||
# Cascade Lake
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -502,21 +502,6 @@ int readAtom_in(Atom* atom, Parameter* param) {
|
||||
return natoms;
|
||||
}
|
||||
|
||||
void writeAtom(Atom *atom, Parameter *param) {
|
||||
FILE *fp = fopen(param->write_atom_file, "w");
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
|
||||
atom->type[i], 1.0,
|
||||
atom_x(i), atom_y(i), atom_z(i),
|
||||
atom_vx(i), atom_vy(i), atom_vz(i));
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
|
||||
param->write_atom_file, param->xprd, param->yprd, param->zprd);
|
||||
}
|
||||
|
||||
void growAtom(Atom *atom) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
int nold = atom->Nmax;
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -29,7 +29,7 @@ extern "C" {
|
||||
}
|
||||
|
||||
// cuda kernel
|
||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
|
||||
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= Nlocal) {
|
||||
return;
|
||||
@ -46,10 +46,6 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neigh_neighbors[Nlocal * k + i];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
@ -59,7 +55,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * ntypes + type_j;
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
@ -113,7 +109,7 @@ extern "C" {
|
||||
|
||||
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
|
||||
@ -127,7 +123,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
|
||||
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
|
||||
@ -140,11 +136,13 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
|
||||
}
|
||||
|
||||
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
int Nlocal = atom->Nlocal;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
|
||||
/*
|
||||
int nDevices;
|
||||
@ -167,7 +165,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
|
||||
cuda_assert("calc_force", cudaPeekAtLastError());
|
||||
cuda_assert("calc_force", cudaDeviceSynchronize());
|
||||
cudaProfilerStop();
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
|
||||
|
||||
__global__ void compute_neighborhood(
|
||||
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(i >= nlocal) {
|
||||
@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
|
||||
|
||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
cudaProfilerStart();
|
||||
@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
|
||||
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
|
||||
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||
c_new_maxneighs,
|
||||
cutneighsq, atom->ntypes);
|
||||
cutneighsq);
|
||||
|
||||
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
|
||||
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
|
||||
if(reneigh) {
|
||||
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
|
||||
}
|
||||
|
||||
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
|
||||
const int num_threads_per_block = get_cuda_num_threads();
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -14,7 +14,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
|
||||
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||
@ -23,7 +22,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -31,12 +31,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
|
||||
double S = getTimeStamp();
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam_fp");
|
||||
|
||||
#pragma omp for
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@ -99,19 +95,13 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam_fp");
|
||||
}
|
||||
|
||||
// We still need to update fp for PBC atoms
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
fp[Nlocal + i] = fp[atom->border_map[i]];
|
||||
}
|
||||
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_START("force_eam");
|
||||
|
||||
#pragma omp for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
@ -202,8 +192,6 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force_eam");
|
||||
}
|
||||
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
250
lammps/force_lj.c
Normal file
250
lammps/force_lj.c
Normal file
@ -0,0 +1,250 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <likwid-marker.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
|
||||
#ifdef __SIMD_KERNEL__
|
||||
#include <simd.h>
|
||||
#endif
|
||||
|
||||
/*
 * Scalar Lennard-Jones force kernel operating on full neighbor lists.
 *
 * Clears the force entries of the first Nlocal atoms, then, for every local
 * atom i, sums the pair force of each listed neighbor j with
 * rsq < cutforcesq and adds the result to atom_f{x,y,z}(i).
 * With EXPLICIT_TYPES the cutoff, sigma6 and epsilon are read per type pair
 * from the tables in `atom`; otherwise the global values in `param` are used.
 *
 * Returns the difference between the getTimeStamp() values taken right before
 * and right after the kernel loop (the zeroing loop is not included).
 */
double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
#ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
#endif

    for(int i = 0; i < Nlocal; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }

    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

    // NOTE(review): addStat() below runs inside the parallel loop; confirm it is
    // a no-op or otherwise safe when statistics are enabled together with OpenMP.
    #pragma omp parallel for
    for(int i = 0; i < Nlocal; i++) {
        // Declared inside the loop body so that every OpenMP thread owns its
        // own pointer; a function-scope variable would be shared and raced on.
        const int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
        MD_FLOAT xtmp = atom_x(i);
        MD_FLOAT ytmp = atom_y(i);
        MD_FLOAT ztmp = atom_z(i);
        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;

#ifdef EXPLICIT_TYPES
        const int type_i = atom->type[i];
#endif

        for(int k = 0; k < numneighs; k++) {
            int j = neighs[k];
            MD_FLOAT delx = xtmp - atom_x(j);
            MD_FLOAT dely = ytmp - atom_y(j);
            MD_FLOAT delz = ztmp - atom_z(j);
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

#ifdef EXPLICIT_TYPES
            // Row-major lookup into the ntypes x ntypes parameter tables
            const int type_j = atom->type[j];
            const int type_ij = type_i * atom->ntypes + type_j;
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif

            if(rsq < cutforcesq) {
                MD_FLOAT sr2 = 1.0 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;
#ifdef USE_REFERENCE_VERSION
                addStat(stats->atoms_within_cutoff, 1);
            } else {
                addStat(stats->atoms_outside_cutoff, 1);
#endif
            }
        }

        // Only atom i is written here, so iterations do not touch each other's forces
        atom_fx(i) += fix;
        atom_fy(i) += fiy;
        atom_fz(i) += fiz;

        addStat(stats->total_force_neighs, numneighs);
        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
    }

    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    return E-S;
}
|
||||
|
||||
/*
 * Lennard-Jones force kernel for half neighbor lists.
 *
 * Each listed pair (i, j) is evaluated once: the pair force is added to atom i
 * and subtracted from atom j, the latter only when j < Nlocal.
 * The loop over atoms is serial.
 *
 * Returns the difference between the getTimeStamp() values taken right before
 * and right after the kernel loop.
 */
double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    const int nlocal = atom->Nlocal;
    int *nbrs;
#ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
#endif

    // Start from a clean force array for all local atoms
    for(int a = 0; a < nlocal; a++) {
        atom_fx(a) = 0.0;
        atom_fy(a) = 0.0;
        atom_fz(a) = 0.0;
    }

    const double t_start = getTimeStamp();
    LIKWID_MARKER_START("forceLJ-halfneigh");

    for(int i = 0; i < nlocal; i++) {
        nbrs = &neighbor->neighbors[i * neighbor->maxneighs];
        int nnbrs = neighbor->numneigh[i];
        MD_FLOAT xi = atom_x(i);
        MD_FLOAT yi = atom_y(i);
        MD_FLOAT zi = atom_z(i);
        MD_FLOAT fxi = 0;
        MD_FLOAT fyi = 0;
        MD_FLOAT fzi = 0;

#ifdef EXPLICIT_TYPES
        const int type_i = atom->type[i];
#endif

        // Pragma required to vectorize the inner loop
#ifdef ENABLE_OMP_SIMD
        #pragma omp simd reduction(+: fxi,fyi,fzi)
#endif
        for(int k = 0; k < nnbrs; k++) {
            int j = nbrs[k];
            MD_FLOAT delx = xi - atom_x(j);
            MD_FLOAT dely = yi - atom_y(j);
            MD_FLOAT delz = zi - atom_z(j);
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

#ifdef EXPLICIT_TYPES
            const int type_j = atom->type[j];
            const int type_ij = type_i * atom->ntypes + type_j;
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif

            // Written as a negated "<" so that a NaN distance is skipped as well
            if(!(rsq < cutforcesq)) {
                continue;
            }

            MD_FLOAT sr2 = 1.0 / rsq;
            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
            fxi += delx * force;
            fyi += dely * force;
            fzi += delz * force;

            // We do not need to update forces for ghost atoms
            if(j < nlocal) {
                atom_fx(j) -= delx * force;
                atom_fy(j) -= dely * force;
                atom_fz(j) -= delz * force;
            }
        }

        atom_fx(i) += fxi;
        atom_fy(i) += fyi;
        atom_fz(i) += fzi;

        addStat(stats->total_force_neighs, nnbrs);
        addStat(stats->total_force_iters, (nnbrs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
    }

    LIKWID_MARKER_STOP("forceLJ-halfneigh");
    const double t_end = getTimeStamp();
    return t_end - t_start;
}
|
||||
|
||||
/*
 * Lennard-Jones force kernel on full neighbor lists using the explicit SIMD
 * wrappers from simd.h.
 *
 * The neighbor list of each local atom is processed VECTOR_WIDTH entries at a
 * time. Lanes past the end of the list are disabled with a mask, and the same
 * mask is combined with the cutoff test before forces are accumulated.
 * Only the global cutoff / sigma6 / epsilon from `param` are used here.
 * When __SIMD_KERNEL__ is not defined the function prints an error and exits.
 *
 * Returns the difference between the getTimeStamp() values taken right before
 * and right after the kernel section.
 */
double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    for(int i = 0; i < Nlocal; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }

    double S = getTimeStamp();
    LIKWID_MARKER_START("force");

#ifndef __SIMD_KERNEL__
    fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
    exit(-1);
#else
    // Loop-invariant constants, broadcast once to all lanes
    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);

    #pragma omp parallel for
    for(int i = 0; i < Nlocal; i++) {
        // Declared inside the loop body so that every OpenMP thread owns its
        // own pointer; a function-scope variable would be shared and raced on.
        int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
        MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
        MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i));
        MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i));
        MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i));
        MD_SIMD_FLOAT fix = simd_zero();
        MD_SIMD_FLOAT fiy = simd_zero();
        MD_SIMD_FLOAT fiz = simd_zero();

        for(int k = 0; k < numneighs; k += VECTOR_WIDTH) {
            // If the last iteration of this loop is separated from the rest, this mask can be set only there
            MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(simd_int_add(simd_int_broadcast(k), simd_int_seq()), numneighs_vec);
            MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
#ifdef AOS
            // AoS layout: x/y/z of atom j are interleaved at offsets 3*j + {0,1,2}
            MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3
            MD_SIMD_FLOAT delx = xtmp - simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
            MD_SIMD_FLOAT dely = ytmp - simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
            MD_SIMD_FLOAT delz = ztmp - simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
#else
            MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
            MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
            MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
#endif
            MD_SIMD_FLOAT rsq = simd_fma(delx, delx, simd_fma(dely, dely, simd_mul(delz, delz)));
            // A lane contributes only if it is a real neighbor AND inside the cutoff
            MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, simd_mask_cond_lt(rsq, cutforcesq_vec));
            MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq);
            MD_SIMD_FLOAT sr6 = simd_mul(sr2, simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
            MD_SIMD_FLOAT force = simd_mul(c48_vec, simd_mul(sr6, simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));

            fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
            fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
            fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
        }

        // Collapse the per-lane partial sums into the scalar force of atom i
        atom_fx(i) += simd_h_reduce_sum(fix);
        atom_fy(i) += simd_h_reduce_sum(fiy);
        atom_fz(i) += simd_h_reduce_sum(fiz);
    }
#endif

    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    return E-S;
}
|
102
lammps/includes/atom.h
Normal file
102
lammps/includes/atom.h
Normal file
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <parameter.h>
|
||||
|
||||
#ifndef __ATOM_H_
|
||||
#define __ATOM_H_
|
||||
|
||||
/*
 * Compile-time kernel dispatch.
 *
 * KERNEL_NAME and computeForceLJFullNeigh select between the CUDA, SIMD and
 * plain-C force kernels; CUDA_TARGET takes precedence over USE_SIMD_KERNEL.
 */
#if defined(CUDA_TARGET)
#   define KERNEL_NAME              "CUDA"
#   define computeForceLJFullNeigh  computeForceLJFullNeigh_cuda
#elif defined(USE_SIMD_KERNEL)
#   define KERNEL_NAME              "SIMD"
#   define computeForceLJFullNeigh  computeForceLJFullNeigh_simd
#else
#   define KERNEL_NAME              "plain-C"
#   define computeForceLJFullNeigh  computeForceLJFullNeigh_plain_c
#endif

/* The remaining entry points only distinguish between the CUDA and CPU builds. */
#if defined(CUDA_TARGET)
#   define initialIntegrate         initialIntegrate_cuda
#   define finalIntegrate           finalIntegrate_cuda
#   define buildNeighbor            buildNeighbor_cuda
#   define updatePbc                updatePbc_cuda
#   define updateAtomsPbc           updateAtomsPbc_cuda
#else
#   define initialIntegrate         initialIntegrate_cpu
#   define finalIntegrate           finalIntegrate_cpu
#   define buildNeighbor            buildNeighbor_cpu
#   define updatePbc                updatePbc_cpu
#   define updateAtomsPbc           updateAtomsPbc_cpu
#endif
|
||||
|
||||
typedef struct {
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
MD_FLOAT *fx, *fy, *fz;
|
||||
int *border_map;
|
||||
int *type;
|
||||
MD_FLOAT *epsilon;
|
||||
MD_FLOAT *sigma6;
|
||||
MD_FLOAT *cutforcesq;
|
||||
MD_FLOAT *cutneighsq;
|
||||
} DeviceAtom;
|
||||
|
||||
typedef struct {
|
||||
int Natoms, Nlocal, Nghost, Nmax;
|
||||
MD_FLOAT *x, *y, *z;
|
||||
MD_FLOAT *vx, *vy, *vz;
|
||||
MD_FLOAT *fx, *fy, *fz;
|
||||
int *border_map;
|
||||
int *type;
|
||||
int ntypes;
|
||||
MD_FLOAT *epsilon;
|
||||
MD_FLOAT *sigma6;
|
||||
MD_FLOAT *cutforcesq;
|
||||
MD_FLOAT *cutneighsq;
|
||||
|
||||
// DEM
|
||||
MD_FLOAT *radius;
|
||||
MD_FLOAT *av;
|
||||
MD_FLOAT *r;
|
||||
|
||||
// Device data
|
||||
DeviceAtom d_atom;
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
extern void createAtom(Atom*, Parameter*);
|
||||
extern int readAtom(Atom*, Parameter*);
|
||||
extern int readAtom_pdb(Atom*, Parameter*);
|
||||
extern int readAtom_gro(Atom*, Parameter*);
|
||||
extern int readAtom_dmp(Atom*, Parameter*);
|
||||
extern int readAtom_in(Atom*, Parameter*);
|
||||
extern void growAtom(Atom*);
|
||||
|
||||
/*
 * Per-atom component accessors. Every macro expands against a variable named
 * `atom` that must be in scope at the point of use.
 */
#if defined(AOS)
/* Array-of-structs: three consecutive values per atom in x / vx / fx. */
#   define POS_DATA_LAYOUT  "AoS"
#   define atom_x(i)        atom->x[(i) * 3 + 0]
#   define atom_y(i)        atom->x[(i) * 3 + 1]
#   define atom_z(i)        atom->x[(i) * 3 + 2]
#   define atom_vx(i)       atom->vx[(i) * 3 + 0]
#   define atom_vy(i)       atom->vx[(i) * 3 + 1]
#   define atom_vz(i)       atom->vx[(i) * 3 + 2]
#   define atom_fx(i)       atom->fx[(i) * 3 + 0]
#   define atom_fy(i)       atom->fx[(i) * 3 + 1]
#   define atom_fz(i)       atom->fx[(i) * 3 + 2]
#else
/* Struct-of-arrays: one separate array per component. */
#   define POS_DATA_LAYOUT  "SoA"
#   define atom_x(i)        atom->x[(i)]
#   define atom_y(i)        atom->y[(i)]
#   define atom_z(i)        atom->z[(i)]
#   define atom_vx(i)       atom->vx[(i)]
#   define atom_vy(i)       atom->vy[(i)]
#   define atom_vz(i)       atom->vz[(i)]
#   define atom_fx(i)       atom->fx[(i)]
#   define atom_fy(i)       atom->fy[(i)]
#   define atom_fz(i)       atom->fz[(i)]
#endif
|
||||
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -11,7 +11,7 @@
|
||||
|
||||
#ifndef __PBC_H_
|
||||
#define __PBC_H_
|
||||
extern void initPbc(Atom*);
|
||||
extern void initPbc();
|
||||
extern void updatePbc_cpu(Atom*, Parameter*, bool);
|
||||
extern void updateAtomsPbc_cpu(Atom*, Parameter*);
|
||||
extern void setupPbc(Atom*, Parameter*);
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
@ -59,6 +59,12 @@ void init(Parameter *param) {
|
||||
param->eam_file = NULL;
|
||||
}
|
||||
|
||||
// Show debug messages
|
||||
#define DEBUG(msg) printf(msg)
|
||||
// Do not show debug messages
|
||||
//#define DEBUG(msg)
|
||||
|
||||
|
||||
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
|
||||
const int maxneighs = nneighs * nreps;
|
||||
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
|
||||
@ -119,7 +125,7 @@ int main(int argc, const char *argv[]) {
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
DEBUG_MESSAGE("Initializing parameters...\n");
|
||||
DEBUG("Initializing parameters...\n");
|
||||
init(¶m);
|
||||
|
||||
for(int i = 0; i < argc; i++) {
|
||||
@ -190,11 +196,11 @@ int main(int argc, const char *argv[]) {
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
DEBUG_MESSAGE("Initializing EAM parameters...\n");
|
||||
DEBUG("Initializing EAM parameters...\n");
|
||||
initEam(&eam, ¶m);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing atoms...\n");
|
||||
DEBUG("Initializing atoms...\n");
|
||||
initAtom(atom);
|
||||
initStats(&stats);
|
||||
|
||||
@ -210,7 +216,7 @@ int main(int argc, const char *argv[]) {
|
||||
atom->cutforcesq[i] = param.cutforce * param.cutforce;
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Creating atoms...\n");
|
||||
DEBUG("Creating atoms...\n");
|
||||
for(int i = 0; i < natoms; ++i) {
|
||||
while(atom->Nlocal > atom->Nmax - natoms) {
|
||||
growAtom(atom);
|
||||
@ -241,11 +247,11 @@ int main(int argc, const char *argv[]) {
|
||||
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("Initializing neighbor lists...\n");
|
||||
DEBUG("Initializing neighbor lists...\n");
|
||||
initNeighbor(&neighbor, ¶m);
|
||||
DEBUG_MESSAGE("Creating neighbor lists...\n");
|
||||
DEBUG("Creating neighbor lists...\n");
|
||||
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
|
||||
DEBUG_MESSAGE("Computing forces...\n");
|
||||
DEBUG("Computing forces...\n");
|
||||
|
||||
double T_accum = 0.0;
|
||||
for(int i = 0; i < param.ntimes; i++) {
|
285
lammps/main.c
Normal file
285
lammps/main.c
Normal file
@ -0,0 +1,285 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <eam.h>
|
||||
#include <integrate.h>
|
||||
#include <thermo.h>
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <pbc.h>
|
||||
#include <stats.h>
|
||||
#include <timers.h>
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
|
||||
#endif
|
||||
|
||||
/*
 * One-time simulation set-up: derives the lattice constant and box extents
 * from param, initializes atoms / PBC / stats / neighbor data, creates or
 * reads the atoms, and builds the first neighbor list.
 *
 * Returns the difference between the getTimeStamp() values taken before
 * initAtom() and after buildNeighbor(); EAM initialization and the box-size
 * computation happen before the first timestamp.
 */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
    if(param->force_field == FF_EAM) {
        initEam(eam, param);
    }

    // Box extents follow from the lattice constant and the cell counts
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    const double t_begin = getTimeStamp();

    initAtom(atom);
    initPbc(atom);
    initStats(stats);
    initNeighbor(neighbor, param);

    // Without an input file the atoms are generated, otherwise read from it
    if(param->input_file != NULL) {
        readAtom(atom, param);
    } else {
        createAtom(atom, param);
    }

    setupNeighbor(param);
    setupThermo(param, atom->Natoms);
    if(param->input_file == NULL) {
        adjustThermo(param, atom);
    }

    setupPbc(atom, param);
    initDevice(atom, neighbor);
    updatePbc(atom, param, true);
    buildNeighbor(atom, neighbor);

    const double t_end = getTimeStamp();
    return t_end - t_begin;
}
|
||||
|
||||
/*
 * Reneighboring step: updates the atoms for PBC, sets up and updates the PBC
 * data again, then rebuilds the neighbor list.
 *
 * Returns the difference between the getTimeStamp() values taken before the
 * "reneighbour" marker region starts and after it stops.
 */
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
    const double t_begin = getTimeStamp();

    LIKWID_MARKER_START("reneighbour");
    updateAtomsPbc(atom, param);
    setupPbc(atom, param);
    updatePbc(atom, param, true);
    // sortAtom(atom) is intentionally left disabled here
    buildNeighbor(atom, neighbor);
    LIKWID_MARKER_STOP("reneighbour");

    const double t_end = getTimeStamp();
    return t_end - t_begin;
}
|
||||
|
||||
/*
 * Debug helper: prints the four atom counters of `atom` on one line.
 */
void printAtomState(Atom *atom) {
    const int natoms = atom->Natoms;
    const int nlocal = atom->Nlocal;
    const int nghost = atom->Nghost;
    const int nmax   = atom->Nmax;

    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", natoms, nlocal, nghost, nmax);

    // Disabled per-atom position dump (SoA layout), kept for debugging:
    //   int nall = atom->Nlocal + atom->Nghost;
    //   for (int i=0; i<nall; i++) {
    //       printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
    //   }
}
|
||||
|
||||
/*
 * Dispatches to the force kernel selected by param->force_field and
 * param->half_neigh and returns that kernel's return value.
 *
 * DEM combined with half neighbor lists is not supported: an error is printed
 * to stderr and 0.0 is returned without computing forces.
 */
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    if(param->force_field == FF_EAM) {
        return computeForceEam(eam, param, atom, neighbor, stats);
    }

    if(param->force_field == FF_DEM) {
        if(!param->half_neigh) {
            return computeForceDemFullNeigh(param, atom, neighbor, stats);
        }

        fprintf(stderr, "Error: DEM cannot use half neighbor-lists!\n");
        return 0.0;
    }

    // Everything else is treated as Lennard-Jones
    if(param->half_neigh) {
        return computeForceLJHalfNeigh(param, atom, neighbor, stats);
    }

    // computeForceLJFullNeigh is a macro; its CUDA variant takes no Stats argument
#ifdef CUDA_TARGET
    return computeForceLJFullNeigh(param, atom, neighbor);
#else
    return computeForceLJFullNeigh(param, atom, neighbor, stats);
#endif
}
|
||||
|
||||
/*
 * Writes the current state to "input.in" in the working directory.
 *
 * The first line holds the box extents as "0,xprd,0,yprd,0,zprd". Each
 * following line describes one local atom as "1,x,y,z,vx,vy,vz".
 *
 * If the file cannot be opened, or closing it reports an error, a message is
 * printed to stderr and the function returns without terminating the program.
 */
void writeInput(Parameter *param, Atom *atom) {
    FILE *fpin = fopen("input.in", "w");

    // fopen() returns NULL on failure; writing through it would be undefined behavior
    if(fpin == NULL) {
        fprintf(stderr, "Error: could not open input.in for writing!\n");
        return;
    }

    fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);

    for(int i = 0; i < atom->Nlocal; i++) {
        fprintf(fpin, "1,%f,%f,%f,%f,%f,%f\n", atom_x(i), atom_y(i), atom_z(i), atom_vx(i), atom_vy(i), atom_vz(i));
    }

    // Buffered data is flushed on close, so a failure here means the file is incomplete
    if(fclose(fpin) != 0) {
        fprintf(stderr, "Error: could not finish writing input.in!\n");
    }
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
double timer[NUMTIMER];
|
||||
Eam eam;
|
||||
Atom atom;
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
{
|
||||
LIKWID_MARKER_REGISTER("force");
|
||||
//LIKWID_MARKER_REGISTER("reneighbour");
|
||||
//LIKWID_MARKER_REGISTER("pbc");
|
||||
}
|
||||
|
||||
initParameter(¶m);
|
||||
for(int i = 0; i < argc; i++) {
|
||||
if((strcmp(argv[i], "-p") == 0)) {
|
||||
readParameter(¶m, argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-f") == 0)) {
|
||||
if((param.force_field = str2ff(argv[++i])) < 0) {
|
||||
fprintf(stderr, "Invalid force field!\n");
|
||||
exit(-1);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-i") == 0)) {
|
||||
param.input_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-e") == 0)) {
|
||||
param.eam_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
|
||||
param.ntimes = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nx") == 0)) {
|
||||
param.nx = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-ny") == 0)) {
|
||||
param.ny = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-nz") == 0)) {
|
||||
param.nz = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-half") == 0)) {
|
||||
param.half_neigh = atoi(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
|
||||
param.cutforce = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
|
||||
param.skin = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--freq") == 0)) {
|
||||
param.proc_freq = atof(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "--vtk") == 0)) {
|
||||
param.vtk_file = strdup(argv[++i]);
|
||||
continue;
|
||||
}
|
||||
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
|
||||
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
|
||||
printf(HLINE);
|
||||
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
|
||||
printf("-f <string>: force field (lj, eam or dem), default lj\n");
|
||||
printf("-i <string>: input file with atom positions (dump)\n");
|
||||
printf("-e <string>: input file for EAM\n");
|
||||
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
|
||||
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
|
||||
printf("-r / --radius <real>: set cutoff radius\n");
|
||||
printf("-s / --skin <real>: set skin (verlet buffer)\n");
|
||||
printf("--freq <real>: processor frequency (GHz)\n");
|
||||
printf("--vtk <string>: VTK file for visualization\n");
|
||||
printf(HLINE);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
param.cutneigh = param.cutforce + param.skin;
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
printParameter(¶m);
|
||||
printf(HLINE);
|
||||
|
||||
printf("step\ttemp\t\tpressure\n");
|
||||
computeThermo(0, ¶m, &atom);
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
//writeInput(¶m, &atom);
|
||||
|
||||
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
|
||||
if(param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
||||
}
|
||||
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
bool reneigh = (n + 1) % param.reneigh_every == 0;
|
||||
initialIntegrate(reneigh, ¶m, &atom);
|
||||
if((n + 1) % param.reneigh_every) {
|
||||
updatePbc(&atom, ¶m, false);
|
||||
} else {
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
}
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
finalIntegrate(reneigh, ¶m, &atom);
|
||||
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
#ifdef CUDA_TARGET
|
||||
memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
|
||||
#endif
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
|
||||
if(param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
|
||||
}
|
||||
}
|
||||
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
||||
LIKWID_MARKER_CLOSE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
171
lammps/pbc.c
Normal file
171
lammps/pbc.c
Normal file
@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
//---
|
||||
#include <pbc.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
|
||||
/* Number of entries added to the ghost bookkeeping arrays on each growPbc() call */
#define DELTA 20000
|
||||
|
||||
/* Current capacity, in entries, of border_map and of PBCx/PBCy/PBCz */
int NmaxGhost;
|
||||
/* Per-ghost integer multipliers of the box lengths (xprd/yprd/zprd);
 * updatePbc_cpu adds multiplier * box length to the source atom's coordinate */
int *PBCx, *PBCy, *PBCz;
|
||||
|
||||
/* Enlarges border_map and the PBCx/PBCy/PBCz arrays by DELTA entries */
static void growPbc(Atom*);
|
||||
|
||||
/* exported subroutines */
|
||||
/* Reset the ghost-atom bookkeeping to the "nothing allocated" state:
 * zero capacity and no mapping/offset arrays. */
void initPbc(Atom* atom) {
    PBCx = NULL;
    PBCy = NULL;
    PBCz = NULL;
    atom->border_map = NULL;
    NmaxGhost = 0;
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cpu(Atom *atom, Parameter *param, bool doReneighbor) {
|
||||
int *border_map = atom->border_map;
|
||||
int nlocal = atom->Nlocal;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc_cpu(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
if(atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if(atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if(atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if(atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if(atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if(atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup periodic boundary conditions by
|
||||
* defining ghost atoms around domain
|
||||
* only creates mapping and coordinate corrections
|
||||
* that are then enforced in updatePbc */
|
||||
#define ADDGHOST(dx,dy,dz) \
|
||||
Nghost++; \
|
||||
border_map[Nghost] = i; \
|
||||
PBCx[Nghost] = dx; \
|
||||
PBCy[Nghost] = dy; \
|
||||
PBCz[Nghost] = dz; \
|
||||
atom->type[atom->Nlocal + Nghost] = atom->type[i]
|
||||
|
||||
void setupPbc(Atom *atom, Parameter *param) {
|
||||
int *border_map = atom->border_map;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT Cutneigh = param->cutneigh;
|
||||
int Nghost = -1;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
|
||||
if (Nghost + 7 >= NmaxGhost) {
|
||||
growPbc(atom);
|
||||
border_map = atom->border_map;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if(param->pbc_x != 0) {
|
||||
if (x < Cutneigh) { ADDGHOST(+1,0,0); }
|
||||
if (x >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
|
||||
}
|
||||
|
||||
if(param->pbc_y != 0) {
|
||||
if (y < Cutneigh) { ADDGHOST(0,+1,0); }
|
||||
if (y >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
|
||||
}
|
||||
|
||||
if(param->pbc_z != 0) {
|
||||
if (z < Cutneigh) { ADDGHOST(0,0,+1); }
|
||||
if (z >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
|
||||
}
|
||||
|
||||
/* 8 corners */
|
||||
if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
|
||||
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); }
|
||||
if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); }
|
||||
if (x < Cutneigh && y >= Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
|
||||
if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); }
|
||||
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); }
|
||||
if (x >= (xprd-Cutneigh) && y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
|
||||
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
|
||||
}
|
||||
|
||||
/* 12 edges */
|
||||
if(param->pbc_x != 0 && param->pbc_z != 0) {
|
||||
if (x < Cutneigh && z < Cutneigh) { ADDGHOST(+1,0,+1); }
|
||||
if (x < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
|
||||
if (x >= (xprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,0,+1); }
|
||||
if (x >= (xprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
|
||||
}
|
||||
|
||||
if(param->pbc_y != 0 && param->pbc_z != 0) {
|
||||
if (y < Cutneigh && z < Cutneigh) { ADDGHOST(0,+1,+1); }
|
||||
if (y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
|
||||
if (y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(0,-1,+1); }
|
||||
if (y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
|
||||
}
|
||||
|
||||
if(param->pbc_x != 0 && param->pbc_y != 0) {
|
||||
if (y < Cutneigh && x < Cutneigh) { ADDGHOST(+1,+1,0); }
|
||||
if (y < Cutneigh && x >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
|
||||
if (y >= (yprd-Cutneigh) && x < Cutneigh) { ADDGHOST(+1,-1,0); }
|
||||
if (y >= (yprd-Cutneigh) && x >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
||||
}
|
||||
}
|
||||
// increase by one to make it the ghost atom count
|
||||
atom->Nghost = Nghost + 1;
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
/* Enlarge the ghost bookkeeping arrays (border_map and PBCx/PBCy/PBCz)
 * by DELTA entries and record the new capacity in NmaxGhost. */
void growPbc(Atom* atom) {
    const int oldCapacity = NmaxGhost;
    NmaxGhost = oldCapacity + DELTA;

    const size_t oldBytes = oldCapacity * sizeof(int);
    const size_t newBytes = NmaxGhost * sizeof(int);

    atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, newBytes, oldBytes);
    PBCx = (int*) reallocate(PBCx, ALIGNMENT, newBytes, oldBytes);
    PBCy = (int*) reallocate(PBCy, ALIGNMENT, newBytes, oldBytes);
    PBCz = (int*) reallocate(PBCz, ALIGNMENT, newBytes, oldBytes);
}
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user