Compare commits

..

No commits in common. "main" and "gromacs_masking" have entirely different histories.

156 changed files with 31874 additions and 1660 deletions

View File

@ -1,176 +0,0 @@
---
# clang-format configuration for the Cpp language family.
# Headline settings in this file: 4-column indentation (IndentWidth),
# 90-column lines (ColumnLimit) and no tab characters (UseTab: Never).
Language: Cpp
# BasedOnStyle: WebKit
AccessModifierOffset: -4
AlignAfterOpenBracket: DontAlign
AlignArrayOfStructures: None
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveBitFields: None
AlignConsecutiveDeclarations: None
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Right
AlignOperands: Align
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortEnumsOnASingleLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: OnlyFirstIf
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
AttributeMacros:
- __capability
BinPackArguments: false
BinPackParameters: false
# Per-construct brace placement; AfterFunction is the only entry set to true.
# NOTE(review): BreakBeforeBraces below is WebKit, not Custom — confirm whether
# these BraceWrapping entries are actually honoured by the clang-format in use.
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: true
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: WebKit
# NOTE(review): BreakBeforeInheritanceComma and
# BreakConstructorInitializersBeforeComma appear next to BreakInheritanceList and
# BreakConstructorInitializers; they look like older spellings of the same
# settings — confirm which ones the targeted clang-format version reads.
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeComma
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 90
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DeriveLineEnding: true
DerivePointerAlignment: false
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
# NOTE(review): BasedOnStyle is explicitly empty here although the commented-out
# line near the top names WebKit — presumably every option is meant to be spelled
# out in this file; confirm.
BasedOnStyle: ''
ConstructorInitializerAllOnOneLineOrOnePerLine: false
AllowAllConstructorInitializersOnNextLine: true
FixNamespaceComments: false
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IfMacros:
- KJ_IF_MAYBE
# Include handling: existing include blocks are preserved (IncludeBlocks) and
# sorted case-sensitively (SortIncludes, further down). Categories: llvm/clang
# project headers have Priority 2, angle-bracket and gtest/gmock/isl/json headers
# Priority 3, and the catch-all '.*' pattern Priority 1.
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
Priority: 3
SortPriority: 0
CaseSensitive: false
- Regex: '.*'
Priority: 1
SortPriority: 0
CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseLabels: false
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentExternBlock: AfterExternBlock
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertTrailingCommas: None
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: true
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
# Line-breaking penalties. PenaltyExcessCharacter is several orders of magnitude
# larger than every other penalty in this list.
PenaltyBreakAssignment: 200
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PenaltyIndentedWhitespace: 0
# Pointers bind to the type (Left); references follow the pointer setting.
PointerAlignment: Left
PPIndentWidth: -1
ReferenceAlignment: Pointer
ReflowComments: true
ShortNamespaceLines: 1
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: true
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceAroundPointerQualifiers: Default
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
BitFieldColonSpacing: Both
Standard: Latest
StatementAttributeLikeMacros:
- Q_EMIT
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
# Tabs are never emitted (UseTab: Never); TabWidth is still set to 8.
TabWidth: 8
UseCRLF: false
UseTab: Never
WhitespaceSensitiveMacros:
- STRINGIZE
- PP_STRINGIZE
- BOOST_PP_STRINGIZE
- NS_SWIFT_NAME
- CF_SWIFT_NAME
...

View File

@ -1,14 +0,0 @@
---
# clang-tidy configuration: compiler diagnostics, static-analyzer checks,
# bugprone checks and identifier-naming enforcement.
#
# The bugprone group is spelled 'bugprone-*'. There is no 'clang-bugprone-*'
# group, so that glob silently enabled nothing.
Checks: 'clang-diagnostic-*,clang-analyzer-*,bugprone-*,readability-identifier-naming'
# WarningsAsErrors is a glob list of check names, not a boolean: the literal
# 'true' matches no check. '*' promotes every enabled check to an error.
WarningsAsErrors: '*'
# Report diagnostics from every header, not only from the main source file.
HeaderFilterRegex: '.*'
# The deprecated, unused AnalyzeTemporaryDtors option is intentionally omitted;
# newer clang-tidy releases reject it as an unknown key.
CheckOptions:
  # Naming scheme enforced by readability-identifier-naming.
  - key: readability-identifier-naming.StructCase
    value: 'CamelCase'
  - key: readability-identifier-naming.FunctionCase
    value: 'camelBack'
  - key: readability-identifier-naming.VariableCase
    value: 'camelBack'
  - key: readability-identifier-naming.GlobalConstantCase
    value: 'UPPER_CASE'

View File

@ -1,3 +0,0 @@
# clangd configuration. CompileFlags.Add lists extra flags: two include
# directories and the -DALIGNMENT=64 define; Compiler is set to clang.
# NOTE(review): both -I paths are absolute and point below /Users/jan/ —
# presumably valid on one developer machine only; confirm before relying on
# this file elsewhere.
CompileFlags:
Add: [-I/Users/jan/prg/MD-Bench/src/verletlist/, -I/Users/jan/prg/MD-Bench/src/common/, -DALIGNMENT=64]
Compiler: clang

129
Makefile
View File

@ -1,32 +1,116 @@
#CONFIGURE BUILD SYSTEM
TAG = $(OPT_TAG)-$(TOOLCHAIN)-$(DATA_TYPE)
TARGET = MDBench-$(TAG)
BUILD_DIR = ./build/build-$(TAG)
SRC_ROOT = ./src
SRC_DIR = $(SRC_ROOT)/$(OPT_SCHEME)
COMMON_DIR = $(SRC_ROOT)/common
CUDA_DIR = $(SRC_DIR)/cuda
MAKE_DIR = ./make
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
TARGET = MDBench-$(IDENTIFIER)
BUILD_DIR = ./build-$(IDENTIFIER)
SRC_DIR = ./$(OPT_SCHEME)
ASM_DIR = ./asm
COMMON_DIR = ./common
CUDA_DIR = ./$(SRC_DIR)/cuda
MAKE_DIR = ./
Q ?= @
#DO NOT EDIT BELOW
include config.mk
include $(MAKE_DIR)/include_$(TOOLCHAIN).mk
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
ifneq ($(strip $(ISA)),NONE)
include $(MAKE_DIR)/include_ISA.mk
endif
INCLUDES += -I./$(SRC_DIR) -I./$(COMMON_DIR)
include $(MAKE_DIR)/include_GROMACS.mk
INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes
VPATH = $(SRC_DIR) $(COMMON_DIR) $(CUDA_DIR)
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
DEFINES += -DPRECISION=2
endif
ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
ifeq ($(strip $(MEM_TRACER)),true)
DEFINES += -DMEM_TRACER
endif
ifeq ($(strip $(INDEX_TRACER)),true)
DEFINES += -DINDEX_TRACER
endif
ifeq ($(strip $(COMPUTE_STATS)),true)
DEFINES += -DCOMPUTE_STATS
endif
ifeq ($(strip $(XTC_OUTPUT)),true)
DEFINES += -DXTC_OUTPUT
endif
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
DEFINES += -DUSE_REFERENCE_VERSION
endif
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
endif
ifeq ($(strip $(DEBUG)),true)
DEFINES += -DDEBUG
endif
ifneq ($(VECTOR_WIDTH),)
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
endif
ifeq ($(strip $(__SIMD_KERNEL__)),true)
DEFINES += -D__SIMD_KERNEL__
endif
# SSE switch. Every sibling ISA switch tests a make variable named exactly like
# the define it emits (__ISA_AVX__, __ISA_AVX2__, ...); only this one tested
# __SSE__. Accept both spellings so that setting either __ISA_SSE__ or the
# legacy __SSE__ to "true" enables the define.
ifneq ($(filter true,$(strip $(__SSE__)) $(strip $(__ISA_SSE__))),)
    DEFINES += -D__ISA_SSE__
endif
ifeq ($(strip $(__ISA_AVX__)),true)
DEFINES += -D__ISA_AVX__
endif
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
DEFINES += -D__ISA_AVX_FMA__
endif
ifeq ($(strip $(__ISA_AVX2__)),true)
DEFINES += -D__ISA_AVX2__
endif
ifeq ($(strip $(__ISA_AVX512__)),true)
DEFINES += -D__ISA_AVX512__
endif
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
DEFINES += -DENABLE_OMP_SIMD
endif
ifeq ($(strip $(USE_SIMD_KERNEL)),true)
DEFINES += -DUSE_SIMD_KERNEL
endif
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OBJ = $(filter-out $(BUILD_DIR)/main%, $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(COMMON_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%-common.o,$(wildcard $(COMMON_DIR)/*.c))
ifeq ($(strip $(TAG)),NVCC)
OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
endif
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
# $(warning $(OBJ))
ifneq ($(VARIANT),)
.DEFAULT_GOAL := ${TARGET}-$(VARIANT)
DEFINES += -DVARIANT=$(VARIANT)
@ -45,6 +129,11 @@ $(BUILD_DIR)/%.o: %.c
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
# Compile a source file from $(COMMON_DIR) into <stem>-common.o and emit its
# dependency file.
# The dependency file must be named <stem>-common.d: the Makefile ends with
# "-include $(OBJ:.o=.d)", which looks for the .d file next to each object under
# the object's own base name. Writing to <stem>.d meant the file was never
# included and could overwrite the .d of a same-named source in $(SRC_DIR).
$(BUILD_DIR)/%-common.o: $(COMMON_DIR)/%.c
	$(info ===> COMPILE $@)
	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
	$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*-common.d
$(BUILD_DIR)/%-cuda.o: %.cu
$(info ===> COMPILE $@)
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
@ -63,16 +152,18 @@ $(BUILD_DIR)/%.o: %.s
clean:
$(info ===> CLEAN)
@rm -rf $(BUILD_DIR)
@rm -rf MDBench-$(IDENTIFIER)
@rm -f tags
cleanall:
$(info ===> CLEAN)
@rm -rf build
@rm -rf build-*
@rm -rf MDBench-*
@rm -f tags
distclean: clean
$(info ===> DIST CLEAN)
@rm -f $(TARGET)
@rm -f $(TARGET)*
@rm -f tags
info:
@ -86,6 +177,6 @@ tags:
$(Q)ctags -R
$(BUILD_DIR):
@mkdir -p $(BUILD_DIR)
@mkdir $(BUILD_DIR)
-include $(OBJ:.o=.d)

View File

@ -1,14 +1,34 @@
# MD-Bench
MD-Bench is a toolbox for the performance engineering of short-range force
calculation kernels on molecular-dynamics applications. It aims at covering all
available state-of-the-art algorithms from different community codes such as
LAMMPS and GROMACS.
![Image](figures/features-v3.png "MD-Bench Features")
MD-Bench is a toolbox for the performance engineering of short-range force calculation kernels on molecular-dynamics applications.
It aims at covering all available state-of-the-art algorithms from different community codes such as LAMMPS and GROMACS.
Apart from that, it offers many tools to study and evaluate the in-depth performance of such kernels on distinct hardware, such as gather-bench, a standalone benchmark that mimics the data movement of MD kernels, and the stubbed force calculation cases, which isolate the impact of memory latencies and control flow divergence on the overall performance.
<table>
<thead>
<tr>
<th>Verlet Lists</th>
<th>GROMACS MxN</th>
<th>Stubbed cases</th>
</tr>
</thead>
<tbody>
<tr>
<td><a target="_blank" rel="noopener noreferrer" href="figures/verlet_v2.png"><img src="figures/verlet_v2.png" alt="Image" title="Verlet Lists" style="width: 100%;"></a></td>
<td><a target="_blank" rel="noopener noreferrer" href="figures/gromacs_mxn_v2.png"><img src="figures/gromacs_mxn_v2.png" alt="Image" title="GROMACS MxN" style="width: 90%;"></a></td>
<td><a target="_blank" rel="noopener noreferrer" href="figures/stub_new_v3.png"><img src="figures/stub_new_v3.png" alt="Image" title="Stubbed cases" style="width: 100%;"></a></td>
</tr>
</tbody>
</table>
<!-- ![Image](figures/gather_bench.png "gather-bench") -->
## Build instructions
Configure the build by editing the `config.mk` file. The following
options are available:
Configure the build by editing the `config.mk` file. The following options are available:
- **TAG:** Compiler tag (available options: GCC, CLANG, ICC, ONEAPI, NVCC).
- **ISA:** Instruction set (available options: SSE, AVX, AVX\_FMA, AVX2, AVX512).
@ -25,18 +45,15 @@ options are available:
- **COMPUTE\_STATS:** Compute statistics.
Configurations for LAMMPS Verlet Lists optimization scheme:
- **ENABLE\_OMP\_SIMD:** Use omp simd pragma on half neighbor-lists kernels.
- **USE\_SIMD\_KERNEL:** Compile kernel with explicit SIMD intrinsics.
Configurations for GROMACS MxN optimization scheme:
- **USE\_REFERENCE\_VERSION:** Use reference version (only for correctness-checking purposes).
- **XTC\_OUTPUT:** Enable XTC output.
- **HALF\_NEIGHBOR\_LISTS\_CHECK\_CJ:** Check if j-clusters are local when decreasing the reaction force.
Configurations for CUDA:
- **USE\_CUDA\_HOST\_MEMORY:** Use CUDA host memory to optimize host-device transfers.
When done, just use `make` to compile the code.
@ -51,14 +68,11 @@ Use the following command to run a simulation:
./MD-Bench-<TAG>-<OPT_SCHEME> [OPTION]...
```
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same
name. Without any options, a Copper FCC lattice system with size 32x32x32
(131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0,
epsilon=1.0) is simulated.
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same name.
Without any options, a Copper FCC lattice system with size 32x32x32 (131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0, epsilon=1.0) is simulated.
The default behavior and other options can be changed using the following parameters:
```sh
```
-p <string>: file to read parameters from (can be specified more than once)
-f <string>: force field (lj or eam), default lj
-i <string>: input file with atom positions (dump)
@ -78,17 +92,11 @@ TBD
## Citations
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard
Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular
dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th
International Conference on Parallel Processing and Applied Mathematics, Gdansk,
Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint:
[arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th International Conference on Parallel Processing and Applied Mathematics, Gdansk, Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint: [arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
## Credits
MD-Bench is developed by the Erlangen National High Performance Computing Center
([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
MD-Bench is developed by the Erlangen National High Performance Computing Center ([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
## License

0
asm/.gitkeep Normal file
View File

View File

@ -0,0 +1,626 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I/mnt/opt/likwid-5.2-dev/include -I./src/includes -S -D_GNU_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DN";
# mark_description "EIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ";
# mark_description "ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int, int, int)
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
# parameter 5: %r8d
# parameter 6: %r9d
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#121.112
pushq %rbp #121.112
.cfi_def_cfa_offset 16
movq %rsp, %rbp #121.112
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #121.112
pushq %r12 #121.112
pushq %r13 #121.112
pushq %r14 #121.112
pushq %r15 #121.112
pushq %rbx #121.112
subq $88, %rsp #121.112
xorl %eax, %eax #124.16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
movq %rdx, %r15 #121.112
movq %rsi, %r12 #121.112
movq %rdi, %rbx #121.112
..___tag_value_computeForce.11:
# getTimeStamp()
call getTimeStamp #124.16
..___tag_value_computeForce.12:
# LOE rbx r12 r15 xmm0
..B1.51: # Preds ..B1.1
# Execution count [1.00e+00]
vmovsd %xmm0, 24(%rsp) #124.16[spill]
# LOE rbx r12 r15
..B1.2: # Preds ..B1.51
# Execution count [1.00e+00]
movl 4(%r12), %r13d #125.18
movq 64(%r12), %r9 #127.20
movq 72(%r12), %r14 #127.45
movq 80(%r12), %r8 #127.70
vmovsd 72(%rbx), %xmm2 #129.27
vmovsd 8(%rbx), %xmm1 #130.23
vmovsd (%rbx), %xmm0 #131.24
testl %r13d, %r13d #134.24
jle ..B1.43 # Prob 50% #134.24
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
xorl %ebx, %ebx #134.5
movl %r13d, %edx #134.5
xorl %ecx, %ecx #134.5
movl $1, %esi #134.5
xorl %eax, %eax #135.17
shrl $1, %edx #134.5
je ..B1.7 # Prob 9% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
movq %rax, (%rcx,%r9) #135.9
incq %rbx #134.5
movq %rax, (%rcx,%r14) #136.9
movq %rax, (%rcx,%r8) #137.9
movq %rax, 8(%rcx,%r9) #135.9
movq %rax, 8(%rcx,%r14) #136.9
movq %rax, 8(%rcx,%r8) #137.9
addq $16, %rcx #134.5
cmpq %rdx, %rbx #134.5
jb ..B1.5 # Prob 63% #134.5
# LOE rax rdx rcx rbx r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
lea 1(%rbx,%rbx), %esi #135.9
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
lea -1(%rsi), %edx #134.5
cmpl %r13d, %edx #134.5
jae ..B1.9 # Prob 9% #134.5
# LOE rax r8 r9 r12 r14 r15 esi r13d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
movslq %esi, %rsi #134.5
movq %rax, -8(%r9,%rsi,8) #135.9
movq %rax, -8(%r14,%rsi,8) #136.9
movq %rax, -8(%r8,%rsi,8) #137.9
# LOE r8 r9 r12 r14 r15 r13d xmm0 xmm1 xmm2
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [5.00e-01]
movl $.L_2__STRING.0, %edi #141.5
movq %r8, 32(%rsp) #141.5[spill]
movq %r9, 80(%rsp) #141.5[spill]
vmovsd %xmm2, (%rsp) #141.5[spill]
vmovsd %xmm1, 8(%rsp) #141.5[spill]
vmovsd %xmm0, 16(%rsp) #141.5[spill]
..___tag_value_computeForce.18:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.19:
# LOE r12 r14 r15 r13d
..B1.10: # Preds ..B1.9
# Execution count [9.00e-01]
vmovsd 16(%rsp), %xmm0 #[spill]
xorl %esi, %esi #143.15
vmovsd (%rsp), %xmm2 #[spill]
xorl %eax, %eax #143.5
vmulsd %xmm2, %xmm2, %xmm13 #129.45
xorl %edi, %edi #143.5
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #173.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #197.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #173.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #197.58
vmovsd 8(%rsp), %xmm1 #[spill]
vbroadcastsd %xmm13, %zmm14 #129.25
vbroadcastsd %xmm1, %zmm13 #130.21
vbroadcastsd %xmm0, %zmm9 #197.45
movslq %r13d, %r13 #143.5
movq 24(%r15), %r10 #145.25
movslq 16(%r15), %rdx #144.43
movq 8(%r15), %rcx #144.19
movq 32(%rsp), %r8 #[spill]
movq 16(%r12), %rbx #146.25
shlq $2, %rdx #126.5
movq %r13, 64(%rsp) #143.5[spill]
movq %r10, 72(%rsp) #143.5[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.41 ..B1.10
# Execution count [5.00e+00]
movq 72(%rsp), %r9 #145.25[spill]
vxorpd %xmm24, %xmm24, %xmm24 #149.22
vmovapd %xmm24, %xmm18 #150.22
movl (%r9,%rax,4), %r10d #145.25
vmovapd %xmm18, %xmm4 #151.22
vmovsd (%rdi,%rbx), %xmm10 #146.25
vmovsd 8(%rdi,%rbx), %xmm6 #147.25
vmovsd 16(%rdi,%rbx), %xmm12 #148.25
testl %r10d, %r10d #173.32
jle ..B1.41 # Prob 50% #173.32
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
vpxord %zmm8, %zmm8, %zmm8 #149.22
vmovaps %zmm8, %zmm7 #150.22
vmovaps %zmm7, %zmm11 #151.22
cmpl $8, %r10d #173.13
jl ..B1.48 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
cmpl $1200, %r10d #173.13
jl ..B1.47 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [4.50e+00]
movq %rdx, %r15 #144.43
imulq %rsi, %r15 #144.43
addq %rcx, %r15 #126.5
movq %r15, %r11 #173.13
andq $63, %r11 #173.13
testl $3, %r11d #173.13
je ..B1.16 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.14
# Execution count [2.25e+00]
xorl %r11d, %r11d #173.13
jmp ..B1.18 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.14
# Execution count [2.25e+00]
testl %r11d, %r11d #173.13
je ..B1.18 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.16
# Execution count [2.50e+01]
negl %r11d #173.13
addl $64, %r11d #173.13
shrl $2, %r11d #173.13
cmpl %r11d, %r10d #173.13
cmovl %r10d, %r11d #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.18: # Preds ..B1.15 ..B1.17 ..B1.16
# Execution count [5.00e+00]
movl %r10d, %r13d #173.13
subl %r11d, %r13d #173.13
andl $7, %r13d #173.13
negl %r13d #173.13
addl %r10d, %r13d #173.13
cmpl $1, %r11d #173.13
jb ..B1.26 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.18
# Execution count [4.50e+00]
vmovdqa %ymm15, %ymm4 #173.13
xorl %r12d, %r12d #173.13
vpbroadcastd %r11d, %ymm3 #173.13
vbroadcastsd %xmm10, %zmm2 #146.23
vbroadcastsd %xmm6, %zmm1 #147.23
vbroadcastsd %xmm12, %zmm0 #148.23
movslq %r11d, %r9 #173.13
movq %r8, 32(%rsp) #173.13[spill]
movq %r14, (%rsp) #173.13[spill]
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.20: # Preds ..B1.24 ..B1.19
# Execution count [2.50e+01]
vpcmpgtd %ymm4, %ymm3, %k3 #173.13
vmovdqu32 (%r15,%r12,4), %ymm17{%k3}{z} #174.25
kmovw %k3, %r14d #173.13
vpaddd %ymm17, %ymm17, %ymm18 #175.40
vpaddd %ymm18, %ymm17, %ymm17 #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.23: # Preds ..B1.20
# Execution count [1.25e+01]
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vpxord %zmm19, %zmm19, %zmm19 #175.40
vpxord %zmm20, %zmm20, %zmm20 #175.40
vgatherdpd 16(%rbx,%ymm17,8), %zmm18{%k1} #175.40
vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #175.40
vgatherdpd (%rbx,%ymm17,8), %zmm20{%k3} #175.40
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.24: # Preds ..B1.23
# Execution count [2.50e+01]
addq $8, %r12 #173.13
#vpaddd %ymm16, %ymm4, %ymm4 #173.13
#vsubpd %zmm18, %zmm0, %zmm29 #177.40
#vsubpd %zmm19, %zmm1, %zmm27 #176.40
#vsubpd %zmm20, %zmm2, %zmm26 #175.40
#vmulpd %zmm27, %zmm27, %zmm25 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #178.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #178.67
#vrcp14pd %zmm25, %zmm24 #195.42
#vcmppd $1, %zmm14, %zmm25, %k2 #194.26
#vfpclasspd $30, %zmm24, %k0 #195.42
#kmovw %k2, %r8d #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm25, %zmm17 #195.42
#andl %r8d, %r14d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #195.42
#kmovw %r14d, %k3 #198.21
#vmulpd %zmm17, %zmm17, %zmm18 #195.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #195.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #195.42
#vmulpd %zmm13, %zmm24, %zmm19 #196.42
#vmulpd %zmm9, %zmm24, %zmm21 #197.58
#vmulpd %zmm19, %zmm24, %zmm22 #196.48
#vmulpd %zmm22, %zmm24, %zmm20 #196.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #197.58
#vmulpd %zmm21, %zmm20, %zmm23 #197.65
#vmulpd %zmm24, %zmm23, %zmm28 #197.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #198.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #199.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #200.21
cmpq %r9, %r12 #173.13
jb ..B1.20 # Prob 82% #173.13
# LOE rax rdx rcx rbx rsi rdi r9 r12 r15 r10d r11d r13d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.25: # Preds ..B1.24
# Execution count [4.50e+00]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
cmpl %r11d, %r10d #173.13
je ..B1.40 # Prob 10% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25 ..B1.18 ..B1.47
# Execution count [2.50e+01]
lea 8(%r11), %r9d #173.13
cmpl %r9d, %r13d #173.13
jl ..B1.34 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.26
# Execution count [4.50e+00]
movq %rdx, %r12 #144.43
imulq %rsi, %r12 #144.43
vbroadcastsd %xmm10, %zmm1 #146.23
vbroadcastsd %xmm6, %zmm0 #147.23
vbroadcastsd %xmm12, %zmm2 #148.23
movslq %r11d, %r9 #173.13
addq %rcx, %r12 #126.5
movq %rdi, 8(%rsp) #126.5[spill]
movq %rdx, 16(%rsp) #126.5[spill]
movq %rcx, 40(%rsp) #126.5[spill]
movq %rax, 48(%rsp) #126.5[spill]
movq %rsi, 56(%rsp) #126.5[spill]
movq %r8, 32(%rsp) #126.5[spill]
movq %r14, (%rsp) #126.5[spill]
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.28: # Preds ..B1.32 ..B1.27
# Execution count [2.50e+01]
vmovdqu (%r12,%r9,4), %ymm3 #174.25
vpaddd %ymm3, %ymm3, %ymm4 #175.40
vpaddd %ymm4, %ymm3, %ymm3 #175.40
movl (%r12,%r9,4), %r14d #174.25
movl 4(%r12,%r9,4), %r8d #174.25
movl 8(%r12,%r9,4), %edi #174.25
movl 12(%r12,%r9,4), %esi #174.25
lea (%r14,%r14,2), %r14d #175.40
movl 16(%r12,%r9,4), %ecx #174.25
lea (%r8,%r8,2), %r8d #175.40
movl 20(%r12,%r9,4), %edx #174.25
lea (%rdi,%rdi,2), %edi #175.40
movl 24(%r12,%r9,4), %eax #174.25
lea (%rsi,%rsi,2), %esi #175.40
movl 28(%r12,%r9,4), %r15d #174.25
lea (%rcx,%rcx,2), %ecx #175.40
lea (%rdx,%rdx,2), %edx #175.40
lea (%rax,%rax,2), %eax #175.40
lea (%r15,%r15,2), %r15d #175.40
# LOE rbx r9 r12 eax edx ecx esi edi r8d r10d r11d r13d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.31: # Preds ..B1.28
# Execution count [1.25e+01]
vpcmpeqb %xmm0, %xmm0, %k1 #175.40
vpcmpeqb %xmm0, %xmm0, %k2 #175.40
vpcmpeqb %xmm0, %xmm0, %k3 #175.40
vpxord %zmm4, %zmm4, %zmm4 #175.40
vpxord %zmm17, %zmm17, %zmm17 #175.40
vpxord %zmm18, %zmm18, %zmm18 #175.40
vgatherdpd 16(%rbx,%ymm3,8), %zmm4{%k1} #175.40
vgatherdpd 8(%rbx,%ymm3,8), %zmm17{%k2} #175.40
vgatherdpd (%rbx,%ymm3,8), %zmm18{%k3} #175.40
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.32: # Preds ..B1.31
# Execution count [2.50e+01]
addl $8, %r11d #173.13
addq $8, %r9 #173.13
#vsubpd %zmm4, %zmm2, %zmm26 #177.40
#vsubpd %zmm17, %zmm0, %zmm24 #176.40
#vsubpd %zmm18, %zmm1, %zmm23 #175.40
#vmulpd %zmm24, %zmm24, %zmm3 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #178.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #178.67
#vrcp14pd %zmm3, %zmm22 #195.42
#vcmppd $1, %zmm14, %zmm3, %k2 #194.26
#vfpclasspd $30, %zmm22, %k0 #195.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #195.42
#knotw %k0, %k1 #195.42
#vmulpd %zmm3, %zmm3, %zmm4 #195.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #195.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #195.42
#vmulpd %zmm13, %zmm22, %zmm17 #196.42
#vmulpd %zmm9, %zmm22, %zmm19 #197.58
#vmulpd %zmm17, %zmm22, %zmm20 #196.48
#vmulpd %zmm20, %zmm22, %zmm18 #196.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #197.58
#vmulpd %zmm19, %zmm18, %zmm21 #197.65
#vmulpd %zmm22, %zmm21, %zmm25 #197.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #198.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #199.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #200.21
cmpl %r13d, %r11d #173.13
jb ..B1.28 # Prob 82% #173.13
# LOE rbx r9 r12 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32
# Execution count [4.50e+00]
movq 8(%rsp), %rdi #[spill]
movq 16(%rsp), %rdx #[spill]
movq 40(%rsp), %rcx #[spill]
movq 48(%rsp), %rax #[spill]
movq 56(%rsp), %rsi #[spill]
movq 32(%rsp), %r8 #[spill]
movq (%rsp), %r14 #[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33 ..B1.26 ..B1.48
# Execution count [5.00e+00]
lea 1(%r13), %r9d #173.13
cmpl %r10d, %r9d #173.13
ja ..B1.40 # Prob 50% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.35: # Preds ..B1.34
# Execution count [2.50e+01]
imulq %rdx, %rsi #144.43
vbroadcastsd %xmm10, %zmm4 #146.23
subl %r13d, %r10d #173.13
addq %rcx, %rsi #126.5
vpbroadcastd %r10d, %ymm0 #173.13
vpcmpgtd %ymm15, %ymm0, %k3 #173.13
movslq %r13d, %r13 #173.13
kmovw %k3, %r9d #173.13
vmovdqu32 (%rsi,%r13,4), %ymm1{%k3}{z} #174.25
vpaddd %ymm1, %ymm1, %ymm2 #175.40
vpaddd %ymm2, %ymm1, %ymm0 #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.38: # Preds ..B1.35
# Execution count [1.25e+01]
kmovw %k3, %k1 #175.40
kmovw %k3, %k2 #175.40
vpxord %zmm1, %zmm1, %zmm1 #175.40
vpxord %zmm2, %zmm2, %zmm2 #175.40
vpxord %zmm3, %zmm3, %zmm3 #175.40
vgatherdpd 16(%rbx,%ymm0,8), %zmm1{%k1} #175.40
vgatherdpd 8(%rbx,%ymm0,8), %zmm2{%k2} #175.40
vgatherdpd (%rbx,%ymm0,8), %zmm3{%k3} #175.40
# LOE rax rdx rcx rbx rdi r8 r14 r9d xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.39: # Preds ..B1.38
# Execution count [2.50e+01]
#vbroadcastsd %xmm6, %zmm6 #147.23
#vbroadcastsd %xmm12, %zmm12 #148.23
#vsubpd %zmm1, %zmm12, %zmm23 #177.40
#vsubpd %zmm2, %zmm6, %zmm21 #176.40
#vsubpd %zmm3, %zmm4, %zmm20 #175.40
#vmulpd %zmm21, %zmm21, %zmm19 #178.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #178.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #178.67
#vrcp14pd %zmm19, %zmm18 #195.42
#vcmppd $1, %zmm14, %zmm19, %k2 #194.26
#vfpclasspd $30, %zmm18, %k0 #195.42
#kmovw %k2, %esi #194.26
#knotw %k0, %k1 #195.42
#vmovaps %zmm19, %zmm0 #195.42
#andl %esi, %r9d #194.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #195.42
#kmovw %r9d, %k3 #198.21
#vmulpd %zmm0, %zmm0, %zmm1 #195.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #195.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #195.42
#vmulpd %zmm13, %zmm18, %zmm2 #196.42
#vmulpd %zmm9, %zmm18, %zmm4 #197.58
#vmulpd %zmm2, %zmm18, %zmm10 #196.48
#vmulpd %zmm10, %zmm18, %zmm3 #196.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #197.58
#vmulpd %zmm4, %zmm3, %zmm17 #197.65
#vmulpd %zmm18, %zmm17, %zmm22 #197.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #198.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #199.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #200.21
# LOE rax rdx rcx rbx rdi r8 r14 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.40: # Preds ..B1.25 ..B1.39 ..B1.34
# Execution count [4.50e+00]
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #151.22
vpermd %zmm11, %zmm19, %zmm0 #151.22
vpermd %zmm7, %zmm19, %zmm6 #150.22
vpermd %zmm8, %zmm19, %zmm20 #149.22
vaddpd %zmm11, %zmm0, %zmm11 #151.22
vaddpd %zmm7, %zmm6, %zmm7 #150.22
vaddpd %zmm8, %zmm20, %zmm8 #149.22
vpermpd $78, %zmm11, %zmm1 #151.22
vpermpd $78, %zmm7, %zmm10 #150.22
vpermpd $78, %zmm8, %zmm21 #149.22
vaddpd %zmm1, %zmm11, %zmm2 #151.22
vaddpd %zmm10, %zmm7, %zmm12 #150.22
vaddpd %zmm21, %zmm8, %zmm22 #149.22
vpermpd $177, %zmm2, %zmm3 #151.22
vpermpd $177, %zmm12, %zmm17 #150.22
vpermpd $177, %zmm22, %zmm23 #149.22
vaddpd %zmm3, %zmm2, %zmm4 #151.22
vaddpd %zmm17, %zmm12, %zmm18 #150.22
vaddpd %zmm23, %zmm22, %zmm24 #149.22
# LOE rax rdx rcx rbx rdi r8 r14 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40 ..B1.11
# Execution count [5.00e+00]
movq 80(%rsp), %rsi #208.9[spill]
addq $24, %rdi #143.5
vaddsd (%rsi,%rax,8), %xmm24, %xmm0 #208.9
vmovsd %xmm0, (%rsi,%rax,8) #208.9
movslq %eax, %rsi #143.32
vaddsd (%r14,%rax,8), %xmm18, %xmm1 #209.9
vmovsd %xmm1, (%r14,%rax,8) #209.9
incq %rsi #143.32
vaddsd (%r8,%rax,8), %xmm4, %xmm2 #210.9
vmovsd %xmm2, (%r8,%rax,8) #210.9
incq %rax #143.5
cmpq 64(%rsp), %rax #143.5[spill]
jb ..B1.11 # Prob 82% #143.5
jmp ..B1.44 # Prob 100% #143.5
# LOE rax rdx rcx rbx rsi rdi r8 r14 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.43: # Preds ..B1.2
# Execution count [5.00e-01]
movl $.L_2__STRING.0, %edi #141.5
..___tag_value_computeForce.48:
# likwid_markerStartRegion(const char *)
call likwid_markerStartRegion #141.5
..___tag_value_computeForce.49:
# LOE
..B1.44: # Preds ..B1.41 ..B1.43
# Execution count [1.00e+00]
movl $.L_2__STRING.0, %edi #219.5
vzeroupper #219.5
..___tag_value_computeForce.50:
# likwid_markerStopRegion(const char *)
call likwid_markerStopRegion #219.5
..___tag_value_computeForce.51:
# LOE
..B1.45: # Preds ..B1.44
# Execution count [1.00e+00]
xorl %eax, %eax #221.16
..___tag_value_computeForce.52:
# getTimeStamp()
call getTimeStamp #221.16
..___tag_value_computeForce.53:
# LOE xmm0
..B1.46: # Preds ..B1.45
# Execution count [1.00e+00]
vsubsd 24(%rsp), %xmm0, %xmm0 #224.14[spill]
addq $88, %rsp #224.14
.cfi_restore 3
popq %rbx #224.14
.cfi_restore 15
popq %r15 #224.14
.cfi_restore 14
popq %r14 #224.14
.cfi_restore 13
popq %r13 #224.14
.cfi_restore 12
popq %r12 #224.14
movq %rbp, %rsp #224.14
popq %rbp #224.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #224.14
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22
# LOE
..B1.47: # Preds ..B1.13
# Execution count [4.50e-01]: Infreq
movl %r10d, %r13d #173.13
xorl %r11d, %r11d #173.13
andl $-8, %r13d #173.13
jmp ..B1.26 # Prob 100% #173.13
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r11d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.48: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
xorl %r13d, %r13d #173.13
jmp ..B1.34 # Prob 100% #173.13
.align 16,0x90
# LOE rax rdx rcx rbx rsi rdi r8 r14 r10d r13d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constants emitted by the compiler. 64-bit values are written as
# pairs of 32-bit words, low word first.
.section .rodata, "a"
.align 64
.align 64
# 8 x IEEE-754 double 1.0 (0x3ff0000000000000).
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x IEEE-754 double 0.5 (0x3fe0000000000000).
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# Byte table: 1,1,1,2,2,2,4,4,4,8,8,8,16,16,16,32,32,32,64,64,64,128,128,128
# followed by zero padding up to 64 bytes.
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
# Read as eight 64-bit integers: 0,4,8,12,1,5,9,13.
# NOTE(review): looks like a 64-bit-element permute index table — confirm at its use site.
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
# Read as eight 64-bit integers: 1,5,9,13,0,4,8,12.
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
# Read as eight 64-bit integers: 2,6,10,14,2,6,10,14.
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
# Sixteen 32-bit indices 8..15, 8..15. Loaded at ..B1.40 as the vpermd index
# vector, which copies the upper 256 bits of the source into both halves.
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
# Eight 32-bit integers, all equal to 8.
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Eight 32-bit integers 0..7 (lane indices).
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# IEEE-754 double 48.0 (0x4048000000000000).
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# IEEE-754 double 1.0. In the visible part of this function it is referenced
# only by a commented-out vfnmadd213pd with {1to8} broadcast.
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.section .rodata.str1.4, "aMS",@progbits,1
.align 4
.align 4
# The 6 bytes 'f','o','r','c','e',0 (1668444006 = 0x63726f66, 101 = 'e'),
# i.e. the C string "force". Its address is passed in %edi to
# likwid_markerStartRegion and likwid_markerStopRegion.
.L_2__STRING.0:
.long 1668444006
.word 101
.type .L_2__STRING.0,@object
.size .L_2__STRING.0,6
.data
.section .note.GNU-stack, ""
# End

585
asm/unused/force-mem-only.s Normal file
View File

@ -0,0 +1,585 @@
# mark_description "Intel(R) C Intel(R) 64 Compiler for applications running on Intel(R) 64, Version 19.0.5.281 Build 20190815";
# mark_description "-I./src/includes -S -D_GNU_SOURCE -DAOS -DPRECISION=2 -DNEIGHBORS_LOOP_RUNS=1 -DVECTOR_WIDTH=8 -DALIGNMENT=6";
# mark_description "4 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o ICC/force.s";
.file "force.c"
.text
..TXTST0:
.L_2__routine_start_computeForce_0:
# -- Begin computeForce
.text
# mark_begin;
.align 16,0x90
.globl computeForce
# --- computeForce(Parameter *, Atom *, Neighbor *, int)
# computeForce -- "mem-only" variant (file asm/unused/force-mem-only.s).
# In this file the floating-point force arithmetic and the final stores of the
# accumulated forces are commented out; the neighbor-index loads and the
# position gathers are kept.
# Returns in %xmm0 the difference between two getTimeStamp() results, one
# taken at entry and one taken after the atom loop.
computeForce:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %ecx
..B1.1: # Preds ..B1.0
# Execution count [1.00e+00]
.cfi_startproc
..___tag_value_computeForce.1:
..L2:
#103.87
# Frame setup: save %rbp, align %rsp down to a 64-byte boundary, save
# %r12-%r14 and reserve 104 bytes of spill space.
pushq %rbp #103.87
.cfi_def_cfa_offset 16
movq %rsp, %rbp #103.87
.cfi_def_cfa 6, 16
.cfi_offset 6, -16
andq $-64, %rsp #103.87
pushq %r12 #103.87
pushq %r13 #103.87
pushq %r14 #103.87
subq $104, %rsp #103.87
# %eax = 0 before the call (presumably the SysV %al vector-register count for
# an unprototyped callee -- confirm).
xorl %eax, %eax #106.16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
# Keep the three pointer arguments in callee-saved registers across the call:
# %r14 = parameter 3, %r13 = parameter 2, %r12 = parameter 1.
movq %rdx, %r14 #103.87
movq %rsi, %r13 #103.87
movq %rdi, %r12 #103.87
..___tag_value_computeForce.9:
# getTimeStamp()
call getTimeStamp #106.16
..___tag_value_computeForce.10:
# LOE rbx r12 r13 r14 r15 xmm0
..B1.48: # Preds ..B1.1
# Execution count [1.00e+00]
# Spill the start timestamp; it is subtracted from the end timestamp in ..B1.43.
vmovsd %xmm0, 16(%rsp) #106.16[spill]
# LOE rbx r12 r13 r14 r15
# ..B1.2: load the scalars and array pointers used by the rest of the function.
# %r13 holds parameter 2 and %r12 holds parameter 1 (see ..B1.1).
# NOTE(review): field names are not visible in this file. The offsets off
# parameter 2 match the annotated asm/unused/force.s (4 = Nlocal, 64/72/80 =
# fx/fy/fz); the three doubles off parameter 1 are presumably cutforce, sigma6
# and epsilon -- confirm against the struct definitions.
..B1.2: # Preds ..B1.48
# Execution count [1.00e+00]
# %ecx (parameter 4) is overwritten here without having been read.
movl 4(%r13), %ecx #107.18
movq 64(%r13), %r11 #109.20
movq 72(%r13), %r10 #109.45
movq 80(%r13), %r9 #109.70
vmovsd 72(%r12), %xmm2 #111.27
vmovsd 8(%r12), %xmm1 #112.23
vmovsd (%r12), %xmm0 #113.24
# Nothing to do when the element count is <= 0: go straight to the end timestamp.
testl %ecx, %ecx #116.24
jle ..B1.42 # Prob 50% #116.24
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
# ..B1.3 - ..B1.8: store 64-bit zeros to the first %ecx elements of the three
# arrays at %r11, %r10 and %r9. The loop is unrolled by two; a possible odd
# last element is handled in ..B1.8.
..B1.3: # Preds ..B1.2
# Execution count [1.00e+00]
# %rdi = pair counter, %rsi = byte offset, %rdx = number of pairs (count >> 1),
# %rax = 0 (the value stored), %r8d = 1 (index+1 of the leftover element when
# the pair loop is skipped).
xorl %edi, %edi #116.5
movl %ecx, %edx #116.5
xorl %esi, %esi #116.5
movl $1, %r8d #116.5
xorl %eax, %eax #117.17
shrl $1, %edx #116.5
je ..B1.7 # Prob 9% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.5: # Preds ..B1.3 ..B1.5
# Execution count [2.50e+00]
# Zero two consecutive elements (offsets 0 and 8) of each array per pass.
movq %rax, (%rsi,%r11) #117.9
incq %rdi #116.5
movq %rax, (%rsi,%r10) #118.9
movq %rax, (%rsi,%r9) #119.9
movq %rax, 8(%rsi,%r11) #117.9
movq %rax, 8(%rsi,%r10) #118.9
movq %rax, 8(%rsi,%r9) #119.9
addq $16, %rsi #116.5
cmpq %rdx, %rdi #116.5
jb ..B1.5 # Prob 63% #116.5
# LOE rax rdx rbx rsi rdi r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
..B1.6: # Preds ..B1.5
# Execution count [9.00e-01]
# %r8d = 2 * pairs + 1, i.e. one past the index of the next unzeroed element.
lea 1(%rdi,%rdi), %r8d #117.9
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.7: # Preds ..B1.3 ..B1.6
# Execution count [1.00e+00]
# Skip the leftover store when index %r8d-1 is already >= the element count.
lea -1(%r8), %edx #116.5
cmpl %ecx, %edx #116.5
jae ..B1.9 # Prob 9% #116.5
# LOE rax rbx r9 r10 r11 r13 r14 r15 ecx r8d xmm0 xmm1 xmm2
..B1.8: # Preds ..B1.7
# Execution count [9.00e-01]
# Zero the single leftover element at index %r8-1 of each array.
movslq %r8d, %r8 #116.5
movq %rax, -8(%r11,%r8,8) #117.9
movq %rax, -8(%r10,%r8,8) #118.9
movq %rax, -8(%r9,%r8,8) #119.9
# LOE rbx r9 r10 r11 r13 r14 r15 ecx xmm0 xmm1 xmm2
# ..B1.9: loop-invariant setup for the atom loop (..B1.10 - ..B1.40).
# Vector constants built here:
#   %ymm16 = eight dwords of 8, %ymm15 = dwords 0..7,
#   %zmm5 = 8 x 0.5, %zmm14 = broadcast(%xmm2 * %xmm2),
#   %zmm13 = broadcast(%xmm1), %zmm9 = broadcast(48.0 * %xmm0).
# In this file %zmm5, %zmm9, %zmm13 and %zmm14 are read only by commented-out
# instructions.
..B1.9: # Preds ..B1.7 ..B1.8
# Execution count [9.00e-01]
vmulsd %xmm2, %xmm2, %xmm13 #111.45
# %rdi = atom index i used to address the neighbor-list row.
xorl %edi, %edi #124.15
vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #153.13
vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm0 #177.45
vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #153.13
vmovups .L_2il0floatpacket.4(%rip), %zmm5 #177.58
vbroadcastsd %xmm13, %zmm14 #111.25
vbroadcastsd %xmm1, %zmm13 #112.21
vbroadcastsd %xmm0, %zmm9 #177.45
# %rdx = base of the position array gathered from below (presumably atom->x,
# 3 doubles per atom -- confirm).
movq 16(%r13), %rdx #127.25
# %r8 = atom loop counter, %r12 = element count as 64-bit, %rax = byte offset
# of the current atom's position (advanced by 24 per atom in ..B1.40).
xorl %r8d, %r8d #124.5
movslq %ecx, %r12 #124.5
xorl %eax, %eax #124.5
# Loads off parameter 3 (%r14): %r13 = per-atom neighbor-count array,
# %rcx = row length of the neighbor table in entries, %rsi = neighbor table
# base. (Presumably numneigh / maxneighs / neighbors -- confirm.)
movq 24(%r14), %r13 #126.25
movslq 16(%r14), %rcx #125.43
movq 8(%r14), %rsi #125.19
# Row length in bytes (4-byte entries).
shlq $2, %rcx #108.5
# Spills: loop bound, neighbor-count array, first force array, and the
# callee-saved %r15 / %rbx which are used as scratch inside the loop.
movq %r12, 80(%rsp) #124.5[spill]
movq %r13, 88(%rsp) #124.5[spill]
movq %r11, 96(%rsp) #124.5[spill]
movq %r15, 8(%rsp) #124.5[spill]
movq %rbx, (%rsp) #124.5[spill]
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# ..B1.10: head of the per-atom loop. Loads the neighbor count of atom %r8 and
# the atom's three position doubles, and clears the scalar results.
..B1.10: # Preds ..B1.40 ..B1.9
# Execution count [5.00e+00]
movq 88(%rsp), %rbx #126.25[spill]
# %xmm24, %xmm18 and %xmm4 start at 0; they are the scalar results used when
# the atom has no neighbors (the path that jumps directly to ..B1.40).
vxorpd %xmm24, %xmm24, %xmm24 #130.22
vmovapd %xmm24, %xmm18 #131.22
# %r11d = neighbor count of this atom.
movl (%rbx,%r8,4), %r11d #126.25
vmovapd %xmm18, %xmm4 #132.22
# Three consecutive doubles at byte offset %rax of the position array.
vmovsd (%rax,%rdx), %xmm10 #127.25
vmovsd 8(%rax,%rdx), %xmm6 #128.25
vmovsd 16(%rax,%rdx), %xmm12 #129.25
testl %r11d, %r11d #153.32
jle ..B1.40 # Prob 50% #153.32
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm4 xmm6 xmm10 xmm12 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.11: # Preds ..B1.10
# Execution count [4.50e+00]
# Clear the three 8-lane accumulators %zmm8, %zmm7, %zmm11.
vpxord %zmm8, %zmm8, %zmm8 #130.22
vmovaps %zmm8, %zmm7 #131.22
vmovaps %zmm7, %zmm11 #132.22
# Fewer than 8 neighbors: only the masked remainder (via ..B1.45) runs.
cmpl $8, %r11d #153.13
jl ..B1.45 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.12: # Preds ..B1.11
# Execution count [4.50e+00]
# Fewer than 1200 neighbors: skip the alignment peel (via ..B1.44).
cmpl $1200, %r11d #153.13
jl ..B1.44 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ..B1.13 - ..B1.17: compute the peel count %r12d (number of leading neighbor
# entries to process before the row pointer reaches a 64-byte boundary) and
# the end index %r14d of the unmasked 8-wide loop.
..B1.13: # Preds ..B1.12
# Execution count [4.50e+00]
# %r15 = address of this atom's neighbor row = base + i * row_bytes.
movq %rcx, %r15 #125.43
imulq %rdi, %r15 #125.43
addq %rsi, %r15 #108.5
# %r12 = row address modulo 64.
movq %r15, %r12 #153.13
andq $63, %r12 #153.13
# If the address is not a multiple of 4, alignment cannot be reached by
# skipping whole 4-byte entries: fall into ..B1.14 and use a peel count of 0.
testl $3, %r12d #153.13
je ..B1.15 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.14: # Preds ..B1.13
# Execution count [2.25e+00]
xorl %r12d, %r12d #153.13
jmp ..B1.17 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.15: # Preds ..B1.13
# Execution count [2.25e+00]
# Already 64-byte aligned: peel count stays 0.
testl %r12d, %r12d #153.13
je ..B1.17 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.16: # Preds ..B1.15
# Execution count [2.50e+01]
# %r12d = min((64 - misalignment) / 4, neighbor count).
negl %r12d #153.13
addl $64, %r12d #153.13
shrl $2, %r12d #153.13
cmpl %r12d, %r11d #153.13
cmovl %r11d, %r12d #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.17: # Preds ..B1.14 ..B1.16 ..B1.15
# Execution count [5.00e+00]
# %r14d = count - ((count - peel) & 7): the index at which the unmasked
# 8-wide loop stops and the masked remainder begins.
movl %r11d, %r14d #153.13
subl %r12d, %r14d #153.13
andl $7, %r14d #153.13
negl %r14d #153.13
addl %r11d, %r14d #153.13
# Peel count of 0: go straight to the main loop check.
cmpl $1, %r12d #153.13
jb ..B1.25 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ..B1.18 - ..B1.24: masked peel loop over the first %r12d neighbor entries,
# 8 entries per pass. In this file only the index load and the three position
# gathers are live; the arithmetic on the gathered values is commented out.
..B1.18: # Preds ..B1.17
# Execution count [4.50e+00]
# %ymm4 = lane indices 0..7, %ymm3 = broadcast peel count, %r13 = entry offset.
vmovdqa %ymm15, %ymm4 #153.13
xorl %r13d, %r13d #153.13
vpbroadcastd %r12d, %ymm3 #153.13
# Broadcasts of the current atom's three position doubles; in this file they
# are read only by commented-out instructions.
vbroadcastsd %xmm10, %zmm2 #127.23
vbroadcastsd %xmm6, %zmm1 #128.23
vbroadcastsd %xmm12, %zmm0 #129.23
movslq %r12d, %rbx #153.13
# %r9 / %r10 are spilled because %r10d (and, in the commented-out code, %r9d)
# is used as scratch inside the loop.
movq %r9, 24(%rsp) #153.13[spill]
movq %r10, 32(%rsp) #153.13[spill]
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.19: # Preds ..B1.23 ..B1.18
# Execution count [2.50e+01]
# %k3 = lanes whose index in %ymm4 is below the peel count.
vpcmpgtd %ymm4, %ymm3, %k3 #153.13
# Zero-masked load of up to 8 neighbor indices from the row at %r15.
vmovdqu32 (%r15,%r13,4), %ymm17{%k3}{z} #154.25
# Copy of the mask in %r10d; its only reader is a commented-out andl below.
kmovw %k3, %r10d #153.13
# %ymm17 = 3 * neighbor index (three doubles per atom in the gathered array).
vpaddd %ymm17, %ymm17, %ymm18 #155.40
vpaddd %ymm18, %ymm17, %ymm17 #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 ymm17 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.22: # Preds ..B1.19
# Execution count [1.25e+01]
# Each gather consumes (clears) its mask register, so %k3 is copied to %k1 and
# %k2 and every gather gets its own mask. Destinations are zeroed first so
# masked-off lanes read as 0.
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
vpxord %zmm19, %zmm19, %zmm19 #155.40
vpxord %zmm20, %zmm20, %zmm20 #155.40
# Gather the doubles at byte offsets 16, 8 and 0 of each selected neighbor.
vgatherdpd 16(%rdx,%ymm17,8), %zmm18{%k1} #155.40
vgatherdpd 8(%rdx,%ymm17,8), %zmm19{%k2} #155.40
vgatherdpd (%rdx,%ymm17,8), %zmm20{%k3} #155.40
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r10d r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm18 zmm19 zmm20
..B1.23: # Preds ..B1.22
# Execution count [2.50e+01]
addq $8, %r13 #153.13
# NOTE(review): the next line, which would advance the lane-index vector
# %ymm4 by 8 (%ymm16 holds eight 8s), is commented out together with the
# arithmetic. %k3 is therefore recomputed from an unchanged %ymm4 on every
# pass, so the mask does not shrink on a second pass of this loop. Confirm
# this is intended for the mem-only variant.
#vpaddd %ymm16, %ymm4, %ymm4 #153.13
#vsubpd %zmm18, %zmm0, %zmm29 #157.40
#vsubpd %zmm19, %zmm1, %zmm27 #156.40
#vsubpd %zmm20, %zmm2, %zmm26 #155.40
#vmulpd %zmm27, %zmm27, %zmm25 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm25 #158.53
#vfmadd231pd %zmm29, %zmm29, %zmm25 #158.67
#vrcp14pd %zmm25, %zmm24 #175.42
#vcmppd $1, %zmm14, %zmm25, %k2 #174.26
#vfpclasspd $30, %zmm24, %k0 #175.42
#kmovw %k2, %r9d #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm25, %zmm17 #175.42
#andl %r9d, %r10d #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm24, %zmm17 #175.42
#kmovw %r10d, %k3 #178.21
#vmulpd %zmm17, %zmm17, %zmm18 #175.42
#vfmadd213pd %zmm24, %zmm17, %zmm24{%k1} #175.42
#vfmadd213pd %zmm24, %zmm18, %zmm24{%k1} #175.42
#vmulpd %zmm13, %zmm24, %zmm19 #176.42
#vmulpd %zmm9, %zmm24, %zmm21 #177.58
#vmulpd %zmm19, %zmm24, %zmm22 #176.48
#vmulpd %zmm22, %zmm24, %zmm20 #176.54
#vfmsub213pd %zmm5, %zmm22, %zmm24 #177.58
#vmulpd %zmm21, %zmm20, %zmm23 #177.65
#vmulpd %zmm24, %zmm23, %zmm28 #177.71
#vfmadd231pd %zmm26, %zmm28, %zmm8{%k3} #178.21
#vfmadd231pd %zmm27, %zmm28, %zmm7{%k3} #179.21
#vfmadd231pd %zmm29, %zmm28, %zmm11{%k3} #180.21
# Loop while the entry offset is below the peel count.
cmpq %rbx, %r13 #153.13
jb ..B1.19 # Prob 82% #153.13
# LOE rax rdx rcx rbx rsi rdi r8 r13 r15 r11d r12d r14d xmm6 xmm10 xmm12 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.24: # Preds ..B1.23
# Execution count [4.50e+00]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# If the peel covered every neighbor, go directly to the reduction.
cmpl %r12d, %r11d #153.13
je ..B1.39 # Prob 10% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ..B1.25 - ..B1.32: unmasked main loop over neighbor entries [%r12d, %r14d),
# 8 entries per pass. In this file only the index loads and the three position
# gathers are live; the arithmetic on the gathered values is commented out.
..B1.25: # Preds ..B1.24 ..B1.17 ..B1.44
# Execution count [2.50e+01]
# Skip the main loop when not even one full group of 8 fits before %r14d.
lea 8(%r12), %ebx #153.13
cmpl %ebx, %r14d #153.13
jl ..B1.33 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.26: # Preds ..B1.25
# Execution count [4.50e+00]
# %r13 = address of this atom's neighbor row = base + i * row_bytes.
movq %rcx, %r13 #125.43
imulq %rdi, %r13 #125.43
# Broadcasts of the current atom's three position doubles; in this file they
# are read only by commented-out instructions.
vbroadcastsd %xmm10, %zmm1 #127.23
vbroadcastsd %xmm6, %zmm0 #128.23
vbroadcastsd %xmm12, %zmm2 #129.23
movslq %r12d, %rbx #153.13
addq %rsi, %r13 #108.5
# Spill the general-purpose registers that ..B1.27 overwrites with scalar
# copies of the neighbor indices; they are reloaded in ..B1.32.
movq %rax, 40(%rsp) #108.5[spill]
movq %rcx, 48(%rsp) #108.5[spill]
movq %rsi, 56(%rsp) #108.5[spill]
movq %r8, 64(%rsp) #108.5[spill]
movq %rdi, 72(%rsp) #108.5[spill]
movq %r9, 24(%rsp) #108.5[spill]
movq %r10, 32(%rsp) #108.5[spill]
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.27: # Preds ..B1.31 ..B1.26
# Execution count [2.50e+01]
# Load 8 neighbor indices and triple them (three doubles per atom).
vmovdqu (%r13,%rbx,4), %ymm3 #154.25
vpaddd %ymm3, %ymm3, %ymm4 #155.40
vpaddd %ymm4, %ymm3, %ymm3 #155.40
# The same 8 indices are also loaded one by one into general-purpose registers
# and tripled with lea. No later instruction in this function reads these
# scalar results before the registers are reloaded -- they appear to be dead here.
movl (%r13,%rbx,4), %r10d #154.25
movl 4(%r13,%rbx,4), %r9d #154.25
movl 8(%r13,%rbx,4), %r8d #154.25
movl 12(%r13,%rbx,4), %edi #154.25
lea (%r10,%r10,2), %r10d #155.40
movl 16(%r13,%rbx,4), %esi #154.25
lea (%r9,%r9,2), %r9d #155.40
movl 20(%r13,%rbx,4), %ecx #154.25
lea (%r8,%r8,2), %r8d #155.40
movl 24(%r13,%rbx,4), %eax #154.25
lea (%rdi,%rdi,2), %edi #155.40
movl 28(%r13,%rbx,4), %r15d #154.25
lea (%rsi,%rsi,2), %esi #155.40
lea (%rcx,%rcx,2), %ecx #155.40
lea (%rax,%rax,2), %eax #155.40
lea (%r15,%r15,2), %r15d #155.40
# LOE rdx rbx r13 eax ecx esi edi r8d r9d r10d r11d r12d r14d r15d xmm6 xmm10 xmm12 ymm3 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.30: # Preds ..B1.27
# Execution count [1.25e+01]
# Comparing a register with itself sets every mask bit: all lanes active.
# Each gather consumes its mask, hence three separate masks.
vpcmpeqb %xmm0, %xmm0, %k1 #155.40
vpcmpeqb %xmm0, %xmm0, %k2 #155.40
vpcmpeqb %xmm0, %xmm0, %k3 #155.40
vpxord %zmm4, %zmm4, %zmm4 #155.40
vpxord %zmm17, %zmm17, %zmm17 #155.40
vpxord %zmm18, %zmm18, %zmm18 #155.40
# Gather the doubles at byte offsets 16, 8 and 0 of each neighbor.
vgatherdpd 16(%rdx,%ymm3,8), %zmm4{%k1} #155.40
vgatherdpd 8(%rdx,%ymm3,8), %zmm17{%k2} #155.40
vgatherdpd (%rdx,%ymm3,8), %zmm18{%k3} #155.40
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 zmm17 zmm18
..B1.31: # Preds ..B1.30
# Execution count [2.50e+01]
# Advance the neighbor index (32-bit copy for the loop test, 64-bit copy for
# addressing) by 8.
addl $8, %r12d #153.13
addq $8, %rbx #153.13
#vsubpd %zmm4, %zmm2, %zmm26 #157.40
#vsubpd %zmm17, %zmm0, %zmm24 #156.40
#vsubpd %zmm18, %zmm1, %zmm23 #155.40
#vmulpd %zmm24, %zmm24, %zmm3 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm3 #158.53
#vfmadd231pd %zmm26, %zmm26, %zmm3 #158.67
#vrcp14pd %zmm3, %zmm22 #175.42
#vcmppd $1, %zmm14, %zmm3, %k2 #174.26
#vfpclasspd $30, %zmm22, %k0 #175.42
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm22, %zmm3 #175.42
#knotw %k0, %k1 #175.42
#vmulpd %zmm3, %zmm3, %zmm4 #175.42
#vfmadd213pd %zmm22, %zmm3, %zmm22{%k1} #175.42
#vfmadd213pd %zmm22, %zmm4, %zmm22{%k1} #175.42
#vmulpd %zmm13, %zmm22, %zmm17 #176.42
#vmulpd %zmm9, %zmm22, %zmm19 #177.58
#vmulpd %zmm17, %zmm22, %zmm20 #176.48
#vmulpd %zmm20, %zmm22, %zmm18 #176.54
#vfmsub213pd %zmm5, %zmm20, %zmm22 #177.58
#vmulpd %zmm19, %zmm18, %zmm21 #177.65
#vmulpd %zmm22, %zmm21, %zmm25 #177.71
#vfmadd231pd %zmm23, %zmm25, %zmm8{%k2} #178.21
#vfmadd231pd %zmm24, %zmm25, %zmm7{%k2} #179.21
#vfmadd231pd %zmm26, %zmm25, %zmm11{%k2} #180.21
cmpl %r14d, %r12d #153.13
jb ..B1.27 # Prob 82% #153.13
# LOE rdx rbx r13 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.32: # Preds ..B1.31
# Execution count [4.50e+00]
# Reload the registers spilled in ..B1.26.
movq 40(%rsp), %rax #[spill]
movq 48(%rsp), %rcx #[spill]
movq 56(%rsp), %rsi #[spill]
movq 64(%rsp), %r8 #[spill]
movq 72(%rsp), %rdi #[spill]
movq 24(%rsp), %r9 #[spill]
movq 32(%rsp), %r10 #[spill]
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ..B1.33 - ..B1.38: masked remainder for neighbor entries [%r14d, %r11d),
# fewer than 8 of them, handled in a single pass. In this file only the index
# load and the three position gathers are live.
..B1.33: # Preds ..B1.32 ..B1.25 ..B1.45
# Execution count [5.00e+00]
# No remainder when %r14d + 1 > neighbor count (unsigned compare).
lea 1(%r14), %ebx #153.13
cmpl %r11d, %ebx #153.13
ja ..B1.39 # Prob 50% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.34: # Preds ..B1.33
# Execution count [2.50e+01]
# %rdi = address of this atom's neighbor row (i * row_bytes + base). This
# overwrites the atom index in %rdi; ..B1.40 recomputes it from %r8.
imulq %rcx, %rdi #125.43
# Broadcast of the first position double; read only by commented-out code here.
vbroadcastsd %xmm10, %zmm4 #127.23
# %r11d = number of remaining entries; %k3 = lanes with index < remaining.
subl %r14d, %r11d #153.13
addq %rsi, %rdi #108.5
vpbroadcastd %r11d, %ymm0 #153.13
vpcmpgtd %ymm15, %ymm0, %k3 #153.13
movslq %r14d, %r14 #153.13
# Zero-masked load of the remaining neighbor indices.
vmovdqu32 (%rdi,%r14,4), %ymm1{%k3}{z} #154.25
# Copy of the mask in %edi; its only reader is a commented-out andl below.
kmovw %k3, %edi #153.13
# %ymm0 = 3 * neighbor index.
vpaddd %ymm1, %ymm1, %ymm2 #155.40
vpaddd %ymm2, %ymm1, %ymm0 #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm0 ymm15 ymm16 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14 k3
..B1.37: # Preds ..B1.34
# Execution count [1.25e+01]
# One mask copy per gather (each gather consumes its mask); destinations are
# zeroed first so masked-off lanes read as 0.
kmovw %k3, %k1 #155.40
kmovw %k3, %k2 #155.40
vpxord %zmm1, %zmm1, %zmm1 #155.40
vpxord %zmm2, %zmm2, %zmm2 #155.40
vpxord %zmm3, %zmm3, %zmm3 #155.40
# Gather the doubles at byte offsets 16, 8 and 0 of each selected neighbor.
vgatherdpd 16(%rdx,%ymm0,8), %zmm1{%k1} #155.40
vgatherdpd 8(%rdx,%ymm0,8), %zmm2{%k2} #155.40
vgatherdpd (%rdx,%ymm0,8), %zmm3{%k3} #155.40
# LOE rax rdx rcx rsi r8 r9 r10 edi xmm6 xmm12 ymm15 ymm16 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.38: # Preds ..B1.37
# Execution count [2.50e+01]
# Every instruction of this basic block is commented out in this file.
#vbroadcastsd %xmm6, %zmm6 #128.23
#vbroadcastsd %xmm12, %zmm12 #129.23
#vsubpd %zmm1, %zmm12, %zmm23 #157.40
#vsubpd %zmm2, %zmm6, %zmm21 #156.40
#vsubpd %zmm3, %zmm4, %zmm20 #155.40
#vmulpd %zmm21, %zmm21, %zmm19 #158.53
#vfmadd231pd %zmm20, %zmm20, %zmm19 #158.53
#vfmadd231pd %zmm23, %zmm23, %zmm19 #158.67
#vrcp14pd %zmm19, %zmm18 #175.42
#vcmppd $1, %zmm14, %zmm19, %k2 #174.26
#vfpclasspd $30, %zmm18, %k0 #175.42
#kmovw %k2, %ebx #174.26
#knotw %k0, %k1 #175.42
#vmovaps %zmm19, %zmm0 #175.42
#andl %ebx, %edi #174.26
#vfnmadd213pd .L_2il0floatpacket.9(%rip){1to8}, %zmm18, %zmm0 #175.42
#kmovw %edi, %k3 #178.21
#vmulpd %zmm0, %zmm0, %zmm1 #175.42
#vfmadd213pd %zmm18, %zmm0, %zmm18{%k1} #175.42
#vfmadd213pd %zmm18, %zmm1, %zmm18{%k1} #175.42
#vmulpd %zmm13, %zmm18, %zmm2 #176.42
#vmulpd %zmm9, %zmm18, %zmm4 #177.58
#vmulpd %zmm2, %zmm18, %zmm10 #176.48
#vmulpd %zmm10, %zmm18, %zmm3 #176.54
#vfmsub213pd %zmm5, %zmm10, %zmm18 #177.58
#vmulpd %zmm4, %zmm3, %zmm17 #177.65
#vmulpd %zmm18, %zmm17, %zmm22 #177.71
#vfmadd231pd %zmm20, %zmm22, %zmm8{%k3} #178.21
#vfmadd231pd %zmm21, %zmm22, %zmm7{%k3} #179.21
#vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #180.21
# LOE rax rdx rcx rsi r8 r9 r10 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
# ..B1.39: horizontal sum of the three 8-lane accumulators %zmm11, %zmm7 and
# %zmm8. After the three permute+add steps, element 0 of %zmm4, %zmm18 and
# %zmm24 holds the sum of all 8 lanes of %zmm11, %zmm7 and %zmm8 respectively.
# In this file the instructions that accumulate into %zmm8/%zmm7/%zmm11 are
# commented out, so the accumulators still hold the zeros set in ..B1.11.
..B1.39: # Preds ..B1.24 ..B1.38 ..B1.33
# Execution count [4.50e+00]
# Step 1: index vector 8..15,8..15 copies the upper 256 bits into both
# halves; adding folds 8 lanes into 4.
vmovups .L_2il0floatpacket.10(%rip), %zmm19 #132.22
vpermd %zmm11, %zmm19, %zmm0 #132.22
vpermd %zmm7, %zmm19, %zmm6 #131.22
vpermd %zmm8, %zmm19, %zmm20 #130.22
vaddpd %zmm11, %zmm0, %zmm11 #132.22
vaddpd %zmm7, %zmm6, %zmm7 #131.22
vaddpd %zmm8, %zmm20, %zmm8 #130.22
# Step 2: immediate 78 (0x4e) selects elements 2,3,0,1 within each 256-bit
# half; adding folds 4 lanes into 2.
vpermpd $78, %zmm11, %zmm1 #132.22
vpermpd $78, %zmm7, %zmm10 #131.22
vpermpd $78, %zmm8, %zmm21 #130.22
vaddpd %zmm1, %zmm11, %zmm2 #132.22
vaddpd %zmm10, %zmm7, %zmm12 #131.22
vaddpd %zmm21, %zmm8, %zmm22 #130.22
# Step 3: immediate 177 (0xb1) selects elements 1,0,3,2 (swap neighbours);
# adding folds 2 lanes into 1.
vpermpd $177, %zmm2, %zmm3 #132.22
vpermpd $177, %zmm12, %zmm17 #131.22
vpermpd $177, %zmm22, %zmm23 #130.22
vaddpd %zmm3, %zmm2, %zmm4 #132.22
vaddpd %zmm17, %zmm12, %zmm18 #131.22
vaddpd %zmm23, %zmm22, %zmm24 #130.22
# LOE rax rdx rcx rsi r8 r9 r10 xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
# ..B1.40: tail of the per-atom loop. Advances to the next atom and loops back
# to ..B1.10. The read-modify-write of the three force arrays with the scalar
# results (%xmm24, %xmm18, %xmm4) is commented out in this file, so nothing is
# written to those arrays inside the atom loop.
..B1.40: # Preds ..B1.39 ..B1.10
# Execution count [5.00e+00]
# %rbx = first force array (spilled in ..B1.9); only read by the
# commented-out stores below.
movq 96(%rsp), %rbx #188.9[spill]
# Next atom: position byte offset += 24 (three doubles).
addq $24, %rax #124.5
# %rdi = (int)%r8 + 1 = index of the next atom, used to address its neighbor row.
movslq %r8d, %rdi #124.32
incq %rdi #124.32
#vaddsd (%rbx,%r8,8), %xmm24, %xmm0 #188.9
#vmovsd %xmm0, (%rbx,%r8,8) #188.9
#vaddsd (%r10,%r8,8), %xmm18, %xmm1 #189.9
#vmovsd %xmm1, (%r10,%r8,8) #189.9
#vaddsd (%r9,%r8,8), %xmm4, %xmm2 #190.9
#vmovsd %xmm2, (%r9,%r8,8) #190.9
# Loop while the atom counter is below the bound spilled at 80(%rsp).
incq %r8 #124.5
cmpq 80(%rsp), %r8 #124.5[spill]
jb ..B1.10 # Prob 82% #124.5
# LOE rax rdx rcx rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm9 zmm13 zmm14
..B1.41: # Preds ..B1.40
# Execution count [9.00e-01]
# Restore the callee-saved registers that were spilled in ..B1.9.
movq 8(%rsp), %r15 #[spill]
.cfi_restore 15
movq (%rsp), %rbx #[spill]
.cfi_restore 3
# LOE rbx r15
# ..B1.42 - ..B1.43: take the end timestamp, compute the elapsed value and
# return it in %xmm0.
..B1.42: # Preds ..B1.2 ..B1.41
# Execution count [1.00e+00]
xorl %eax, %eax #201.16
# Clear the upper vector-register halves before calling out of this function.
vzeroupper #201.16
..___tag_value_computeForce.43:
# getTimeStamp()
call getTimeStamp #201.16
..___tag_value_computeForce.44:
# LOE rbx r15 xmm0
..B1.43: # Preds ..B1.42
# Execution count [1.00e+00]
# Return value: end timestamp minus the start timestamp spilled in ..B1.48.
vsubsd 16(%rsp), %xmm0, %xmm0 #204.14[spill]
# Epilogue: release the spill area, restore %r14/%r13/%r12, then undo the
# 64-byte stack alignment by reloading %rsp from %rbp.
addq $104, %rsp #204.14
.cfi_restore 14
popq %r14 #204.14
.cfi_restore 13
popq %r13 #204.14
.cfi_restore 12
popq %r12 #204.14
movq %rbp, %rsp #204.14
popq %rbp #204.14
.cfi_def_cfa 7, 8
.cfi_restore 6
ret #204.14
# Unwind information re-established for the cold blocks that follow the ret.
.cfi_def_cfa 6, 16
.cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x80, 0xff, 0xff, 0xff, 0x22
.cfi_offset 6, -16
.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22
.cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22
# LOE
# ..B1.44 / ..B1.45: infrequently executed entry stubs placed after the ret.
..B1.44: # Preds ..B1.12
# Execution count [4.50e-01]: Infreq
# Neighbor count in [8, 1200): no alignment peel. Main loop starts at index 0
# (%r12d) and ends at the count rounded down to a multiple of 8 (%r14d).
movl %r11d, %r14d #153.13
xorl %r12d, %r12d #153.13
andl $-8, %r14d #153.13
jmp ..B1.25 # Prob 100% #153.13
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r12d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
..B1.45: # Preds ..B1.11
# Execution count [4.50e-01]: Infreq
# Neighbor count below 8: the masked remainder starts at index 0.
xorl %r14d, %r14d #153.13
jmp ..B1.33 # Prob 100% #153.13
.align 16,0x90
# LOE rax rdx rcx rsi rdi r8 r9 r10 r11d r14d xmm6 xmm10 xmm12 ymm15 ymm16 zmm5 zmm7 zmm8 zmm9 zmm11 zmm13 zmm14
.cfi_endproc
# mark_end;
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
# Read-only constants emitted by the compiler. 64-bit values are written as
# pairs of 32-bit words, low word first.
.section .rodata, "a"
.align 64
.align 64
# 8 x IEEE-754 double 1.0 (0x3ff0000000000000). Not referenced by computeForce
# in this file.
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x IEEE-754 double 0.5 (0x3fe0000000000000). Loaded into %zmm5 in ..B1.9.
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# Byte table: 1,1,1,2,2,2,4,4,4,8,8,8,16,16,16,32,32,32,64,64,64,128,128,128
# followed by zero padding up to 64 bytes. Not referenced by computeForce in
# this file.
.L_2il0floatpacket.5:
.long 0x02010101,0x04040202,0x08080804,0x20101010,0x40402020,0x80808040,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,64
.align 64
# Read as eight 64-bit integers: 0,4,8,12,1,5,9,13. Not referenced by
# computeForce in this file.
.L_2il0floatpacket.6:
.long 0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000,0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 64
# Read as eight 64-bit integers: 1,5,9,13,0,4,8,12. Not referenced by
# computeForce in this file.
.L_2il0floatpacket.7:
.long 0x00000001,0x00000000,0x00000005,0x00000000,0x00000009,0x00000000,0x0000000d,0x00000000,0x00000000,0x00000000,0x00000004,0x00000000,0x00000008,0x00000000,0x0000000c,0x00000000
.type .L_2il0floatpacket.7,@object
.size .L_2il0floatpacket.7,64
.align 64
# Read as eight 64-bit integers: 2,6,10,14,2,6,10,14. Not referenced by
# computeForce in this file.
.L_2il0floatpacket.8:
.long 0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000,0x00000002,0x00000000,0x00000006,0x00000000,0x0000000a,0x00000000,0x0000000e,0x00000000
.type .L_2il0floatpacket.8,@object
.size .L_2il0floatpacket.8,64
.align 64
# Sixteen 32-bit indices 8..15, 8..15. Loaded in ..B1.39 as the vpermd index
# vector, which copies the upper 256 bits of the source into both halves.
.L_2il0floatpacket.10:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.10,@object
.size .L_2il0floatpacket.10,64
.align 32
# Eight 32-bit integers, all equal to 8. Loaded into %ymm16 in ..B1.9.
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# Eight 32-bit integers 0..7 (lane indices). Loaded into %ymm15 in ..B1.9.
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# IEEE-754 double 48.0 (0x4048000000000000). Multiplied into %xmm0 in ..B1.9.
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# IEEE-754 double 1.0. In this file it is referenced only by commented-out
# vfnmadd213pd instructions with {1to8} broadcast.
.L_2il0floatpacket.9:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.9,@object
.size .L_2il0floatpacket.9,8
.data
.section .note.GNU-stack, ""
# End

324
asm/unused/force.s Normal file
View File

@ -0,0 +1,324 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForce
computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..atom_loop_exit
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..init_force_loop
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+r10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
# Set last element to zero
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..atom_loop_exit
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
cmp r14d, 8
jl ..compute_forces_remainder
..compute_forces:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d
vpcmpgtd k1, ymm4, ymm17
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
### AOS
add rax, 24
###
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..atom_loop_begin
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
pop rbp #93.12
ret #93.12
.type computeForce,@function
.size computeForce,.-computeForce
..LNcomputeForce.0:
.data
# -- End computeForce
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

326
asm/unused/force_lj.s Normal file
View File

@ -0,0 +1,326 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForceLJ
computeForceLJ:
# Lennard-Jones force kernel over a Verlet neighbor list, hand-annotated
# AVX-512 code (8 neighbors per vector iteration, AoS position layout).
#
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
# returns:     xmm0 = 0.0 (the getTimeStamp-based timing is commented out)
#
# Register roles inside the atom loop:
#   r10  atom index i                 rax  byte offset of atom i in atom->x (i * 24)
#   rdx  atom->x                      rdi  atom->fz
#   r11  neighbor->neighbors          r12  neighbor->maxneighs * 4 (row stride in bytes)
#   rcx  row of neighbor indices of i r9   neighbor index k within the row
#   r13d numneighs                    r14d neighbors still to be processed
#   zmm13/zmm12/zmm11  per-lane partial sums of fix/fiy/fiz
#   xmm25/xmm20/xmm4   reduced scalar fix/fiy/fiz
#   zmm16 cutforcesq, zmm15 sigma6, zmm14 48*epsilon, zmm7 0.5 (all broadcast)
    push rbp
    push r12
    push r13
    push r14
    push r15
    push rbx
    mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
    vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
    vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
    vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
    mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
    mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
    mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
    test r9d, r9d # atom->Nlocal <= 0
    # Nothing to do without local atoms: leave through the epilogue.
    # ..atom_loop_exit must NOT be the target here, because it stores forces
    # through spill slots and an index (r10) that are not initialized yet.
    jle ..function_exit
    xor r10d, r10d # r10d <- 0 (pair counter)
    mov ecx, r9d # ecx <- atom->Nlocal
    xor r8d, r8d # r8d <- 0 (byte offset into fx/fy/fz)
    mov r11d, 1 # r11d <- 1 (index of last element to zero, plus 1)
    xor eax, eax # eax <- 0 (bit pattern of 0.0)
    shr ecx, 1 # ecx <- atom->Nlocal >> 1 (number of pairs)
    je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
    mov QWORD PTR [r8+r13], rax # fx[2*p] <- 0
    mov QWORD PTR [r8+r14], rax # fy[2*p] <- 0
    mov QWORD PTR [r8+rdi], rax # fz[2*p] <- 0
    mov QWORD PTR [8+r8+r13], rax # fx[2*p+1] <- 0
    mov QWORD PTR [8+r8+r14], rax # fy[2*p+1] <- 0
    mov QWORD PTR [8+r8+rdi], rax # fz[2*p+1] <- 0
    add r8, 16 # byte offset += 2 * sizeof(double)
    inc r10 # p++
    cmp r10, rcx # p < Nlocal / 2
    jb ..init_force_loop
# Trick to make r11d contain the index of the last element to be zeroed plus 1
# Maybe we can directly put r10+r10 here and zero r11d above, then remove the -1 below
    lea r11d, DWORD PTR [1+r10+r10] # r11d <- p * 2 + 1
..zero_last_element:
    lea ecx, DWORD PTR [-1+r11] # ecx <- p * 2 (number of elements zeroed so far)
    cmp ecx, r9d # p * 2 >= Nlocal (even Nlocal: nothing left)
    jae ..before_atom_loop
# Set last element to zero (odd Nlocal)
    movsxd r11, r11d # r11 <- p * 2 + 1
    mov QWORD PTR [-8+r13+r11*8], rax # fx[p * 2] <- 0
    mov QWORD PTR [-8+r14+r11*8], rax # fy[p * 2] <- 0
    mov QWORD PTR [-8+rdi+r11*8], rax # fz[p * 2] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
    vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
    vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...] (not read again in this function)
    vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
    vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
    vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
    vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
    vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
    vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
    movsxd r9, r9d # r9 <- atom->Nlocal
    xor r10d, r10d # r10d <- 0 (i)
    mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
    mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
    movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
    mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
    xor eax, eax # rax <- 0 (byte offset of atom i in atom->x)
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
    shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling: slots live below rsp without adjusting it, which relies on
# the System V red zone (this function performs no calls).
    mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
    mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
    mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
    mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
# r15/rbx are already preserved by the pushes above; these two slots are never
# reloaded in this function.
    mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
    mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
#sub rsp, 64
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
#add rsp, 64
..atom_loop_begin:
    mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
    vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
    vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
    mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
    vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
    vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
    vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
    vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
    vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
    vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
    vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
    test r13d, r13d # numneighs <= 0
    jle ..atom_loop_exit # no neighbors: scalar fix/fiy/fiz are still 0, so the stores add 0
    vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
    vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
    vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
    mov rcx, r12 # rcx <- neighbor->maxneighs * 4
    imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
    add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
    xor r9d, r9d # r9d <- 0 (k)
    mov r14d, r13d # r14d <- numneighs
    cmp r14d, 8
    jl ..compute_forces_remainder
# Full vector iterations: 8 neighbors at a time
..compute_forces:
    # Comparing a register with itself sets every mask bit. The gathers below
    # consume (clear) their mask, so k1..k3 are rebuilt on every iteration.
    vpcmpeqb k1, xmm0, xmm0
    vpcmpeqb k2, xmm0, xmm0
    vpcmpeqb k3, xmm0, xmm0
    vmovdqu ymm3, YMMWORD PTR [rcx+r9*4] # ymm3 <- neighbors[k..k+7] (j)
    vpxord zmm5, zmm5, zmm5
    vpxord zmm6, zmm6, zmm6
### AOS
    vpaddd ymm4, ymm3, ymm3 # ymm4 <- j * 2
    vpaddd ymm3, ymm3, ymm4 # ymm3 <- j * 3
    vpxord zmm4, zmm4, zmm4
    vgatherdpd zmm4{k1}, [rdx+ymm3*8] # zmm4 <- atom->x[j * 3]
    vgatherdpd zmm5{k2}, [8+rdx+ymm3*8] # zmm5 <- atom->x[j * 3 + 1]
    vgatherdpd zmm6{k3}, [16+rdx+ymm3*8] # zmm6 <- atom->x[j * 3 + 2]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
    vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
    vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
    vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
    vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
    vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
    vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
    vrcp14pd zmm27, zmm20 # zmm27 <- approx. 1.0 / rsq (sr2)
    vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
    vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
    vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
    vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
    vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 -- sr6
    vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5 -- sr6 - 0.5
    vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr6
    vmulpd zmm30, zmm26, zmm27 # zmm30 <- force = 48.0 * epsilon * sr2 * sr6 * (sr6 - 0.5)
    vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
    vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
    vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
    sub r14d, 8 # remaining -= 8
    add r9, 8 # k += 8
    cmp r14d, 8
    jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
    test r14d, r14d
    jle ..sum_up_forces
    vpbroadcastd ymm4, r14d # ymm4 <- [remaining, ...]
    vpcmpgtd k1, ymm4, ymm17 # k1 <- lane < remaining
    kmovw r15d, k1 # r15d <- lane < remaining (kept for the final mask)
    vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4] # ymm3 <- remaining neighbor indices, 0 elsewhere
    kmovw k2, k1
    kmovw k3, k1
    vpxord zmm5, zmm5, zmm5
    vpxord zmm6, zmm6, zmm6
### AOS
    vpaddd ymm4, ymm3, ymm3 # ymm4 <- j * 2
    vpaddd ymm3, ymm3, ymm4 # ymm3 <- j * 3
    vpxord zmm4, zmm4, zmm4
    vgatherdpd zmm4{k1}, [rdx+ymm3*8] # zmm4 <- atom->x[j * 3]
    vgatherdpd zmm5{k2}, [8+rdx+ymm3*8] # zmm5 <- atom->x[j * 3 + 1]
    vgatherdpd zmm6{k3}, [16+rdx+ymm3*8] # zmm6 <- atom->x[j * 3 + 2]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
    vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
    vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
    vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
    vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
    vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
    vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
    vrcp14pd zmm27, zmm20 # zmm27 <- approx. 1.0 / rsq (sr2)
    vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
    kmovw r9d, k5 # r9d <- rsq < cutforcesq (k is no longer needed for this atom)
    and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
    kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
    vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
    vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
    vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
    vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2 -- sr6
    vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5 -- sr6 - 0.5
    vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr6
    vmulpd zmm30, zmm26, zmm27 # zmm30 <- force = 48.0 * epsilon * sr2 * sr6 * (sr6 - 0.5)
    vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
    vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
    vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permute
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
    vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip] # zmm10 <- dword indices [8..15, 8..15]
    vpermd zmm0, zmm10, zmm11 # upper 256 bits of fiz copied to both halves
    vpermd zmm5, zmm10, zmm12 # same for fiy
    vpermd zmm21, zmm10, zmm13 # same for fix
    vaddpd zmm11, zmm0, zmm11 # lanes 0..3 <- lane[n] + lane[n + 4]
    vaddpd zmm12, zmm5, zmm12
    vaddpd zmm13, zmm21, zmm13
    vpermpd zmm1, zmm11, 78 # swap the two 128-bit pairs within each 256-bit half
    vpermpd zmm6, zmm12, 78
    vpermpd zmm22, zmm13, 78
    vaddpd zmm2, zmm11, zmm1
    vaddpd zmm8, zmm12, zmm6
    vaddpd zmm23, zmm13, zmm22
    vpermpd zmm3, zmm2, 177 # swap neighboring lanes
    vpermpd zmm9, zmm8, 177
    vpermpd zmm24, zmm23, 177
    vaddpd zmm4, zmm2, zmm3 # xmm4 <- fiz (lane 0 holds the full sum)
    vaddpd zmm20, zmm8, zmm9 # xmm20 <- fiy
    vaddpd zmm25, zmm23, zmm24 # xmm25 <- fix
# Accumulate the reduced forces of atom i and advance to the next atom
..atom_loop_exit:
    mov rcx, QWORD PTR [-8+rsp] # rcx <- atom->fx [spill]
    mov rbx, QWORD PTR [-16+rsp] # rbx <- atom->fy [spill]
### AOS
    add rax, 24 # byte offset of next atom in atom->x
###
    vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] # xmm0 <- fx[i] + fix
    vmovsd QWORD PTR [rcx+r10*8], xmm0 # fx[i] <- xmm0
    vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] # xmm1 <- fy[i] + fiy
    vmovsd QWORD PTR [rbx+r10*8], xmm1 # fy[i] <- xmm1
    vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] # xmm2 <- fz[i] + fiz
    vmovsd QWORD PTR [rdi+r10*8], xmm2 # fz[i] <- xmm2
    inc r10 # i++
    cmp r10, QWORD PTR [-32+rsp] # i < atom->Nlocal [spill]
    jb ..atom_loop_begin
# Epilogue; also the direct target when atom->Nlocal <= 0
..function_exit:
    vzeroupper
    vxorpd xmm0, xmm0, xmm0 # return value <- 0.0
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
    pop rbx
    pop r15
    pop r14
    pop r13
    pop r12
    pop rbp
    ret
.type computeForceLJ,@function
.size computeForceLJ,.-computeForceLJ
..LNcomputeForce.0:
.data
# -- End computeForceLJ
# Read-only constants used by the kernel. Doubles are stored as two 32-bit
# words each, low word first.
.section .rodata, "a"
.align 64
.align 64
# 8 x double 1.0 (0x3ff0000000000000); not referenced by the code in this file
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
# 8 x double 0.5 (0x3fe0000000000000); loaded into zmm7
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
# 16 x dword permutation indices [8..15, 8..15]; vpermd control used in the
# force reduction to bring the upper 256 bits of a zmm register into both halves
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
# 8 x dword 8; loaded into ymm18
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
# 8 x dword lane numbers [0..7]; loaded into ymm17 and compared against the
# number of remaining neighbors to build the remainder mask
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
# double 48.0 (0x4048000000000000); multiplied with epsilon
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
# double 1.0 (0x3ff0000000000000); not referenced by the code in this file
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -21,7 +21,6 @@ typedef struct {
char* input_file;
char* vtk_file;
char* xtc_file;
char* write_atom_file;
MD_FLOAT epsilon;
MD_FLOAT sigma;
MD_FLOAT sigma6;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -48,13 +48,11 @@ static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_S
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0xC);
//t0 = _mm256_blend_pd(t0, t1, 0b1100);
t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0x5));
//t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
@ -93,7 +91,7 @@ static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1,
}
// Functions used in LAMMPS kernel
#define simd_gather(vidx, m, s) _mm256_i32gather_pd(m, vidx, s);
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -7,7 +7,8 @@
#ifndef __TIMING_H_
#define __TIMING_H_
extern double getTimeStamp(void);
extern double getTimeResolution(void);
extern double getTimeStamp();
extern double getTimeResolution();
extern double getTimeStamp_();
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -7,41 +7,40 @@
#ifndef __UTIL_H_
#define __UTIL_H_
#include <stdio.h>
#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
# define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
# define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
#define ABS(a) ((a) >= 0 ? (a) : -(a))
# define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define DEBUG_MESSAGE debug_printf
#ifndef MAXLINE
#define MAXLINE 4096
# define MAXLINE 4096
#endif
#define FF_LJ 0
#define FF_EAM 1
#define FF_DEM 2
#define FF_LJ 0
#define FF_EAM 1
#define FF_DEM 2
#if PRECISION == 1
#define PRECISION_STRING "single"
# define PRECISION_STRING "single"
#else
#define PRECISION_STRING "double"
# define PRECISION_STRING "double"
#endif
extern double myrandom(int *);
extern double myrandom(int*);
extern void random_reset(int *seed, int ibase, double *coord);
extern int str2ff(const char *string);
extern const char *ff2str(int ff);
extern const char* ff2str(int ff);
extern int get_num_threads();
extern void readline(char *line, FILE *fp);
extern void debug_printf(const char *format, ...);
extern int get_cuda_num_threads(void);
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -17,7 +17,6 @@ void initParameter(Parameter *param) {
param->vtk_file = NULL;
param->xtc_file = NULL;
param->eam_file = NULL;
param->write_atom_file = NULL;
param->force_field = FF_LJ;
param->epsilon = 1.0;
param->sigma = 1.0;
@ -170,11 +169,6 @@ void printParameter(Parameter *param) {
printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
#ifdef SORT_ATOMS
printf("\tSort atoms when reneighboring: yes\n");
#else
printf("\tSort atoms when reneighboring: no\n");
#endif
printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,21 +1,27 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <time.h>
double getTimeStamp(void)
double getTimeStamp()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
}
double getTimeResolution(void)
double getTimeResolution()
{
struct timespec ts;
clock_getres(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
}
double getTimeStamp_()
{
return getTimeStamp();
}

View File

@ -1,39 +1,38 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <util.h>
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
#define IA 16807
#define IM 2147483647
#define AM (1.0 / IM)
#define IQ 127773
#define IR 2836
#define IA 16807
#define IM 2147483647
#define AM (1.0/IM)
#define IQ 127773
#define IR 2836
#define MASK 123459876
double myrandom(int* seed)
{
int k = (*seed) / IQ;
double myrandom(int* seed) {
int k= (*seed) / IQ;
double ans;
*seed = IA * (*seed - k * IQ) - IR * k;
if (*seed < 0) *seed += IM;
if(*seed < 0) *seed += IM;
ans = AM * (*seed);
return ans;
}
void random_reset(int* seed, int ibase, double* coord)
{
void random_reset(int *seed, int ibase, double *coord) {
int i;
char* str = (char*)&ibase;
int n = sizeof(int);
char *str = (char *) &ibase;
int n = sizeof(int);
unsigned int hash = 0;
for (i = 0; i < n; i++) {
@ -42,8 +41,8 @@ void random_reset(int* seed, int ibase, double* coord)
hash ^= (hash >> 6);
}
str = (char*)coord;
n = 3 * sizeof(double);
str = (char *) coord;
n = 3 * sizeof(double);
for (i = 0; i < n; i++) {
hash += str[i];
hash += (hash << 10);
@ -62,59 +61,45 @@ void random_reset(int* seed, int ibase, double* coord)
// warm up the RNG
for (i = 0; i < 5; i++)
myrandom(seed);
// save = 0;
for (i = 0; i < 5; i++) myrandom(seed);
//save = 0;
}
int str2ff(const char* string)
{
if (strncmp(string, "lj", 2) == 0) return FF_LJ;
if (strncmp(string, "eam", 3) == 0) return FF_EAM;
if (strncmp(string, "dem", 3) == 0) return FF_DEM;
int str2ff(const char *string) {
if(strncmp(string, "lj", 2) == 0) return FF_LJ;
if(strncmp(string, "eam", 3) == 0) return FF_EAM;
if(strncmp(string, "dem", 3) == 0) return FF_DEM;
return -1;
}
const char* ff2str(int ff)
{
if (ff == FF_LJ) {
return "lj";
}
if (ff == FF_EAM) {
return "eam";
}
if (ff == FF_DEM) {
return "dem";
}
const char* ff2str(int ff) {
if(ff == FF_LJ) { return "lj"; }
if(ff == FF_EAM) { return "eam"; }
if(ff == FF_DEM) { return "dem"; }
return "invalid";
}
int get_cuda_num_threads(void)
{
const char* num_threads_env = getenv("NUM_THREADS");
int get_num_threads() {
const char *num_threads_env = getenv("NUM_THREADS");
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
}
void readline(char* line, FILE* fp)
{
if (fgets(line, MAXLINE, fp) == NULL) {
if (errno != 0) {
void readline(char *line, FILE *fp) {
if(fgets(line, MAXLINE, fp) == NULL) {
if(errno != 0) {
perror("readline()");
exit(-1);
}
}
}
void debug_printf(const char* format, ...)
{
#ifdef DEBUG
void debug_printf(const char *format, ...) {
#ifdef DEBUG
va_list arg;
int ret;
va_start(arg, format);
if ((vfprintf(stdout, format, arg)) < 0) {
perror("debug_printf()");
}
if((vfprintf(stdout, format, arg)) < 0) { perror("debug_printf()"); }
va_end(arg);
#endif
#endif
}

113
config.mk
View File

@ -1,23 +1,20 @@
# Compiler tool chain (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TOOLCHAIN ?= CLANG
# Instruction set for intrinsic kernels (NONE/SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= ARM
SIMD ?= NONE
# Optimization scheme (verletlist/clusterpair/clusters_per_bin)
OPT_SCHEME ?= verletlist
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TAG ?= ICC
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= false
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
ASM_SYNTAX ?= INTEL
ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false
# Sort atoms when reneighboring (true or false)
SORT_ATOMS ?= true
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
@ -29,7 +26,7 @@ COMPUTE_STATS ?= true
# Configurations for lammps optimization scheme
# Use omp simd pragma when running with half neighbor-lists
ENABLE_OMP_SIMD ?= false
ENABLE_OMP_SIMD ?= true
# Use kernel with explicit SIMD intrinsics
USE_SIMD_KERNEL ?= false
@ -39,7 +36,7 @@ USE_REFERENCE_VERSION ?= false
# Enable XTC output
XTC_OUTPUT ?= false
# Check if cj is local when decreasing reaction force
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= true
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
# Configurations for CUDA
# Use CUDA host memory to optimize transfers
@ -48,93 +45,3 @@ USE_CUDA_HOST_MEMORY ?= false
#Feature options
OPTIONS = -DALIGNMENT=64
#OPTIONS += More options
#DO NOT EDIT BELOW
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
DEFINES += -DPRECISION=2
endif
ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifeq ($(strip $(SORT_ATOMS)),true)
DEFINES += -DSORT_ATOMS
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
ifeq ($(strip $(MEM_TRACER)),true)
DEFINES += -DMEM_TRACER
endif
ifeq ($(strip $(INDEX_TRACER)),true)
DEFINES += -DINDEX_TRACER
endif
ifeq ($(strip $(COMPUTE_STATS)),true)
DEFINES += -DCOMPUTE_STATS
endif
ifeq ($(strip $(XTC_OUTPUT)),true)
DEFINES += -DXTC_OUTPUT
endif
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
DEFINES += -DUSE_REFERENCE_VERSION
endif
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
endif
ifeq ($(strip $(DEBUG)),true)
DEFINES += -DDEBUG
endif
ifneq ($(VECTOR_WIDTH),)
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
endif
ifeq ($(strip $(__SIMD_KERNEL__)),true)
DEFINES += -D__SIMD_KERNEL__
endif
ifeq ($(strip $(__SSE__)),true)
DEFINES += -D__ISA_SSE__
endif
ifeq ($(strip $(__ISA_AVX__)),true)
DEFINES += -D__ISA_AVX__
endif
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
DEFINES += -D__ISA_AVX_FMA__
endif
ifeq ($(strip $(__ISA_AVX2__)),true)
DEFINES += -D__ISA_AVX2__
endif
ifeq ($(strip $(__ISA_AVX512__)),true)
DEFINES += -D__ISA_AVX512__
endif
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
DEFINES += -DENABLE_OMP_SIMD
endif
ifeq ($(strip $(OPT_SCHEME)),verletlist)
OPT_TAG = VL
endif
ifneq ($(strip $(SIMD)),NONE)
TOOLCHAIN = $(TOOLCHAIN)-$(ISA)-$(SIMD)
endif

View File

@ -6,7 +6,7 @@ dt 0.001
temp 80
x_out_freq 500
v_out_freq 5
cutforce 1.8
skin 0.1
cutforce 0.9
skin 0.05
reneigh_every 100
nstat 125000

BIN
figures/features-v3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 273 KiB

BIN
figures/gather_bench.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 98 KiB

523
figures/gather_bench.svg Normal file
View File

@ -0,0 +1,523 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="297mm"
height="210mm"
viewBox="0 0 297 210"
version="1.1"
id="svg5"
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
sodipodi:docname="gather_bench.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview7"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="0.73508842"
inkscape:cx="551.63432"
inkscape:cy="348.25743"
inkscape:window-width="1920"
inkscape:window-height="1011"
inkscape:window-x="0"
inkscape:window-y="165"
inkscape:window-maximized="1"
inkscape:current-layer="layer1" />
<defs
id="defs2">
<rect
x="144.01516"
y="304.36604"
width="248.99777"
height="100.91557"
id="rect79475" />
<rect
x="309.01869"
y="43.698615"
width="552.19421"
height="71.390348"
id="rect65238" />
<rect
x="762.55856"
y="341.3838"
width="277.62756"
height="105.0235"
id="rect47632" />
<linearGradient
inkscape:collect="always"
id="linearGradient40704">
<stop
style="stop-color:#ccffaa;stop-opacity:1;"
offset="0"
id="stop40700" />
<stop
style="stop-color:#ccffaa;stop-opacity:0;"
offset="1"
id="stop40702" />
</linearGradient>
<marker
style="overflow:visible;"
id="Arrow2Mend"
refX="0.0"
refY="0.0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(0.6) rotate(180) translate(0,0)"
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
style="stroke:context-stroke;fill-rule:evenodd;fill:context-stroke;stroke-width:0.62500000;stroke-linejoin:round;"
id="path39486" />
</marker>
<marker
style="overflow:visible;"
id="Arrow1Mend"
refX="0.0"
refY="0.0"
orient="auto"
inkscape:stockid="Arrow1Mend"
inkscape:isstock="true">
<path
transform="scale(0.4) rotate(180) translate(10,0)"
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
id="path39468" />
</marker>
<marker
style="overflow:visible;"
id="Arrow1Lend"
refX="0.0"
refY="0.0"
orient="auto"
inkscape:stockid="Arrow1Lend"
inkscape:isstock="true">
<path
transform="scale(0.8) rotate(180) translate(12.5,0)"
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
id="path39462" />
</marker>
<rect
x="707.09731"
y="616.36746"
width="407.71288"
height="417.08306"
id="rect24254" />
<rect
x="47.404365"
y="100.3268"
width="398.49855"
height="110.16514"
id="rect5050" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5-6" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5-6-1" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0-6" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0-6-2" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0-6-2-8" />
<marker
style="overflow:visible"
id="Arrow2Mend-2"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path39486-3" />
</marker>
<marker
style="overflow:visible"
id="Arrow2Mend-2-5"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path39486-3-9" />
</marker>
<marker
style="overflow:visible"
id="Arrow2Mend-2-5-2"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path39486-3-9-8" />
</marker>
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient40704"
id="linearGradient40706"
x1="324.58157"
y1="127.35331"
x2="363.61096"
y2="98.957848"
gradientUnits="userSpaceOnUse" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5-6-1-7" />
<rect
x="309.01868"
y="43.698616"
width="552.19421"
height="71.39035"
id="rect65238-1" />
</defs>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1">
<rect
style="fill:#d5d5ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
id="rect55834"
width="250.31726"
height="74.676537"
x="25.257824"
y="97.277718" />
<rect
style="fill:#d5f6ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
id="rect55832"
width="250.35208"
height="64.461151"
x="25.256891"
y="32.817505" />
<rect
style="fill:#ccffaa;stroke:#091600;stroke-width:1.31891"
id="rect6462"
width="82.385742"
height="20.525751"
x="28.355024"
y="48.740646" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,17.244577,26.206534)"
id="text5048"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82948"><tspan
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
id="tspan82946">gather-bench</tspan></tspan></text>
<rect
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9"
width="18.764017"
height="20.965076"
x="39.518955"
y="140.726" />
<text
xml:space="preserve"
transform="matrix(0.33667319,0,0,0.33667319,25.589293,109.42998)"
id="text5048-3"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82950">L1</tspan></text>
<rect
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9-0"
width="21.653919"
height="24.193966"
x="97.687294"
y="138.51564" />
<text
xml:space="preserve"
transform="matrix(0.3885252,0,0,0.3885252,81.212654,102.39964)"
id="text5048-3-6"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82952">L2</tspan></text>
<rect
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9-0-6"
width="27.217058"
height="30.409672"
x="149.19933"
y="134.83977" />
<text
xml:space="preserve"
transform="matrix(0.48834178,0,0,0.48834178,128.49215,89.445174)"
id="text5048-3-6-1"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82954">L3</tspan></text>
<rect
style="fill:#eeaaff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9-0-6-7"
width="61.032539"
height="29.96501"
x="204.01265"
y="135.61238" />
<text
xml:space="preserve"
transform="matrix(0.48834178,0,0,0.48834178,182.37007,89.995434)"
id="text5048-3-6-1-9"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2-8);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82956">DRAM</tspan></text>
<rect
style="fill:#ffccaa;stroke:#091600;stroke-width:1.10636"
id="rect6462-6"
width="74.980759"
height="15.869514"
x="126.09525"
y="38.773243" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,115.65481,14.295323)"
id="text5048-7"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82958">Single gather</tspan></text>
<rect
style="fill:#ffccaa;stroke:#091600;stroke-width:1.03971"
id="rect6462-6-3"
width="66.071701"
height="15.904838"
x="126.90776"
y="63.642746" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,116.63325,39.114393)"
id="text5048-7-5"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82960">MD gathers</tspan></text>
<rect
style="fill:#afe9dd;stroke:#091600;stroke-width:1.02848"
id="rect6462-6-3-2"
width="64.479698"
height="15.947394"
x="206.65364"
y="52.98967" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,196.01512,28.482594)"
id="text5048-7-5-9"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82962">Contiguous</tspan></text>
<rect
style="fill:#afe9dd;stroke:#091600;stroke-width:0.987323"
id="rect6462-6-3-2-2"
width="59.269382"
height="15.988551"
x="208.16559"
y="76.856781" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,197.58604,52.220445)"
id="text5048-7-5-9-7"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82964">&quot;Random&quot;</tspan></text>
<text
xml:space="preserve"
transform="scale(0.26458333)"
id="text24252"
style="fill:black;fill-opacity:1;stroke:none;font-family:sans-serif;font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect24254)" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="M 193.10512,71.273276 206.30683,61.033513"
id="path39049"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="M 193.08841,71.196939 207.86207,84.43804"
id="path39053"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.39816;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 58.548229,151.24436 38.298093,0.25023"
id="path39219"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.24847;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 119.19252,150.09399 29.28333,0.26095"
id="path39219-2"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 177.02022,150.44367 26.36623,0.26095"
id="path39219-2-0"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend)"
d="m 48.145458,92.71788 -0.644819,47.57709"
id="path39377"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2)"
d="M 48.121208,92.873762 106.60807,137.41946"
id="path39377-7"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5)"
d="M 48.073928,92.825143 158.88023,133.04546"
id="path39377-7-2"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5-2)"
d="M 48.051946,92.813593 233.0959,134.16596"
id="path39377-7-2-9"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<rect
style="fill:#e9afaf;stroke:#091600;stroke-width:1.34518"
id="rect6462-6-3-2-2-3"
width="65.880661"
height="26.700579"
x="38.104012"
y="80.530182" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 77.365612,69.678744 h 2e-6"
id="path39808"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 111.64767,59.183009 6.84466,0.03069"
id="path41004"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 119.03378,47.056357 -0.58704,25.198541"
id="path41006"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.02423;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 118.07503,72.254897 7.94998,-0.05784"
id="path41008"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.882836;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 118.26666,47.054814 7.69322,0.173925"
id="path41112"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
d="M 68.213642,69.068864 67.910274,80.302728"
id="path55728"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,-1.3782637,4.0412367)"
id="text65236"
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="309.01953"
y="90.886691"
id="tspan82968"><tspan
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
id="tspan82966">Application Level</tspan></tspan></text>
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,2.7015103,160.71919)"
id="text65236-2"
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="309.01953"
y="90.886691"
id="tspan82972"><tspan
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
id="tspan82970">Hardware Level</tspan></tspan></text>
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,2.3490396,0.57331532)"
id="text79473"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect79475);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="144.01562"
y="339.75586"
id="tspan82974">vgather </tspan><tspan
x="144.01562"
y="389.75586"
id="tspan82976">instructions</tspan></text>
</g>
</svg>

After

Width:  |  Height:  |  Size: 21 KiB

BIN
figures/gromacs_mxn_v2.pdf Normal file

Binary file not shown.

BIN
figures/gromacs_mxn_v2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 128 KiB

BIN
figures/stub_new_v3.pdf Normal file

Binary file not shown.

BIN
figures/stub_new_v3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

BIN
figures/verlet_v2.pdf Normal file

Binary file not shown.

BIN
figures/verlet_v2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

1
gather-bench Submodule

@ -0,0 +1 @@
Subproject commit 2f654cb043359197be07e0fa362324dab8899a33

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -45,7 +45,7 @@ static inline void gmx_load_simd_4xn_interactions(
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ begin\n");
int Nlocal = atom->Nlocal;
int *neighs;
NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@ -66,7 +66,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
#pragma omp for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci);
@ -77,7 +77,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
int numneighs = neighbor->numneigh[ci];
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@ -158,7 +158,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int *neighs;
NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@ -213,7 +213,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
#endif
*/
#pragma omp for schedule(runtime)
#pragma omp for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@ -240,9 +240,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
//int imask = neighs_imask[k];
//int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
//MD_SIMD_MASK interact0;
@ -331,7 +331,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
@ -401,7 +401,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int *neighs;
NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@ -427,7 +427,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
#pragma omp for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@ -454,8 +454,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;
@ -506,7 +507,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
@ -569,7 +570,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal;
int *neighs;
NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@ -595,7 +596,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
#pragma omp for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@ -634,8 +635,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@ -739,8 +741,9 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
@ -843,7 +846,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal;
int *neighs;
NeighborCluster* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
@ -869,7 +872,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
{
LIKWID_MARKER_START("force");
#pragma omp for schedule(runtime)
#pragma omp for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
@ -908,8 +911,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs_masked; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
@ -987,8 +991,9 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
}
for(int k = numneighs_masked; k < numneighs; k++) {
int cj = neighs[k];
int cj = neighs[k].cj;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int imask = neighs[k].imask;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -25,6 +25,11 @@
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
/*
 * One entry of a cluster neighbor list: a neighboring j-cluster together with
 * the interaction mask that applies to the (i-cluster, j-cluster) pair.
 */
typedef struct {
/* Index of the neighboring j-cluster; the dummy j-cluster index is stored for padding entries. */
int cj;
/* Interaction mask for this pair: NBNXN_INTERACTION_MASK_ALL when nothing is excluded, 0 for padding entries.
 * NOTE(review): presumably one bit per (i-atom, j-atom) pair -- confirm against the mask helpers. */
unsigned int imask;
} NeighborCluster;
typedef struct {
int every;
int ncalls;
@ -32,8 +37,7 @@ typedef struct {
int* numneigh;
int* numneigh_masked;
int half_neigh;
int* neighbors;
unsigned int* neighbors_imask;
NeighborCluster* neighbors;
} Neighbor;
extern void initNeighbor(Neighbor*, Parameter*);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -60,15 +60,18 @@ void init(Parameter *param) {
param->eam_file = NULL;
}
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps, int masked) {
// Show debug messages.
// The message is printed through an explicit "%s" format so that it is never
// interpreted as a format string itself (a '%' in the message is printed as-is).
#define DEBUG(msg) printf("%s", msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;
const unsigned int imask = NBNXN_INTERACTION_MASK_ALL;
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(atom->Nclusters_max * maxneighs * sizeof(unsigned int));
if(pattern == P_RAND && ncj <= nneighs) {
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
@ -77,7 +80,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
int m = (pattern == P_SEQ) ? ncj : nneighs;
int k = 0;
@ -88,7 +90,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
do {
int cj = rand() % ncj;
neighptr[k] = cj;
neighptr_imask[k] = imask;
found = 0;
for(int l = 0; l < k; l++) {
if(neighptr[l] == cj) {
@ -98,7 +99,6 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
} while(found == 1);
} else {
neighptr[k] = j;
neighptr_imask[k] = imask;
j = (j + 1) % m;
}
}
@ -106,12 +106,10 @@ void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, i
for(int r = 1; r < nreps; r++) {
for(int k = 0; k < nneighs; k++) {
neighptr[r * nneighs + k] = neighptr[k];
neighptr_imask[r * nneighs + k] = neighptr_imask[k];
}
}
neighbor->numneigh[ci] = nneighs * nreps;
neighbor->numneigh_masked[ci] = (masked == 1) ? (nneighs * nreps) : 0;
}
}
@ -127,13 +125,12 @@ int main(int argc, const char *argv[]) {
int niclusters = 256; // Number of local i-clusters
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
int masked = 0; // Use masked loop
int nreps = 1;
int csv = 0;
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG_MESSAGE("Initializing parameters...\n");
DEBUG("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
@ -159,10 +156,6 @@ int main(int argc, const char *argv[]) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-m") == 0)) {
masked = 1;
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
@ -213,11 +206,11 @@ int main(int argc, const char *argv[]) {
}
if(param.force_field == FF_EAM) {
DEBUG_MESSAGE("Initializing EAM parameters...\n");
DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG_MESSAGE("Initializing atoms...\n");
DEBUG("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
@ -233,7 +226,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG_MESSAGE("Creating atoms...\n");
DEBUG("Creating atoms...\n");
while(atom->Nmax < niclusters * iclusters_natoms) {
growAtom(atom);
}
@ -288,13 +281,13 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG_MESSAGE("Defining j-clusters...\n");
DEBUG("Defining j-clusters...\n");
defineJClusters(atom);
DEBUG_MESSAGE("Initializing neighbor lists...\n");
DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG_MESSAGE("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps, masked);
DEBUG_MESSAGE("Computing forces...\n");
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {

View File

@ -1,13 +1,11 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <omp.h>
//--
#include <likwid-marker.h>
//--
@ -119,7 +117,7 @@ int main(int argc, char** argv) {
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0) || (strcmp(argv[i], "--param") == 0)) {
if((strcmp(argv[i], "-p") == 0)) {
readParameter(&param, argv[++i]);
continue;
}
@ -310,30 +308,6 @@ int main(int argc, char** argv) {
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
int nthreads = 0;
int chunkSize = 0;
omp_sched_t schedKind;
char schedType[10];
#pragma omp parallel
#pragma omp master
{
omp_get_schedule(&schedKind, &chunkSize);
switch (schedKind)
{
case omp_sched_static: strcpy(schedType, "static"); break;
case omp_sched_dynamic: strcpy(schedType, "dynamic"); break;
case omp_sched_guided: strcpy(schedType, "guided"); break;
case omp_sched_auto: strcpy(schedType, "auto"); break;
}
nthreads = omp_get_max_threads();
}
printf("Num threads: %d\n", nthreads);
printf("Schedule: (%s,%d)\n", schedType, chunkSize);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -58,7 +58,6 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
neighbor->numneigh = NULL;
neighbor->numneigh_masked = NULL;
neighbor->neighbors = NULL;
neighbor->neighbors_imask = NULL;
}
void setupNeighbor(Parameter *param, Atom *atom) {
@ -230,13 +229,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->numneigh_masked) free(neighbor->numneigh_masked);
if(neighbor->neighbors) free(neighbor->neighbors);
if(neighbor->neighbors_imask) free(neighbor->neighbors_imask);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int*) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
}
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
@ -252,8 +248,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj1 = CJ1_FROM_CI(ci);
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
unsigned int *neighptr_imask = &(neighbor->neighbors_imask[ci * neighbor->maxneighs]);
NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int n = 0, nmasked = 0;
int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
@ -329,17 +324,15 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
imask = get_imask_simd_4xn(1, ci, cj);
#endif
if(n < neighbor->maxneighs) {
if(imask == NBNXN_INTERACTION_MASK_ALL) {
neighptr[n] = cj;
neighptr_imask[n] = imask;
} else {
neighptr[n] = neighptr[nmasked];
neighptr_imask[n] = neighptr_imask[nmasked];
neighptr[nmasked] = cj;
neighptr_imask[nmasked] = imask;
nmasked++;
}
if(imask == NBNXN_INTERACTION_MASK_ALL) {
neighptr[n].cj = cj;
neighptr[n].imask = imask;
} else {
neighptr[n].cj = neighptr[nmasked].cj;
neighptr[n].imask = neighptr[nmasked].imask;
neighptr[nmasked].cj = cj;
neighptr[nmasked].imask = imask;
nmasked++;
}
n++;
@ -364,8 +357,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
// Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
neighptr_imask[n] = 0;
neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
neighptr[n].imask = 0;
n++;
}
}
@ -382,12 +375,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
}
if(resize) {
neighbor->maxneighs = new_maxneighs * 1.2;
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2;
free(neighbor->neighbors);
free(neighbor->neighbors_imask);
neighbor->neighbors = (int *) malloc(nmax * neighbor->maxneighs * sizeof(int));
neighbor->neighbors_imask = (unsigned int *) malloc(nmax * neighbor->maxneighs * sizeof(unsigned int));
neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
}
}
@ -442,21 +433,20 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
MD_FLOAT cutsq = cutneighsq;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
unsigned int *neighs_imask = &neighbor->neighbors_imask[ci * neighbor->maxneighs];
NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
int numneighs_masked = neighbor->numneigh_masked[ci];
int k = 0;
// Remove dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1] == atom->dummy_cj) {
while(neighs[numneighs - 1].cj == atom->dummy_cj) {
numneighs--;
}
}
while(k < numneighs) {
int cj = neighs[k];
int cj = neighs[k].cj;
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
k++;
} else {
@ -471,8 +461,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
// Readd dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs] = atom->dummy_cj; // Last cluster is always a dummy cluster
neighs_imask[numneighs] = 0;
neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
neighs[numneighs].imask = 0;
numneighs++;
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -13,8 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
MEM_TRACER_INIT;
INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal;
int *neighs;
unsigned int *neighs_imask;
NeighborCluster* neighs;
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
@ -35,7 +34,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
int j = neighs[k].cj;
MEM_TRACE(j, 'R');
MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R');

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,18 +1,17 @@
CC = /opt/homebrew/Cellar/llvm/18.1.5/bin/clang
CC = clang
LINKER = $(CC)
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
# ANSI_CFLAGS += -Wextra
ANSI_CFLAGS += -Wextra
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) -Xpreprocessor -fopenmp #-g
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
ASFLAGS = #-masm=intel
ASFLAGS = -masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE
# MacOSX with Apple Silicon and homebrew
INCLUDES = -I/opt/homebrew/Cellar/libomp/18.1.5/include/
LIBS = -lm -L/opt/homebrew/Cellar/libomp/18.1.5/lib/ -lomp
INCLUDES =
LIBS = -lm #-lomp

11
include_GROMACS.mk Normal file
View File

@ -0,0 +1,11 @@
# GROMACS settings, only added to the build when XTC_OUTPUT is "true".
#
# Every variable below uses ?= so it can be overridden from the environment or
# the make command line, e.g.:
#   make XTC_OUTPUT=true GROMACS_PATH=/opt/gromacs
GROMACS_PATH    ?= /apps/Gromacs/2018.1-mkl
GROMACS_INC     ?= -I${GROMACS_PATH}/include
GROMACS_DEFINES ?=
GROMACS_LIB     ?= -L${GROMACS_PATH}/lib64

# Append the GROMACS include path, defines, library and library search path
# to the common build variables only if XTC output was requested.
ifeq ($(strip $(XTC_OUTPUT)),true)
    INCLUDES += ${GROMACS_INC}
    DEFINES  += ${GROMACS_DEFINES}
    LIBS     += -lgromacs
    LFLAGS   += ${GROMACS_LIB}
endif

View File

@ -1,7 +1,7 @@
CC = icc
LINKER = $(CC)
OPENMP = -qopenmp
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
ifeq ($(ISA),AVX512)

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -502,21 +502,6 @@ int readAtom_in(Atom* atom, Parameter* param) {
return natoms;
}
/*
 * Writes all local atoms to the file named by param->write_atom_file.
 *
 * One comma-separated line is written per atom:
 *   type, 1.0, x, y, z, vx, vy, vz, 0
 * The constant columns (1.0 and the trailing 0) are written as-is.
 * NOTE(review): presumably these are placeholders expected by the matching
 * reader -- confirm against the input format.
 *
 * If the file cannot be opened, an error is reported on stderr and nothing is
 * written (previously a failed fopen led to writing through a NULL FILE*).
 */
void writeAtom(Atom *atom, Parameter *param) {
    FILE *fp = fopen(param->write_atom_file, "w");

    if(fp == NULL) {
        fprintf(stderr, "Error: could not open file for writing: %s\n", param->write_atom_file);
        return;
    }

    for(int i = 0; i < atom->Nlocal; i++) {
        fprintf(fp, "%d,%f,%f,%f,%f,%f,%f,%f,0\n",
            atom->type[i], 1.0,
            atom_x(i), atom_y(i), atom_z(i),
            atom_vx(i), atom_vy(i), atom_vz(i));
    }

    fclose(fp);
    fprintf(stdout, "Wrote input data to %s, grid size: %f, %f, %f\n",
        param->write_atom_file, param->xprd, param->yprd, param->zprd);
}
void growAtom(Atom *atom) {
DeviceAtom *d_atom = &(atom->d_atom);
int nold = atom->Nmax;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -29,7 +29,7 @@ extern "C" {
}
// cuda kernel
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh, int ntypes) {
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= Nlocal) {
return;
@ -46,10 +46,6 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
for(int k = 0; k < numneighs; k++) {
int j = neigh_neighbors[Nlocal * k + i];
MD_FLOAT delx = xtmp - atom_x(j);
@ -59,7 +55,7 @@ __global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, M
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * ntypes + type_j;
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
@ -113,7 +109,7 @@ extern "C" {
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
@ -127,7 +123,7 @@ void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
@ -140,11 +136,13 @@ void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
}
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
int Nlocal = atom->Nlocal;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
/*
int nDevices;
@ -167,7 +165,7 @@ double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neig
double S = getTimeStamp();
LIKWID_MARKER_START("force");
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh, atom->ntypes);
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
cuda_assert("calc_force", cudaPeekAtLastError());
cuda_assert("calc_force", cudaDeviceSynchronize());
cudaProfilerStop();

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -120,7 +120,7 @@ __global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins
__global__ void compute_neighborhood(
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq, int ntypes) {
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nlocal) {
@ -157,7 +157,7 @@ __global__ void compute_neighborhood(
#ifdef EXPLICIT_TYPES
int type_j = atom->type[j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * ntypes + type_j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
#else
const MD_FLOAT cutoff = cutneighsq;
#endif
@ -206,7 +206,7 @@ void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbo
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
int nall = atom->Nlocal + atom->Nghost;
cudaProfilerStart();
@ -269,7 +269,7 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
c_new_maxneighs,
cutneighsq, atom->ntypes);
cutneighsq);
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -65,7 +65,7 @@ __global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
if(reneigh) {
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
@ -98,7 +98,7 @@ void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
}
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
const int num_threads_per_block = get_cuda_num_threads();
const int num_threads_per_block = get_num_threads();
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -14,7 +14,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutneighsq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
@ -23,7 +22,6 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutneighsq, atom->cutneighsq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

272
lammps/force_lj.c Normal file
View File

@ -0,0 +1,272 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
//---
#include <atom.h>
#include <likwid-marker.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#ifdef __SIMD_KERNEL__
#include <simd.h>
#endif
/*
 * Plain C Lennard-Jones force kernel in which every pair (i, j) only
 * contributes to the force of atom i.
 *
 * Forces of all local atoms are first reset to zero. Then, for each local atom
 * i, all entries of its neighbor list are visited and the LJ force is
 * accumulated for neighbors whose squared distance is below the squared cutoff.
 *
 * Without EXPLICIT_TYPES the cutoff, sigma6 and epsilon are taken from param;
 * with EXPLICIT_TYPES they are looked up per type pair in the atom tables.
 *
 * param:    simulation parameters (cutforce, sigma6, epsilon)
 * atom:     atom data; positions are read, forces are overwritten
 * neighbor: neighbor lists (neighbors, numneigh, maxneighs)
 * stats:    statistics counters updated through addStat
 *
 * Returns the time spent in the force loop as the difference of two
 * getTimeStamp() values.
 */
double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
    #ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
    #endif
    const MD_FLOAT num1 = 1.0;
    const MD_FLOAT num48 = 48.0;
    const MD_FLOAT num05 = 0.5;

    // Reset forces of all local atoms before accumulation
    for(int i = 0; i < Nlocal; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }

    double S = getTimeStamp();

    #pragma omp parallel
    {
    LIKWID_MARKER_START("force");

    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        // Declared inside the loop so that each thread has its own pointer;
        // a function-scope variable would be shared in the parallel region
        // and written concurrently by all threads.
        const int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
        MD_FLOAT xtmp = atom_x(i);
        MD_FLOAT ytmp = atom_y(i);
        MD_FLOAT ztmp = atom_z(i);
        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;

        #ifdef EXPLICIT_TYPES
        const int type_i = atom->type[i];
        #endif

        for(int k = 0; k < numneighs; k++) {
            int j = neighs[k];
            MD_FLOAT delx = xtmp - atom_x(j);
            MD_FLOAT dely = ytmp - atom_y(j);
            MD_FLOAT delz = ztmp - atom_z(j);
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

            #ifdef EXPLICIT_TYPES
            const int type_j = atom->type[j];
            const int type_ij = type_i * atom->ntypes + type_j;
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
            #endif

            if(rsq < cutforcesq) {
                MD_FLOAT sr2 = num1 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;
            #ifdef USE_REFERENCE_VERSION
                addStat(stats->atoms_within_cutoff, 1);
            } else {
                addStat(stats->atoms_outside_cutoff, 1);
            #endif
            }
        }

        // Only atom i is written here, so iterations do not touch each other's forces
        atom_fx(i) += fix;
        atom_fy(i) += fiy;
        atom_fz(i) += fiz;

        addStat(stats->total_force_neighs, numneighs);
        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
    }

    LIKWID_MARKER_STOP("force");
    }

    double E = getTimeStamp();
    return E-S;
}
/*
 * Lennard-Jones force kernel in which every pair (i, j) within the cutoff
 * contributes to atom i and, with opposite sign, to atom j when j is a local
 * atom (j < Nlocal).
 *
 * Forces of all local atoms are first reset to zero. Without EXPLICIT_TYPES
 * the cutoff, sigma6 and epsilon are taken from param; with EXPLICIT_TYPES
 * they are looked up per type pair in the atom tables.
 *
 * param:    simulation parameters (cutforce, sigma6, epsilon)
 * atom:     atom data; positions are read, forces are overwritten
 * neighbor: neighbor lists (neighbors, numneigh, maxneighs)
 * stats:    statistics counters updated through addStat
 *
 * Returns the time spent in the force loop as the difference of two
 * getTimeStamp() values.
 */
double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
    #ifndef EXPLICIT_TYPES
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;
    #endif
    const MD_FLOAT num1 = 1.0;
    const MD_FLOAT num48 = 48.0;
    const MD_FLOAT num05 = 0.5;

    // Reset forces of all local atoms before accumulation
    for(int i = 0; i < Nlocal; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }

    double S = getTimeStamp();

    #pragma omp parallel
    {
    LIKWID_MARKER_START("forceLJ-halfneigh");

    #pragma omp for
    for(int i = 0; i < Nlocal; i++) {
        // Declared inside the loop so that each thread has its own pointer;
        // a function-scope variable would be shared in the parallel region
        // and written concurrently by all threads.
        const int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
        int numneighs = neighbor->numneigh[i];
        MD_FLOAT xtmp = atom_x(i);
        MD_FLOAT ytmp = atom_y(i);
        MD_FLOAT ztmp = atom_z(i);
        MD_FLOAT fix = 0;
        MD_FLOAT fiy = 0;
        MD_FLOAT fiz = 0;

        #ifdef EXPLICIT_TYPES
        const int type_i = atom->type[i];
        #endif

        // Pragma required to vectorize the inner loop
        #ifdef ENABLE_OMP_SIMD
        #pragma omp simd reduction(+: fix,fiy,fiz)
        #endif
        for(int k = 0; k < numneighs; k++) {
            int j = neighs[k];
            MD_FLOAT delx = xtmp - atom_x(j);
            MD_FLOAT dely = ytmp - atom_y(j);
            MD_FLOAT delz = ztmp - atom_z(j);
            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

            #ifdef EXPLICIT_TYPES
            const int type_j = atom->type[j];
            const int type_ij = type_i * atom->ntypes + type_j;
            const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
            const MD_FLOAT sigma6 = atom->sigma6[type_ij];
            const MD_FLOAT epsilon = atom->epsilon[type_ij];
            #endif

            if(rsq < cutforcesq) {
                MD_FLOAT sr2 = num1 / rsq;
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
                fix += delx * force;
                fiy += dely * force;
                fiz += delz * force;

                // We do not need to update forces for ghost atoms
                // NOTE(review): when built with OpenMP, several threads can
                // reach the same j here (and the atom_fx(i) update below can
                // collide with another thread's j update); these writes are
                // not synchronized -- confirm this kernel is only used
                // single-threaded or add atomic updates.
                if(j < Nlocal) {
                    atom_fx(j) -= delx * force;
                    atom_fy(j) -= dely * force;
                    atom_fz(j) -= delz * force;
                }
            }
        }

        atom_fx(i) += fix;
        atom_fy(i) += fiy;
        atom_fz(i) += fiz;

        addStat(stats->total_force_neighs, numneighs);
        addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
    }

    LIKWID_MARKER_STOP("forceLJ-halfneigh");
    }

    double E = getTimeStamp();
    return E-S;
}
/*
 * Lennard-Jones force kernel for full neighbor lists, written with the
 * simd_* wrapper intrinsics.
 *
 * param    - supplies cutforce, sigma6 and epsilon (single-type interaction)
 * atom     - positions are read, forces of the Nlocal local atoms are overwritten
 * neighbor - neighbor list; atom i owns the maxneighs-sized row i of neighbors[]
 *            of which the first numneigh[i] entries are used
 * stats    - not used by this kernel
 *
 * Returns the elapsed time between the two getTimeStamp() calls surrounding
 * the force loop. If __SIMD_KERNEL__ is not defined the function prints an
 * error and terminates the program.
 */
double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    int Nlocal = atom->Nlocal;
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    // Forces are accumulated below, so start from zero
    for(int i = 0; i < Nlocal; i++) {
        atom_fx(i) = 0.0;
        atom_fy(i) = 0.0;
        atom_fz(i) = 0.0;
    }

    double S = getTimeStamp();

#ifndef __SIMD_KERNEL__
    fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
    exit(-1);
#else
    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);

    #pragma omp parallel
    {
        LIKWID_MARKER_START("force");

        #pragma omp for
        for(int i = 0; i < Nlocal; i++) {
            // Declared inside the loop so that every thread has its own copy;
            // a function-scope pointer would be shared by all threads of the
            // parallel region and written concurrently (data race).
            int *neighs = &neighbor->neighbors[i * neighbor->maxneighs];
            int numneighs = neighbor->numneigh[i];
            MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
            MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i));
            MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i));
            MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i));
            MD_SIMD_FLOAT fix = simd_zero();
            MD_SIMD_FLOAT fiy = simd_zero();
            MD_SIMD_FLOAT fiz = simd_zero();

            for(int k = 0; k < numneighs; k += VECTOR_WIDTH) {
                // Lanes with k + lane >= numneighs are masked out (remainder handling).
                // If the last iteration of this loop is separated from the rest, this mask can be set only there
                MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(simd_int_add(simd_int_broadcast(k), simd_int_seq()), numneighs_vec);
                MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
#ifdef AOS
                // AoS layout: x, y and z of atom j are stored at 3*j + {0, 1, 2}
                MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3
                MD_SIMD_FLOAT delx = xtmp - simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
                MD_SIMD_FLOAT dely = ytmp - simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
                MD_SIMD_FLOAT delz = ztmp - simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
#else
                MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
                MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
                MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
#endif
                MD_SIMD_FLOAT rsq = simd_fma(delx, delx, simd_fma(dely, dely, simd_mul(delz, delz)));
                // A lane contributes only if it is a valid neighbor AND within the cutoff
                MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, simd_mask_cond_lt(rsq, cutforcesq_vec));
                // The force is evaluated for all lanes; inactive lanes are dropped by the masked adds
                MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq);
                MD_SIMD_FLOAT sr6 = simd_mul(sr2, simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
                MD_SIMD_FLOAT force = simd_mul(c48_vec, simd_mul(sr6, simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));

                fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
                fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
                fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
            }

            // Collapse the per-lane partial sums into the force on atom i
            atom_fx(i) += simd_h_reduce_sum(fix);
            atom_fy(i) += simd_h_reduce_sum(fiy);
            atom_fz(i) += simd_h_reduce_sum(fiz);
        }

        LIKWID_MARKER_STOP("force");
    }
#endif

    double E = getTimeStamp();
    return E-S;
}

102
lammps/includes/atom.h Normal file
View File

@ -0,0 +1,102 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#ifndef __ATOM_H_
#define __ATOM_H_
/* Compile-time kernel selection: the generic routine names used by callers
 * are mapped onto the _cuda variants when CUDA_TARGET is defined and onto
 * the CPU variants otherwise. On the CPU path USE_SIMD_KERNEL additionally
 * chooses between the _simd and _plain_c full-neighbor LJ kernels.
 * KERNEL_NAME is a string naming the selected variant. */
#ifdef CUDA_TARGET
# define KERNEL_NAME "CUDA"
# define computeForceLJFullNeigh computeForceLJFullNeigh_cuda
# define initialIntegrate initialIntegrate_cuda
# define finalIntegrate finalIntegrate_cuda
# define buildNeighbor buildNeighbor_cuda
# define updatePbc updatePbc_cuda
# define updateAtomsPbc updateAtomsPbc_cuda
#else
# ifdef USE_SIMD_KERNEL
# define KERNEL_NAME "SIMD"
# define computeForceLJFullNeigh computeForceLJFullNeigh_simd
# else
# define KERNEL_NAME "plain-C"
# define computeForceLJFullNeigh computeForceLJFullNeigh_plain_c
# endif
# define initialIntegrate initialIntegrate_cpu
# define finalIntegrate finalIntegrate_cpu
# define buildNeighbor buildNeighbor_cpu
# define updatePbc updatePbc_cpu
# define updateAtomsPbc updateAtomsPbc_cpu
#endif
/* Pointer set with the same array members as Atom (positions, velocities,
 * forces, border map, types and per-type-pair parameters) but without the
 * counters. It is embedded in Atom as member d_atom.
 * NOTE(review): presumably these pointers refer to GPU memory in CUDA_TARGET
 * builds - confirm against the device code. */
typedef struct {
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *fx, *fy, *fz;
int *border_map;
int *type;
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
} DeviceAtom;
/* Central atom container. Element access should go through the atom_x(),
 * atom_vx(), atom_fx(), ... macros, which hide the AoS/SoA data layout. */
typedef struct {
// Nlocal: number of local atoms; Nghost: number of ghost atoms, stored
// directly behind the local ones (index Nlocal + g); Nmax: allocated capacity.
// NOTE(review): Natoms looks like the total atom count of the system - confirm.
int Natoms, Nlocal, Nghost, Nmax;
// Positions, velocities and forces. In AOS builds the accessor macros index
// only x, vx and fx with a stride of 3; otherwise one array per component.
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *fx, *fy, *fz;
// border_map[g]: index of the local atom that ghost atom g is a copy of
int *border_map;
// Per-atom type index
int *type;
// Number of atom types; pair tables are indexed as type_i * ntypes + type_j
int ntypes;
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
// NOTE(review): presumably indexed like cutforcesq - confirm
MD_FLOAT *cutneighsq;
// DEM
// NOTE(review): fields for the DEM force field; exact meaning of av and r
// is not visible here - confirm in the DEM kernel
MD_FLOAT *radius;
MD_FLOAT *av;
MD_FLOAT *r;
// Device data
DeviceAtom d_atom;
} Atom;
/* Atom container API.
 * NOTE(review): the readAtom_* variants presumably handle the file formats
 * their suffix names (pdb, gro, dmp, in); the meaning of the int return
 * values is not visible here - confirm in atom.c. */
extern void initAtom(Atom*);
extern void createAtom(Atom*, Parameter*);
extern int readAtom(Atom*, Parameter*);
extern int readAtom_pdb(Atom*, Parameter*);
extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern int readAtom_in(Atom*, Parameter*);
extern void growAtom(Atom*);
/* Layout-independent element accessors. All macros expand to an lvalue and
 * expect a variable named `atom` (pointer to Atom) in the calling scope.
 * AOS: the three components of atom i are interleaved at 3*i + {0, 1, 2} in
 *      the x, vx and fx arrays (the y/z, vy/vz and fy/fz members are not
 *      touched by the macros).
 * otherwise (SoA): one separate array per component.
 * POS_DATA_LAYOUT is a string naming the active layout. */
#ifdef AOS
# define POS_DATA_LAYOUT "AoS"
# define atom_x(i) atom->x[(i) * 3 + 0]
# define atom_y(i) atom->x[(i) * 3 + 1]
# define atom_z(i) atom->x[(i) * 3 + 2]
# define atom_vx(i) atom->vx[(i) * 3 + 0]
# define atom_vy(i) atom->vx[(i) * 3 + 1]
# define atom_vz(i) atom->vx[(i) * 3 + 2]
# define atom_fx(i) atom->fx[(i) * 3 + 0]
# define atom_fy(i) atom->fx[(i) * 3 + 1]
# define atom_fz(i) atom->fx[(i) * 3 + 2]
#else
# define POS_DATA_LAYOUT "SoA"
# define atom_x(i) atom->x[i]
# define atom_y(i) atom->y[i]
# define atom_z(i) atom->z[i]
# define atom_vx(i) atom->vx[i]
# define atom_vy(i) atom->vy[i]
# define atom_vz(i) atom->vz[i]
# define atom_fx(i) atom->fx[i]
# define atom_fy(i) atom->fy[i]
# define atom_fz(i) atom->fz[i]
#endif
#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -11,7 +11,7 @@
#ifndef __PBC_H_
#define __PBC_H_
extern void initPbc(Atom*);
extern void initPbc();
extern void updatePbc_cpu(Atom*, Parameter*, bool);
extern void updateAtomsPbc_cpu(Atom*, Parameter*);
extern void setupPbc(Atom*, Parameter*);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -59,6 +59,12 @@ void init(Parameter *param) {
param->eam_file = NULL;
}
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
@ -119,7 +125,7 @@ int main(int argc, const char *argv[]) {
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG_MESSAGE("Initializing parameters...\n");
DEBUG("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
@ -190,11 +196,11 @@ int main(int argc, const char *argv[]) {
}
if(param.force_field == FF_EAM) {
DEBUG_MESSAGE("Initializing EAM parameters...\n");
DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG_MESSAGE("Initializing atoms...\n");
DEBUG("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
@ -210,7 +216,7 @@ int main(int argc, const char *argv[]) {
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG_MESSAGE("Creating atoms...\n");
DEBUG("Creating atoms...\n");
for(int i = 0; i < natoms; ++i) {
while(atom->Nlocal > atom->Nmax - natoms) {
growAtom(atom);
@ -241,11 +247,11 @@ int main(int argc, const char *argv[]) {
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG_MESSAGE("Initializing neighbor lists...\n");
DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG_MESSAGE("Creating neighbor lists...\n");
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG_MESSAGE("Computing forces...\n");
DEBUG("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {

285
lammps/main.c Normal file
View File

@ -0,0 +1,285 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <math.h>
#include <float.h>
#include <likwid-marker.h>
#include <allocate.h>
#include <atom.h>
#include <device.h>
#include <eam.h>
#include <integrate.h>
#include <thermo.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <pbc.h>
#include <stats.h>
#include <timers.h>
#include <util.h>
#include <vtk.h>
#define HLINE "----------------------------------------------------------------------------\n"
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
#ifdef CUDA_TARGET
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
#endif
/*
 * One-time initialisation of the simulation: derives the box dimensions from
 * the density, initialises all modules, creates or reads the atoms, sets up
 * the periodic boundary ghosts and builds the first neighbor list.
 *
 * Returns the elapsed time measured from just before initAtom() until the
 * neighbor list has been built (EAM initialisation and the box size
 * computation are not part of the measured interval).
 */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
    if(param->force_field == FF_EAM) {
        initEam(eam, param);
    }

    // Lattice constant from the density, box lengths from the unit cell counts
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    const double tStart = getTimeStamp();

    initAtom(atom);
    initPbc(atom);
    initStats(stats);
    initNeighbor(neighbor, param);

    // Atoms either come from the input file or are generated
    const int generated = (param->input_file == NULL);
    if(generated) {
        createAtom(atom, param);
    } else {
        readAtom(atom, param);
    }

    setupNeighbor(param);
    setupThermo(param, atom->Natoms);
    if(generated) {
        adjustThermo(param, atom);
    }

    setupPbc(atom, param);
    initDevice(atom, neighbor);
    updatePbc(atom, param, true);
    buildNeighbor(atom, neighbor);

    return getTimeStamp() - tStart;
}
/*
 * Reneighboring step: wraps local atoms back into the box, recreates the
 * ghost atom mapping, refreshes the ghost positions and rebuilds the
 * neighbor list. The work is enclosed in the LIKWID region "reneighbour".
 *
 * Returns the elapsed time of the whole step.
 */
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
    const double tBegin = getTimeStamp();

    LIKWID_MARKER_START("reneighbour");
    updateAtomsPbc(atom, param);
    setupPbc(atom, param);
    updatePbc(atom, param, true);
    // Sorting of atoms is currently disabled:
    //sortAtom(atom);
    buildNeighbor(atom, neighbor);
    LIKWID_MARKER_STOP("reneighbour");

    const double tEnd = getTimeStamp();
    return tEnd - tBegin;
}
/* Debug helper: prints the four atom counters of the container to stdout. */
void printAtomState(Atom *atom) {
    const int total    = atom->Natoms;
    const int local    = atom->Nlocal;
    const int ghost    = atom->Nghost;
    const int capacity = atom->Nmax;

    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", total, local, ghost, capacity);

    // Optional position dump (disabled):
    // int nall = atom->Nlocal + atom->Nghost;
    // for (int i=0; i<nall; i++) {
    //     printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
    // }
}
/*
 * Dispatches to the force kernel selected by param->force_field and
 * param->half_neigh and returns the value returned by that kernel.
 * DEM combined with half neighbor lists is not supported: an error is
 * printed and 0.0 is returned.
 */
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    if(param->force_field == FF_EAM) {
        return computeForceEam(eam, param, atom, neighbor, stats);
    }

    if(param->force_field == FF_DEM) {
        if(!param->half_neigh) {
            return computeForceDemFullNeigh(param, atom, neighbor, stats);
        }

        fprintf(stderr, "Error: DEM cannot use half neighbor-lists!\n");
        return 0.0;
    }

    // Everything else is treated as Lennard-Jones
    if(param->half_neigh) {
        return computeForceLJHalfNeigh(param, atom, neighbor, stats);
    }

    // The CUDA variant of the full-neighbor kernel takes no Stats argument
#ifdef CUDA_TARGET
    return computeForceLJFullNeigh(param, atom, neighbor);
#else
    return computeForceLJFullNeigh(param, atom, neighbor, stats);
#endif
}
/*
 * Writes the current state of the local atoms to the file "input.in" in the
 * working directory: a first line with the box extents followed by one line
 * per local atom with a leading 1, the position and the velocity.
 *
 * If the file cannot be opened an error is reported on stderr and nothing
 * is written.
 */
void writeInput(Parameter *param, Atom *atom) {
    FILE *fpin = fopen("input.in", "w");

    // fopen() returns NULL on failure (e.g. read-only directory); writing to
    // a NULL stream would be undefined behavior
    if(fpin == NULL) {
        fprintf(stderr, "Error: could not open input.in for writing!\n");
        return;
    }

    fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);

    for(int i = 0; i < atom->Nlocal; i++) {
        fprintf(fpin, "1,%f,%f,%f,%f,%f,%f\n", atom_x(i), atom_y(i), atom_z(i), atom_vx(i), atom_vy(i), atom_vz(i));
    }

    fclose(fpin);
}
/*
 * Returns the value that follows the option at argv[*i] and advances *i to
 * it. Terminates the program with an error message when the option is the
 * last command line argument, instead of reading past the end of argv.
 */
static char* requireOptionValue(int argc, char** argv, int* i) {
    if(*i + 1 >= argc) {
        fprintf(stderr, "Error: option %s requires an argument!\n", argv[*i]);
        exit(-1);
    }

    *i += 1;
    return argv[*i];
}

/*
 * Program entry point: parses the command line, sets up the simulation,
 * runs param.ntimes timesteps and prints timing and performance results.
 */
int main(int argc, char** argv) {
    double timer[NUMTIMER];
    Eam eam;
    Atom atom;
    Neighbor neighbor;
    Stats stats;
    Parameter param;

    LIKWID_MARKER_INIT;
    #pragma omp parallel
    {
        LIKWID_MARKER_REGISTER("force");
        //LIKWID_MARKER_REGISTER("reneighbour");
        //LIKWID_MARKER_REGISTER("pbc");
    }

    initParameter(&param);

    // Command line parsing; options given later override earlier ones
    for(int i = 0; i < argc; i++) {
        if((strcmp(argv[i], "-p") == 0)) {
            readParameter(&param, requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-f") == 0)) {
            if((param.force_field = str2ff(requireOptionValue(argc, argv, &i))) < 0) {
                fprintf(stderr, "Invalid force field!\n");
                exit(-1);
            }
            continue;
        }
        if((strcmp(argv[i], "-i") == 0)) {
            param.input_file = strdup(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-e") == 0)) {
            param.eam_file = strdup(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
            param.ntimes = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nx") == 0)) {
            param.nx = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-ny") == 0)) {
            param.ny = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-nz") == 0)) {
            param.nz = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-half") == 0)) {
            param.half_neigh = atoi(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
            param.cutforce = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
            param.skin = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--freq") == 0)) {
            param.proc_freq = atof(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "--vtk") == 0)) {
            param.vtk_file = strdup(requireOptionValue(argc, argv, &i));
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-p <string>: file to read parameters from (can be specified more than once)\n");
            printf("-f <string>: force field (lj, eam or dem), default lj\n");
            printf("-i <string>: input file with atom positions (dump)\n");
            printf("-e <string>: input file for EAM\n");
            printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
            printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
            printf("-half <int>: use half neighbor-lists if non-zero\n");
            printf("-r / --radius <real>: set cutoff radius\n");
            printf("-s / --skin <real>: set skin (verlet buffer)\n");
            printf("--freq <real>: processor frequency (GHz)\n");
            printf("--vtk <string>: VTK file for visualization\n");
            printf(HLINE);
            exit(EXIT_SUCCESS);
        }
    }

    // The neighbor cutoff is the force cutoff extended by the skin
    param.cutneigh = param.cutforce + param.skin;
    setup(&param, &eam, &atom, &neighbor, &stats);
    printParameter(&param);
    printf(HLINE);

    printf("step\ttemp\t\tpressure\n");
    computeThermo(0, &param, &atom);
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
    // Initial state is traced as step 0 (the loop below traces steps 1..ntimes);
    // the loop counter n does not exist yet at this point
    traceAddresses(&param, &atom, &neighbor, 0);
#endif

    //writeInput(&param, &atom);

    // Initial force computation, counted towards the FORCE timer but not
    // towards the TOTAL timer, which starts afterwards
    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
    timer[NEIGH] = 0.0;
    timer[TOTAL] = getTimeStamp();

    if(param.vtk_file != NULL) {
        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
    }

    for(int n = 0; n < param.ntimes; n++) {
        // True on every reneigh_every-th timestep
        bool reneigh = (n + 1) % param.reneigh_every == 0;
        initialIntegrate(reneigh, &param, &atom);

        if(!reneigh) {
            // Only the ghost atom positions need to follow their originals
            updatePbc(&atom, &param, false);
        } else {
            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
        }

#if defined(MEM_TRACER) || defined(INDEX_TRACER)
        traceAddresses(&param, &atom, &neighbor, n + 1);
#endif

        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
        finalIntegrate(reneigh, &param, &atom);

        // Periodic thermo output; the final step is reported after the loop
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
#ifdef CUDA_TARGET
            memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
#endif
            computeThermo(n + 1, &param, &atom);
        }

        if(param.vtk_file != NULL) {
            write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
        }
    }

    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    computeThermo(-1, &param, &atom);

    printf(HLINE);
    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
        timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
    printf(HLINE);
    printf("Performance: %.2f million atom updates per second\n",
        1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
#endif
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
@ -326,45 +326,45 @@ void sortAtom(Atom* atom) {
int Nmax = atom->Nmax;
int* binpos = bincount;
for(int i = 1; i < mbins; i++) {
binpos[i] += binpos[i - 1];
for(int i=1; i<mbins; i++) {
binpos[i] += binpos[i-1];
}
#ifdef AOS
#ifdef AOS
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
#else
#else
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
#endif
#endif
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
for(int mybin = 0; mybin < mbins; mybin++) {
int start = mybin > 0 ? binpos[mybin - 1] : 0;
for(int mybin = 0; mybin<mbins; mybin++) {
int start = mybin>0?binpos[mybin-1]:0;
int count = binpos[mybin] - start;
for(int k = 0; k < count; k++) {
for(int k=0; k<count; k++) {
int new_i = start + k;
int old_i = bins[mybin * atoms_per_bin + k];
#ifdef AOS
#ifdef AOS
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
new_vx[new_i * 3 + 1] = old_vx[old_i * 3 + 1];
new_vx[new_i * 3 + 2] = old_vx[old_i * 3 + 2];
#else
#else
new_x[new_i] = old_x[old_i];
new_y[new_i] = old_y[old_i];
new_z[new_i] = old_z[old_i];
new_vx[new_i] = old_vx[old_i];
new_vy[new_i] = old_vy[old_i];
new_vz[new_i] = old_vz[old_i];
#endif
#endif
}
}
@ -372,7 +372,7 @@ void sortAtom(Atom* atom) {
free(atom->vx);
atom->x = new_x;
atom->vx = new_vx;
#ifndef AOS
#ifndef AOS
free(atom->y);
free(atom->z);
free(atom->vy);
@ -381,5 +381,5 @@ void sortAtom(Atom* atom) {
atom->z = new_z;
atom->vy = new_vy;
atom->vz = new_vz;
#endif
#endif
}

171
lammps/pbc.c Normal file
View File

@ -0,0 +1,171 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdbool.h>
#include <stdlib.h>
#include <stdio.h>
//---
#include <pbc.h>
#include <atom.h>
#include <allocate.h>
#define DELTA 20000
int NmaxGhost;
int *PBCx, *PBCy, *PBCz;
static void growPbc(Atom*);
/* exported subroutines */
/* Resets the PBC module: no ghost capacity and no ghost arrays allocated yet. */
void initPbc(Atom* atom) {
    PBCx = NULL;
    PBCy = NULL;
    PBCz = NULL;
    atom->border_map = NULL;
    NmaxGhost = 0;
}
/* Recompute the position of every ghost atom from the local atom it copies,
 * shifted by whole box lengths. Relies on the mapping (border_map) and the
 * shift multipliers (PBCx/PBCy/PBCz) created in setupPbc.
 * The doReneighbor argument is not used by this implementation. */
void updatePbc_cpu(Atom *atom, Parameter *param, bool doReneighbor) {
    const MD_FLOAT box_x = param->xprd;
    const MD_FLOAT box_y = param->yprd;
    const MD_FLOAT box_z = param->zprd;
    const int *source = atom->border_map;
    const int first_ghost = atom->Nlocal;

    for(int g = 0; g < atom->Nghost; g++) {
        const int owner = source[g];           // local atom this ghost mirrors
        const int ghost = first_ghost + g;     // ghosts are stored behind the local atoms

        atom_x(ghost) = atom_x(owner) + PBCx[g] * box_x;
        atom_y(ghost) = atom_y(owner) + PBCy[g] * box_y;
        atom_z(ghost) = atom_z(owner) + PBCz[g] * box_z;
    }
}
/* relocate atoms that have left domain according
* to periodic boundary conditions */
void updateAtomsPbc_cpu(Atom *atom, Parameter *param) {
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int i = 0; i < atom->Nlocal; i++) {
if(atom_x(i) < 0.0) {
atom_x(i) += xprd;
} else if(atom_x(i) >= xprd) {
atom_x(i) -= xprd;
}
if(atom_y(i) < 0.0) {
atom_y(i) += yprd;
} else if(atom_y(i) >= yprd) {
atom_y(i) -= yprd;
}
if(atom_z(i) < 0.0) {
atom_z(i) += zprd;
} else if(atom_z(i) >= zprd) {
atom_z(i) -= zprd;
}
}
}
/* setup periodic boundary conditions by
 * defining ghost atoms around domain
 * only creates mapping and coordinate corrections
 * that are then enforced in updatePbc */

/* Appends one ghost atom for local atom i that is shifted by (dx,dy,dz) box
 * lengths. Expects Nghost, border_map, i and atom in the calling scope and
 * expands to several statements, so it must only be used inside braces. */
#define ADDGHOST(dx,dy,dz) \
    Nghost++; \
    border_map[Nghost] = i; \
    PBCx[Nghost] = dx; \
    PBCy[Nghost] = dy; \
    PBCz[Nghost] = dz; \
    atom->type[atom->Nlocal + Nghost] = atom->type[i]

/*
 * Creates the ghost atom mapping for all local atoms that lie within
 * Cutneigh of a periodic box face, edge or corner. Only border_map, the
 * PBCx/PBCy/PBCz shift multipliers, the ghost types and atom->Nghost are
 * written; the ghost positions are filled in by updatePbc.
 *
 * A ghost with shift +1 along an axis is created for atoms closer than
 * Cutneigh to the lower face (coordinate < Cutneigh), a ghost with shift -1
 * for atoms within Cutneigh of the upper face (coordinate >= prd - Cutneigh).
 */
void setupPbc(Atom *atom, Parameter *param) {
    int *border_map = atom->border_map;
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;
    MD_FLOAT Cutneigh = param->cutneigh;
    // Index of the last ghost created so far; ADDGHOST pre-increments it
    int Nghost = -1;

    for(int i = 0; i < atom->Nlocal; i++) {
        // Reserve room for the ghosts of this atom before creating them.
        // NOTE(review): 7 slots (3 faces + 3 edges + 1 corner) suffice only if
        // an atom cannot be near both the lower and upper face of an axis,
        // i.e. each box length exceeds 2 * Cutneigh - confirm.
        if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
            growAtom(atom);
        }

        if (Nghost + 7 >= NmaxGhost) {
            growPbc(atom);
            // growPbc reallocates the map, refresh the cached pointer
            border_map = atom->border_map;
        }

        MD_FLOAT x = atom_x(i);
        MD_FLOAT y = atom_y(i);
        MD_FLOAT z = atom_z(i);

        /* Setup ghost atoms */
        /* 6 planes */
        if(param->pbc_x != 0) {
            if (x < Cutneigh)         { ADDGHOST(+1,0,0); }
            if (x >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
        }

        if(param->pbc_y != 0) {
            if (y < Cutneigh)         { ADDGHOST(0,+1,0); }
            if (y >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
        }

        if(param->pbc_z != 0) {
            if (z < Cutneigh)         { ADDGHOST(0,0,+1); }
            if (z >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
        }

        /* 8 corners */
        if(param->pbc_x != 0 && param->pbc_y != 0 && param->pbc_z != 0) {
            if (x < Cutneigh         && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(+1,+1,+1); }
            if (x < Cutneigh         && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(+1,-1,+1); }
            // A +1 shift in y belongs to atoms near the lower y face (y < Cutneigh)
            if (x < Cutneigh         && y < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
            if (x < Cutneigh         && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
            if (x >= (xprd-Cutneigh) && y < Cutneigh         && z < Cutneigh)         { ADDGHOST(-1,+1,+1); }
            if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(-1,-1,+1); }
            if (x >= (xprd-Cutneigh) && y < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
            if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
        }

        /* 12 edges */
        if(param->pbc_x != 0 && param->pbc_z != 0) {
            if (x < Cutneigh         && z < Cutneigh)         { ADDGHOST(+1,0,+1); }
            if (x < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
            if (x >= (xprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(-1,0,+1); }
            if (x >= (xprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
        }

        if(param->pbc_y != 0 && param->pbc_z != 0) {
            if (y < Cutneigh         && z < Cutneigh)         { ADDGHOST(0,+1,+1); }
            if (y < Cutneigh         && z >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
            if (y >= (yprd-Cutneigh) && z < Cutneigh)         { ADDGHOST(0,-1,+1); }
            if (y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
        }

        if(param->pbc_x != 0 && param->pbc_y != 0) {
            if (y < Cutneigh         && x < Cutneigh)         { ADDGHOST(+1,+1,0); }
            if (y < Cutneigh         && x >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
            if (y >= (yprd-Cutneigh) && x < Cutneigh)         { ADDGHOST(+1,-1,0); }
            if (y >= (yprd-Cutneigh) && x >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
        }
    }

    // increase by one to make it the ghost atom count
    atom->Nghost = Nghost + 1;
}
/* internal subroutines */
/* Enlarges the capacity of the ghost bookkeeping arrays (border_map and the
 * three shift multiplier arrays) by DELTA entries. */
void growPbc(Atom* atom) {
    const int previous = NmaxGhost;
    NmaxGhost = previous + DELTA;

    // Byte sizes before and after growing, identical for all four arrays
    const size_t old_bytes = previous * sizeof(int);
    const size_t new_bytes = NmaxGhost * sizeof(int);

    atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, new_bytes, old_bytes);
    PBCx = (int*) reallocate(PBCx, ALIGNMENT, new_bytes, old_bytes);
    PBCy = (int*) reallocate(PBCy, ALIGNMENT, new_bytes, old_bytes);
    PBCz = (int*) reallocate(PBCz, ALIGNMENT, new_bytes, old_bytes);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.

View File

@ -0,0 +1,88 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Initializing parameters...
Initializing atoms...
Creating atoms...
Pattern: seq
Number of timesteps: 200
Number of atoms: 256
Number of neighbors per atom: 1024
Number of times to replicate neighbor lists: 1
Estimated total data volume (kB): 1062.9120
Estimated atom data volume (kB): 6.1440
Estimated neighborlist data volume (kB): 1050.6240
Initializing neighbor lists...
Creating neighbor lists...
Computing forces...
Total time: 0.2735, Mega atom updates/s: 0.1872
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
Statistics:
Vector width: 8, Processor frequency: 2.0000 GHz
Average neighbors per atom: 1018.9055
Average SIMD iterations per atom: 127.3632
Total number of computed pair interactions: 52428800
Total number of SIMD iterations: 6553600
Useful read data volume for force computation: 1.47GB
Cycles/SIMD iteration: 83.4598
--------------------------------------------------------------------------------
Region force, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 0.110776 |
| call count | 200 |
+-------------------+------------+
+------------------------------------------+---------+------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+------------+
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
| CAS_COUNT_RD | MBOX0C0 | 8643 |
| CAS_COUNT_WR | MBOX0C1 | 1367 |
| CAS_COUNT_RD | MBOX1C0 | 9124 |
| CAS_COUNT_WR | MBOX1C1 | 1354 |
| CAS_COUNT_RD | MBOX2C0 | 9138 |
| CAS_COUNT_WR | MBOX2C1 | 1356 |
| CAS_COUNT_RD | MBOX3C0 | 5586 |
| CAS_COUNT_WR | MBOX3C1 | 1297 |
| CAS_COUNT_RD | MBOX4C0 | 5328 |
| CAS_COUNT_WR | MBOX4C1 | 1269 |
| CAS_COUNT_RD | MBOX5C0 | 5280 |
| CAS_COUNT_WR | MBOX5C1 | 1295 |
+------------------------------------------+---------+------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 0.1108 |
| Runtime unhalted [s] | 0.0878 |
| Clock [MHz] | 1995.2564 |
| CPI | 0.8202 |
| Energy [J] | 10.9296 |
| Power [W] | 98.6643 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 14233.3287 |
| AVX DP [MFLOP/s] | 14231.8898 |
| Packed [MUOPS/s] | 1778.9862 |
| Scalar [MUOPS/s] | 1.4389 |
| Memory read bandwidth [MBytes/s] | 24.9001 |
| Memory read data volume [GBytes] | 0.0028 |
| Memory write bandwidth [MBytes/s] | 4.5861 |
| Memory write data volume [GBytes] | 0.0005 |
| Memory bandwidth [MBytes/s] | 29.4863 |
| Memory data volume [GBytes] | 0.0033 |
| Operational intensity | 482.7104 |
+-----------------------------------+------------+

View File

@ -0,0 +1,168 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Parameters:
Force field: lj
Kernel: plain-C
Data layout: AoS
Floating-point precision: double
Unit cells (nx, ny, nz): 32, 32, 32
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
Periodic (x, y, z): 1, 1, 1
Lattice size: 1.679596e+00
Epsilon: 1.000000e+00
Sigma: 1.000000e+00
Spring constant: 1.000000e+00
Damping constant: 1.000000e+00
Temperature: 1.440000e+00
RHO: 8.442000e-01
Mass: 1.000000e+00
Number of types: 4
Number of timesteps: 200
Report stats every (timesteps): 100
Reneighbor every (timesteps): 20
Prune every (timesteps): 1000
Output positions every (timesteps): 20
Output velocities every (timesteps): 5
Delta time (dt): 5.000000e-03
Cutoff radius: 2.500000e+00
Skin: 3.000000e-01
Half neighbor lists: 0
Processor frequency (GHz): 2.0000
----------------------------------------------------------------------------
step temp pressure
0 1.440000e+00 1.215639e+00
100 8.200895e-01 6.923143e-01
200 7.961495e-01 6.721043e-01
----------------------------------------------------------------------------
System: 131072 atoms 47265 ghost atoms, Steps: 200
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
----------------------------------------------------------------------------
Performance: 2.28 million atom updates per second
Statistics:
Vector width: 8, Processor frequency: 2.0000 GHz
Average neighbors per atom: 76.0352
Average SIMD iterations per atom: 9.9181
Total number of computed pair interactions: 2003182862
Total number of SIMD iterations: 261297661
Useful read data volume for force computation: 57.46GB
Cycles/SIMD iteration: 40.4432
--------------------------------------------------------------------------------
Region force, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.115807 |
| call count | 201 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.1158 |
| Runtime unhalted [s] | 4.0885 |
| Clock [MHz] | 1995.2508 |
| CPI | 0.8098 |
| Energy [J] | 307.9429 |
| Power [W] | 60.1944 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 12644.6041 |
| AVX DP [MFLOP/s] | 12629.1535 |
| Packed [MUOPS/s] | 1578.6442 |
| Scalar [MUOPS/s] | 15.4506 |
| Memory read bandwidth [MBytes/s] | 1713.4438 |
| Memory read data volume [GBytes] | 8.7656 |
| Memory write bandwidth [MBytes/s] | 86.5003 |
| Memory write data volume [GBytes] | 0.4425 |
| Memory bandwidth [MBytes/s] | 1799.9442 |
| Memory data volume [GBytes] | 9.2082 |
| Operational intensity | 7.0250 |
+-----------------------------------+------------+
Region reneighbour, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.897385 |
| call count | 10 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.8974 |
| Runtime unhalted [s] | 4.7026 |
| Clock [MHz] | 1995.2473 |
| CPI | 0.6440 |
| Energy [J] | 338.9000 |
| Power [W] | 57.4661 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 1059.4978 |
| AVX DP [MFLOP/s] | 1.3335 |
| Packed [MUOPS/s] | 0.1667 |
| Scalar [MUOPS/s] | 1058.1643 |
| Memory read bandwidth [MBytes/s] | 136.3006 |
| Memory read data volume [GBytes] | 0.8038 |
| Memory write bandwidth [MBytes/s] | 72.2612 |
| Memory write data volume [GBytes] | 0.4262 |
| Memory bandwidth [MBytes/s] | 208.5618 |
| Memory data volume [GBytes] | 1.2300 |
| Operational intensity | 5.0800 |
+-----------------------------------+------------+

Some files were not shown because too many files have changed in this diff Show More