150 Commits

Author SHA1 Message Date
Andropov Arsenii
055a009dbd Neighbor list preparation 2023-05-23 16:25:00 +02:00
Andropov Arsenii
182c065fe2 Neighbor list preparation 2023-05-09 00:44:37 +02:00
Andropov Arsenii
ee3f6de050 Building of super clusters complete, force computation kernel WIP 2023-04-11 02:55:30 +02:00
Rafael Ravedutti
b20e8c6986 Adjust script for GROMACS scheme
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-14 17:54:18 +01:00
Rafael Ravedutti
15d43dcce5 Explicitly set half_neigh to zero on stubbed versions
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-14 17:21:09 +01:00
Rafael Ravedutti
292138b270 Write debug_printf to avoid warnings and fix latency/cfd script
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-14 16:17:28 +01:00
Rafael Ravedutti
0e952964f7 Update script
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-13 15:47:17 +01:00
Rafael Ravedutti
cd37746d07 Add script to automate latency and CFD evaluation
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-13 15:35:41 +01:00
Rafael Ravedutti
d76d044a00 Update build options for each compiler and include ICX
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-13 01:06:59 +01:00
Rafael Ravedutti
a7cb888517 Fix compilation for ICX compiler
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-12-13 00:59:31 +01:00
Rafael Ravedutti
416f042fc0 Fix readline() when fgets returns NULL even on success
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-30 17:43:35 +01:00
Rafael Ravedutti
fa4e38c6c4 Add IACA and stubbed measurements for GROMACS 4x8 FN kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-18 01:00:20 +01:00
Rafael Ravedutti
04ea1b027e Print kernel and precision info in gromacs-stub
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-16 16:15:15 +01:00
Rafael Ravedutti
56d9613028 Implement stubbed version for GROMACS
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-15 16:01:13 +01:00
Rafael Ravedutti
bc06220aeb Remove AVX512 reciprocal usage in AVX2 file
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-15 01:40:37 +01:00
Rafael Ravedutti
efa462d0af Add AVX_FMA ISA
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-15 01:24:30 +01:00
Rafael Ravedutti
cd1fbfb3c8 Reorganize SIMD files and split AVX and AVX2
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-15 00:55:46 +01:00
Rafael Ravedutti
f293cec960 Call CPU version of updatePbc within setupPbc
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-14 19:19:57 +01:00
Rafael Ravedutti
6eedf1776e Small fixes into GROMACS GPU code
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-14 18:21:14 +01:00
Rafael Ravedutti
93188d1383 Adjust NVCC flags to avoid issues with atomicAdd with doubles
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-14 18:01:46 +01:00
Rafael Ravedutti
c70ebce4c1 Integrate GROMACS GPU implementation into master branch
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-08 18:33:23 +01:00
Rafael Ravedutti
493915fe95 Fix code for AVX and remove warnings
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-08 15:30:37 +01:00
Rafael Ravedutti
437b380229 Adjust NVCC flags
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-11-07 20:37:01 +01:00
Rafael Ravedutti
c4304e3619 Update figure widths again
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 18:41:40 +02:00
Rafael Ravedutti
b774e771ba Update width of figures in table
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 18:38:32 +02:00
Rafael Ravedutti
e86caa92b1 Fix Verlet Lists figure href
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 18:24:16 +02:00
Rafael Ravedutti
b201055658 Update table with HTML
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 18:23:29 +02:00
Rafael Ravedutti
8fce79dda6 Update README.md
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 17:56:37 +02:00
Rafael Ravedutti
1421a023a9 Remove gather-bench image
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 17:50:24 +02:00
Rafael Ravedutti
d3811c35c6 Update table with figures
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 17:48:39 +02:00
Rafael Ravedutti
239eea86b4 Update gather_bench figure
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 17:44:35 +02:00
Rafael Ravedutti
2ddb8a2934 Add gather-bench as submodule
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:55:11 +02:00
Rafael Ravedutti
1af19ad586 Update gather_bench image to PNG
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:47:45 +02:00
Rafael Ravedutti
b9fadd7fbf Update introduction text and add gather bench figure
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:46:24 +02:00
Rafael Ravedutti
12e7718a5f Insert stubbed case into table as well
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:38:08 +02:00
Rafael Ravedutti
4ddd84ef9d Insert table with figures
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:37:11 +02:00
Rafael Ravedutti
c0a54190d8 Update figures again
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:35:30 +02:00
Rafael Ravedutti
bc8f0e7c35 Update figs
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:28:18 +02:00
Rafael Ravedutti
9301610f7c Add more figures
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:23:34 +02:00
Rafael Ravedutti
70a2f48d64 Add links and figures to README
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 14:08:10 +02:00
Rafael Ravedutti
94abf8b362 Add new sections
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 12:52:54 +02:00
Rafael Ravedutti
da75f2cc36 Add usage section
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 12:39:09 +02:00
Rafael Ravedutti
880b82a86d Update README.md with config.mk options
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 12:21:29 +02:00
Rafael Ravedutti
35a8e3eeb7 Fix header of likwid-marker.h
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-09-29 11:48:05 +02:00
Jan Eitzinger
3c02a3fb7a Update README.md 2022-09-14 11:05:57 +02:00
Jan Eitzinger
3d0f4b97ee Switch copyright header in source files. 2022-09-05 10:39:42 +02:00
Rafael Ravedutti
28d3946072 Move common modules to common directory
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-17 17:56:31 +02:00
Rafael Ravedutti
47db9e86b0 Introduce common directory
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-17 17:20:57 +02:00
Rafael Ravedutti
418f392a11 Update .gitignore
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-16 19:33:38 +02:00
Rafael Ravedutti
29fa08fa7f Enhance output for gromacs variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-16 19:32:49 +02:00
Rafael Ravedutti
911ba63336 Adjust ISA options and improve output
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-16 18:36:47 +02:00
Rafael Ravedutti
0caeea0494 Rename cuda.c to device.c
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-12 18:17:07 +02:00
Rafael Ravedutti
90609a2b5f Adjust file structure for CUDA
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-12 18:12:29 +02:00
Rafael Ravedutti
939197a785 Create separate structs DeviceAtom and DeviceNeighbor with device pointers
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-12 17:28:06 +02:00
Rafael Ravedutti
065b596074 Initial refactoring of CUDA code
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-12 04:19:38 +02:00
Rafael Ravedutti
959ff65126 Fix macro condition
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-12 01:29:40 +02:00
Rafael Ravedutti
87d006d418 Fix GPU version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-11 16:42:41 +02:00
Rafael Ravedutti
3d95ec4b0a Small fixes
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-09 19:19:48 +02:00
Rafael Ravedutti
c18124b066 Integrate LAMMPS CUDA versions into master branch
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-08-09 18:53:53 +02:00
Rafael Ravedutti
eb77e1a3bd Fix DEM setup
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-19 04:13:06 +02:00
Rafael Ravedutti
2e77f6207b Avoid errors when compiling for AVX2 due to SIMD LJ implementation
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-19 02:30:26 +02:00
Rafael Ravedutti
577955dfb7 Apply first changes to DEM kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-13 02:34:33 +02:00
Rafael Ravedutti
99237241fb Include domain box on DEM input file
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-08 23:15:30 +02:00
Rafael Ravedutti
3b85da83a7 Update timestep size for dem
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-08 02:56:56 +02:00
Rafael Ravedutti
814f561993 Allow PBC in just some directions
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-08 02:30:03 +02:00
Rafael Ravedutti
32836eebcb Setup first DEM example with input file from lecture
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-07 02:11:50 +02:00
Rafael Ravedutti
9ffc09f497 Add DEM kernel to parameter options
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-07 00:47:38 +02:00
Rafael Ravedutti
79483a446e Adjust code with DEM to be compilable
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-06 01:07:39 +02:00
Rafael Ravedutti
bb599c9ea8 Add first version of DEM kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-07-05 15:33:31 +02:00
Rafael Ravedutti
e4d7faf91b Adjust cutforce and atom positions in stubbed version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-05-14 01:02:08 +02:00
Rafael Ravedutti
bbdcaf2983 New stubbed version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-05-14 00:55:33 +02:00
Rafael Ravedutti
14838389ff Fix stubbed variant for LAMMPS algorithm
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-04-30 04:08:18 +02:00
Rafael Ravedutti
ab2eb1ff50 Write LAMMPS kernel with SIMD intrinsics and implement AVX512 with double-precision functions
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-04-05 02:57:23 +02:00
Rafael Ravedutti
af1756bfe4 Fix skin for Argon simulation
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-04-04 22:22:35 +02:00
Rafael Ravedutti
4d11c5a3c2 Merge branch 'master' of github.com:RRZE-HPC/MD-Bench 2022-04-04 21:52:47 +02:00
Rafael Ravedutti
e48b3fb653 Add option to check if cj is local before applying reaction force
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-04-04 21:52:40 +02:00
Jan Eitzinger
7a0d6479a1 Merge branch 'master' of https://github.com/RRZE-HPC/MD-Bench 2022-04-01 15:58:05 +02:00
Jan Eitzinger
5585ebcf42 Add ONEAPI config. Remove omp simd for full neigh. 2022-04-01 15:57:54 +02:00
Rafael Ravedutti
fdbeed4368 Fix AVX2 versions with half neighbor lists
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-27 16:39:39 +02:00
Rafael Ravedutti
0e742766b7 Add working version of Simd4xn kernel with half neighbor lists
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-23 15:54:18 +01:00
Rafael Ravedutti
e72323ab6a Fix Simd2xnn Kernel with half neighbor lists and add AVX512 intrinsics with double
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-23 15:21:07 +01:00
Rafael Ravedutti
94521f03b3 Fix reference version with half neighbor lists
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-23 14:31:47 +01:00
Rafael Ravedutti
8709bc2a06 Add first version for half neighbor lists in GROMACS variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-22 23:47:05 +01:00
Rafael Ravedutti
2a555a7deb Add simd reduction pragma to vectorize innermost loop on half-neighbor variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-21 17:02:09 +01:00
Rafael Ravedutti
719330807b Change data layout for force arrays according to position
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-18 01:40:51 +01:00
Rafael Ravedutti
e7737e9151 Refactor half neighbor lists code
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-18 01:28:11 +01:00
Rafael Ravedutti
5df544637f Fix force calculation time in LAMMPS variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-17 02:53:58 +01:00
Rafael Ravedutti
887f41871c Add parameter reading for LAMMPS variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-17 02:44:34 +01:00
Rafael Ravedutti
d4b34e1fa4 Fix intrinsics for AVX2
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-17 00:35:21 +01:00
Rafael Ravedutti
4090f43095 Optimize partial forces reduction for compute_4xn kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-16 17:54:52 +01:00
Rafael Ravedutti
f3263a2d48 Separate simd file into multiple files
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-16 14:52:55 +01:00
rafaelravedutti
459853dc25 Merge pull request #4 from RRZE-HPC/gromacs_sp
Gromacs sp
2022-03-15 20:31:42 +01:00
Rafael Ravedutti
d47173d7a2 Fix Simd2xNN kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-15 19:59:10 +01:00
Rafael Ravedutti
d61576699d Add first compilable version of Gromacs with SP
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-15 02:40:56 +01:00
Rafael Ravedutti
8669f2f6d7 Fix LJ Simd4xN kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-11 01:12:59 +01:00
Rafael Ravedutti
d79c3c2a1d Add first working version with 4x8 config (ref kernel)
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-10 22:33:41 +01:00
Jan Eitzinger
c2fcd50773 Initial version of lammps halfneighbor list 2022-03-10 17:06:45 +01:00
Jan Eitzinger
ba3a0524f6 Merge branch 'master' of github.com:RRZE-HPC/MD-Bench 2022-03-10 16:30:40 +01:00
Jan Eitzinger
6203cb12b6 Start to introduce halfneigh version 2022-03-10 16:30:37 +01:00
Rafael Ravedutti
22d0f0b958 Commit version that works for M=N
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-10 01:31:50 +01:00
Rafael Ravedutti
2b441e691e Make code compilable
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-09 17:23:49 +01:00
Rafael Ravedutti
c7360305c8 Add first draft version of GROMACS method separating i-clusters and j-clusters
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-09 02:25:39 +01:00
Rafael Ravedutti
cecb31d6a9 Update params for argon_1000 test case
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-07 14:49:38 +01:00
Rafael Ravedutti
ba6785a865 Allow parameter reading from files and update data
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-05 03:21:52 +01:00
Rafael Ravedutti
aae29a5b5a Add code to read GRO files
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-03 20:03:33 +01:00
Rafael Ravedutti
af92800c64 Add SIMD version with AVX (no AVX2) and XTC output
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-03-02 23:12:04 +01:00
Rafael Ravedutti
022aa75c75 Add cutoff radius and skin as parameters of simulation
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-28 22:34:42 +01:00
Rafael Ravedutti
1389f89fb7 Add prunning kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-28 17:20:39 +01:00
Rafael Ravedutti
c62e4ea4ad Add clusters efficiency on stats
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-28 16:10:09 +01:00
Rafael Ravedutti
ed2929c813 Add percentage of atoms within cutoff radius when using LAMMPS reference version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-25 14:40:33 +01:00
Rafael Ravedutti
e637a26844 Add percentage of atoms within cutoff radius when using GROMACS reference version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-25 14:19:48 +01:00
Rafael Ravedutti
fdd18df816 Fix argon simulation
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-24 16:42:58 +01:00
Rafael Ravedutti
1a708f2d3b Add PDB reading functions to lammps variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-24 15:17:51 +01:00
Rafael Ravedutti
d0ec9520f2 Write function to read PDB files and include data for Argon simulation
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-24 02:36:17 +01:00
Rafael Ravedutti
ca7775a62a Add average atoms per cluster on stats
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-09 17:50:54 +01:00
Rafael Ravedutti
769bab0faa Separate local and ghost cluster edges on VTK output
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-08 16:12:22 +01:00
Rafael Ravedutti
6a35a7a482 Update stats for cluster version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-08 00:55:27 +01:00
Rafael Ravedutti
8deee3d954 Add cluster edges in VTK output
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-08 00:11:10 +01:00
Rafael Ravedutti
cd15911a97 When building neighbor lists, skip first iterations until z is in range
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-07 18:28:53 +01:00
Rafael Ravedutti
0eacb2453e Inline getBoundingBoxDistanceSq and avoid redundant loads from bbminz and bbmaxz
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-07 18:00:21 +01:00
Rafael Ravedutti
cdb1d5b9f1 Add version with AVX2 intrinsics for gromacs scheme
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-04 17:52:48 +01:00
Rafael Ravedutti
34ce407f18 Update stats for gromacs scheme
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-04 14:47:37 +01:00
Rafael Ravedutti
6e6a3f6502 Use aligned loads when gathering j atoms
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-04 14:29:32 +01:00
Rafael Ravedutti
7b90800a2b Setting forces to zero before calculation is not required
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-04 14:05:04 +01:00
Rafael Ravedutti
9daf9e5f4d Fix exclusion masks and add SIMD debug tools
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-02 21:54:18 +01:00
Rafael Ravedutti
4c5f013bf4 Assign masked adds results to forces
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-02 18:07:56 +01:00
Rafael Ravedutti
6ad1e58a3e Add first kernel using SIMD instrinsics for 4xn cases
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-02 18:00:44 +01:00
Rafael Ravedutti
5fd2d422ee Adjust kernels to work with MxN loops
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-02 00:49:55 +01:00
Rafael Ravedutti
85e7954932 Check all clusters in cell when building neighbor lists because ghost clusters may not be sorted
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-01 20:16:04 +01:00
Rafael Ravedutti
4a5216a177 Remove bb z-check on while loop when building neighbor lists
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-02-01 00:46:12 +01:00
Rafael Ravedutti
e64c3345bc Fix a few more bugs on gromacs variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-31 23:46:20 +01:00
Rafael Ravedutti
e0e6b6a68c Perform a few fixes for gromacs variant
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-31 17:49:22 +01:00
Rafael Ravedutti
6691803910 Add first version of force calculation with cluster scheme
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-28 18:07:41 +01:00
Rafael Ravedutti
eedcc97e4a Remove segfaults
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-28 15:18:54 +01:00
Rafael Ravedutti
a119fcdfdd Fix some segfaults and add function to update single atoms
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-27 03:07:31 +01:00
Rafael Ravedutti
aa0f4048d0 Rename default directory to lammps and reorganize gromacs variant steps
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-25 21:00:11 +01:00
Rafael Ravedutti
cbe42b8149 Fix errors to make gromacs approach compilable so far
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-25 12:19:28 +01:00
Rafael Ravedutti
6291709ae7 Add first draft code with GROMACS approach
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-25 00:43:10 +01:00
Rafael Ravedutti
72730bc27b Update Makefile and config.mk
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-17 14:16:39 +01:00
Rafael Ravedutti
df09c2861e Add first version with more than one optimization scheme
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-17 14:15:02 +01:00
Rafael Ravedutti
489e7ee9d3 Update .gitignore
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-17 11:46:57 +01:00
Rafael Ravedutti
165335cea0 Update compilation flags for all available compilers
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2022-01-17 11:40:44 +01:00
Rafael Ravedutti
35c110155e Separate tracing from force computation and fix stubbed version
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-12-01 00:07:45 +01:00
Rafael Ravedutti
bb21a885a1 Add new setups for Copper melting with LJ and EAM
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-30 01:33:55 +01:00
Rafael Ravedutti
f7010113bf Include commented timestamping on asm
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-10 14:39:44 +01:00
Rafael Ravedutti
841dfb9490 Fix data types for rdr and rdrho
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-09 20:36:23 +01:00
Rafael Ravedutti
f8b9a095cf Add working version of force.s
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-09 01:23:15 +01:00
Rafael Ravedutti
b2dada6179 Fix param references and assembler errors for force.s
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-09 00:05:42 +01:00
Rafael Ravedutti
d8c8733cb2 Update asm force again
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-08 23:32:24 +01:00
Rafael Ravedutti
cf7ea1460c Update asm version to be integrated
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
2021-11-08 22:31:58 +01:00
127 changed files with 111213 additions and 2194 deletions

6
.gitignore vendored
View File

@@ -54,5 +54,11 @@ dkms.conf
# Build directories and executables
GCC/
ICC/
ICX/
CLANG/
NVCC/
MDBench-GCC*
MDBench-ICC*
MDBench-ICX*
MDBench-CLANG*
MDBench-NVCC*

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "gather-bench"]
path = gather-bench
url = https://github.com/RRZE-HPC/gather-bench

View File

@@ -1,8 +1,10 @@
#CONFIGURE BUILD SYSTEM
TARGET = MDBench-$(TAG)
BUILD_DIR = ./$(TAG)
SRC_DIR = ./src
TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
SRC_DIR = ./$(OPT_SCHEME)
ASM_DIR = ./asm
COMMON_DIR = ./common
CUDA_DIR = ./$(SRC_DIR)/cuda
MAKE_DIR = ./
Q ?= @
@@ -10,7 +12,9 @@ Q ?= @
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
INCLUDES += -I./src/includes
include $(MAKE_DIR)/include_ISA.mk
include $(MAKE_DIR)/include_GROMACS.mk
INCLUDES += -I./$(SRC_DIR)/includes -I./$(COMMON_DIR)/includes
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
@@ -25,14 +29,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif
ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -49,15 +45,71 @@ ifeq ($(strip $(COMPUTE_STATS)),true)
DEFINES += -DCOMPUTE_STATS
endif
ifeq ($(strip $(XTC_OUTPUT)),true)
DEFINES += -DXTC_OUTPUT
endif
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
DEFINES += -DUSE_REFERENCE_VERSION
endif
ifeq ($(strip $(HALF_NEIGHBOR_LISTS_CHECK_CJ)),true)
DEFINES += -DHALF_NEIGHBOR_LISTS_CHECK_CJ
endif
ifeq ($(strip $(DEBUG)),true)
DEFINES += -DDEBUG
endif
ifneq ($(VECTOR_WIDTH),)
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
endif
VPATH = $(SRC_DIR) $(ASM_DIR)
ifeq ($(strip $(__SIMD_KERNEL__)),true)
DEFINES += -D__SIMD_KERNEL__
endif
ifeq ($(strip $(__SSE__)),true)
DEFINES += -D__ISA_SSE__
endif
ifeq ($(strip $(__ISA_AVX__)),true)
DEFINES += -D__ISA_AVX__
endif
ifeq ($(strip $(__ISA_AVX_FMA__)),true)
DEFINES += -D__ISA_AVX_FMA__
endif
ifeq ($(strip $(__ISA_AVX2__)),true)
DEFINES += -D__ISA_AVX2__
endif
ifeq ($(strip $(__ISA_AVX512__)),true)
DEFINES += -D__ISA_AVX512__
endif
ifeq ($(strip $(ENABLE_OMP_SIMD)),true)
DEFINES += -DENABLE_OMP_SIMD
endif
ifeq ($(strip $(USE_SIMD_KERNEL)),true)
DEFINES += -DUSE_SIMD_KERNEL
endif
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
DEFINES += -DUSE_SUPER_CLUSTERS
endif
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
OBJ += $(patsubst $(COMMON_DIR)/%.c, $(BUILD_DIR)/%-common.o,$(wildcard $(COMMON_DIR)/*.c))
ifeq ($(strip $(TAG)),NVCC)
OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
endif
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
# $(warning $(OBJ))
@@ -80,6 +132,16 @@ $(BUILD_DIR)/%.o: %.c
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%-common.o: $(COMMON_DIR)/%.c
$(info ===> COMPILE $@)
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%-cuda.o: %.cu
$(info ===> COMPILE $@)
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
$(BUILD_DIR)/%.s: %.c
$(info ===> GENERATE ASM $@)
$(Q)$(CC) -S $(ASFLAGS) $(CPPFLAGS) $(CFLAGS) $< -o $@

102
README.md
View File

@@ -1,27 +1,103 @@
# MD-Bench
A simple, sequential C implementation of the [Mantevo miniMD](https://github.com/Mantevo/miniMD) benchmark in less than 1000 LOC.
![Image](figures/features-v3.png "MD-Bench Features")
## Build
MD-Bench is a toolbox for the performance engineering of short-range force calculation kernels on molecular-dynamics applications.
It aims at covering all available state-of-the-art algorithms from different community codes such as LAMMPS and GROMACS.
1. Open `config.mk` and edit the `TAG` value according to the tool chain used. Currently supported is GCC, CLANG (LLVM), and ICC (Intel).
2. Change `DATA_LAYOUT` and `DATA_TYPE` if desired in config.mk.
3. Open and adapt the compiler flags in `<include_<TOOLCHAIN>.mk`, e.g. in `include_ICC.mk` for the Intel tool chain.
4. Build the binary calling `make`.
Apart from that, many tools to study and evaluate the in-depth performance of such kernels on distinct hardware are offered, like gather-bench, a standalone benchmark that mimics the data movement from MD kernels and the stubbed force calculation cases that focus on isolating the impacts caused by memory latencies and control flow divergence contributions in the overall performance.
<table>
<thead>
<tr>
<th>Verlet Lists</th>
<th>GROMACS MxN</th>
<th>Stubbed cases</th>
</tr>
</thead>
<tbody>
<tr>
<td><a target="_blank" rel="noopener noreferrer" href="figures/verlet_v2.png"><img src="figures/verlet_v2.png" alt="Image" title="Verlet Lists" style="width: 100%;"></a></td>
<td><a target="_blank" rel="noopener noreferrer" href="figures/gromacs_mxn_v2.png"><img src="figures/gromacs_mxn_v2.png" alt="Image" title="GROMACS MxN" style="width: 90%;"></a></td>
<td><a target="_blank" rel="noopener noreferrer" href="figures/stub_new_v3.png"><img src="figures/stub_new_v3.png" alt="Image" title="Stubbed cases" style="width: 100%;"></a></td>
</tr>
</tbody>
</table>
<!-- ![Image](figures/gather_bench.png "gather-bench") -->
## Build instructions
Properly configure your building by changing `config.mk` file. The following options are available:
- **TAG:** Compiler tag (available options: GCC, CLANG, ICC, ONEAPI, NVCC).
- **ISA:** Instruction set (available options: SSE, AVX, AVX\_FMA, AVX2, AVX512).
- **MASK\_REGISTERS:** Use AVX512 mask registers (always true when ISA is set to AVX512).
- **OPT\_SCHEME:** Optimization algorithm (available options: lammps, gromacs).
- **ENABLE\_LIKWID:** Enable likwid to make use of HPM counters.
- **DATA\_TYPE:** Floating-point precision (available options: SP, DP).
- **DATA\_LAYOUT:** Data layout for atom vector properties (available options: AOS, SOA).
- **ASM\_SYNTAX:** Assembly syntax to use when generating assembly files (available options: ATT, INTEL).
- **DEBUG:** Toggle debug mode.
- **EXPLICIT\_TYPES:** Explicitly store and load atom types.
- **MEM\_TRACER:** Trace memory addresses for cache simulator.
- **INDEX\_TRACER:** Trace indexes and distances for gather-md.
- **COMPUTE\_STATS:** Compute statistics.
Configurations for LAMMPS Verlet Lists optimization scheme:
- **ENABLE\_OMP\_SIMD:** Use omp simd pragma on half neighbor-lists kernels.
- **USE\_SIMD\_KERNEL:** Compile kernel with explicit SIMD intrinsics.
Configurations for GROMACS MxN optimization scheme:
- **USE\_REFERENCE\_VERSION:** Use reference version (only for correction purposes).
- **XTC\_OUTPUT:** Enable XTC output.
- **HALF\_NEIGHBOR\_LISTS\_CHECK\_CJ:** Check if j-clusters are local when decreasing the reaction force.
Configurations for CUDA:
- **USE\_CUDA\_HOST\_MEMORY:** Use CUDA host memory to optimize host-device transfers.
When done, just use `make` to compile the code.
You can clean intermediate build results with `make clean`, and all build results with `make distclean`.
You have to call `make clean` before `make` if you changed the build settings.
## Configuration
## Usage
Currently all settings apart from the options described below are hard-coded in `main.c`.
Use the following command to run a simulation:
## Run the benchmark
Without any options 200 steps with system size 32x32x32 is used.
The default can be changed using the following options:
```bash
./MD-Bench-<TAG>-<OPT_SCHEME> [OPTION]...
```
Where `TAG` and `OPT_SCHEME` correspond to the building options with the same name.
Without any options, a Copper FCC lattice system with size 32x32x32 (131072 atoms) over 200 time-steps using the Lennard-Jones potential (sigma=1.0, epsilon=1.0) is simulated.
The default behavior and other options can be changed using the following parameters:
```
-p <string>: file to read parameters from (can be specified more than once)
-f <string>: force field (lj or eam), default lj
-i <string>: input file with atom positions (dump)
-e <string>: input file for EAM
-n / --nsteps <int>: set number of timesteps for simulation
-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction
-r / --radius <real>: set cutoff radius
-s / --skin <real>: set skin (verlet buffer)
--freq <real>: processor frequency (GHz)
--vtk <string>: VTK file for visualization
--xtc <string>: XTC file for visualization
```
## Examples
TBD
## Citations
Rafael Ravedutti Lucio Machado, Jan Eitzinger, Harald Köstler, and Gerhard Wellein: MD-Bench: A generic proxy-app toolbox for state-of-the-art molecular dynamics algorithms. Accepted for [PPAM](https://ppam.edu.pl/) 2022, the 14th International Conference on Parallel Processing and Applied Mathematics, Gdansk, Poland, September 11-14, 2022. PPAM 2022 Best Paper Award. Preprint: [arXiv:2207.13094](https://arxiv.org/abs/2207.13094)
## Credits
MD-Bench is developed by the Erlangen National High Performance Computing Center ([NHR@FAU](https://hpc.fau.de/)) at the University of Erlangen-Nürnberg.
## License
[LGPL-3.0](https://github.com/RRZE-HPC/MD-Bench/blob/master/LICENSE)

View File

@@ -7,30 +7,33 @@ computeForce:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [72+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [8+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [rdi] # xmm0 <- param->epsilon
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..exit_func
..B1.2:
jle ..atom_loop_exit
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..B1.6 # ecx == 0
je ..zero_last_element # ecx == 0
# Init forces to zero loop
..B1.4:
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
@@ -40,46 +43,47 @@ computeForce:
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..B1.4
jb ..init_force_loop
..B1.5:
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..B1.6:
lea ecx, DWORD PTR [-1+r11] # r11d <- i * 2
cmp ecx, r9d # i < Nlocal
jae ..B1.8
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
..B1.7:
# Set last element to zero
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
..B1.8:
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
xor r8d, r8d # r8d <- 0
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm16 <- [48 * epsilon, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
@@ -87,13 +91,12 @@ computeForce:
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
# Loop over all atoms
..B1.9:
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0
vmovapd xmm20, xmm25 # xmm20 <- 0
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
@@ -104,76 +107,28 @@ computeForce:
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..exit_func
jle ..atom_loop_exit
..B1.10:
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
xor r11d, r11d # r11d <- 0
and r14d, -8 # r14d <- numneighs & (-8)
lea r9d, DWORD PTR [8+r11] # r9d <- 8 (why lea?)
cmp r14d, r9d # r14d < r9d
jl ..B1.33
cmp r14d, 8
jl ..compute_forces_remainder
# cmp r13d, 8 # numneighs < 8
# jl ..B1.32
#..B1.11:
# cmp r13d, 1200 # numneighs < 1200
# jl ..B1.31
#..B1.12:
# mov rcx, r12
# imul rcx, r8
# add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i (r8)]
# mov r9, rcx # r9 <- neighs
# and r9, 63 # r9 <- neighs & 63
# test r9d, 3 # (r9d & 3) == 0 => r9d divisible by 8
# je ..B1.14
#..B1.13:
# xor r9d, r9d # r9d <- 0
# jmp ..B1.16
#..B1.14:
# test r9d, r9d # r9d == 0
# je ..B1.16
#..B1.15:
# neg r9d
# add r9d, 64
# shr r9d, 2 # r9d <- (64 - r9d) / 4
# cmp r13d, r9d # numneighs < r9d
# cmovl r9d, r13d # r9d <- MIN(numneighs, r9d)
#..B1.16:
# mov ebx, r13d
# sub ebx, r9d
# and ebx, 7
# neg ebx
# add ebx, r13d # ebx <- -((numneighs - r9d) & 7) + numneighs
# cmp r9d, 1 # r9d < 1
# jb ..B1.20
#..B1.20:
# lea ecx, DWORD PTR [8+r9] # ecx <- r9d[1]
# cmp ebx, ecx # -((numneighs - r9d) & 7) + numneighs < neighs
# jl ..B1.24
..B1.21:
mov rcx, r12
imul rcx, r8
vbroadcastsd zmm0, xmm8
vbroadcastsd zmm1, xmm9
vbroadcastsd zmm2, xmm10
movsxd r14, r9d
add rcx, r11
..B1.22:
vpcmpeqb k2, xmm0, xmm0
add r9d, 8
..compute_forces:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r14*4]
add r14, 8
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
@@ -181,26 +136,26 @@ computeForce:
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8]
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8]
#vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8]
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5
vsubpd zmm28, zmm0, zmm4
vsubpd zmm31, zmm2, zmm6
vmulpd zmm20, zmm29, zmm29
vfmadd231pd zmm20, zmm28, zmm28
vfmadd231pd zmm20, zmm31, zmm31
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# if condition cutoff radius
vrcp14pd zmm27, zmm20 #-> sr2
vcmppd k5, zmm20, zmm16, 1
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
@@ -208,14 +163,69 @@ computeForce:
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28
vfmadd231pd zmm12{k5}, zmm30, zmm29
vfmadd231pd zmm11{k5}, zmm30, zmm31
cmp r9d, ebx
jb ..B1.22
#end neighbor loop
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
..B1.26:
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d
vpcmpgtd k1, ymm4, ymm17
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
@@ -236,8 +246,7 @@ computeForce:
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
#exit function
..exit_func:
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
@@ -245,8 +254,6 @@ computeForce:
add rax, 24
###
movsxd r8, r10d #55.32
inc r8 #55.32
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
@@ -255,12 +262,17 @@ computeForce:
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..B1.9
jb ..atom_loop_begin
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
pop rbp #93.12
ret #93.12
.type computeForce,@function

326
asm/unused/force_lj.s Normal file
View File

@@ -0,0 +1,326 @@
.intel_syntax noprefix
.text
.align 16,0x90
.globl computeForceLJ
computeForceLJ:
# parameter 1: rdi Parameter*
# parameter 2: rsi Atom*
# parameter 3: rdx Neighbor*
push rbp
push r12
push r13
push r14
push r15
push rbx
mov r9d, DWORD PTR [4+rsi] # r9d <- atom->Nlocal
vmovsd xmm2, QWORD PTR [96+rdi] # xmm2 <- param->cutforce
vmovsd xmm1, QWORD PTR [32+rdi] # xmm1 <- param->sigma6
vmovsd xmm0, QWORD PTR [24+rdi] # xmm0 <- param->epsilon
mov r13, QWORD PTR [64+rsi] # r13 <- atom->fx
mov r14, QWORD PTR [72+rsi] # r14 <- atom->fy
mov rdi, QWORD PTR [80+rsi] # rdi <- atom->fz
test r9d, r9d # atom->Nlocal <= 0
jle ..atom_loop_exit
xor r10d, r10d # r10d <- 0
mov ecx, r9d # ecx <- atom->Nlocal
xor r8d, r8d # r8d <- 0
mov r11d, 1 # r11d <- 1
xor eax, eax # eax <- 0
shr ecx, 1 # ecx <- atom->Nlocal >> 1
je ..zero_last_element # ecx == 0
# Init forces to zero loop (unroll factor = 2)
..init_force_loop:
mov QWORD PTR [r8+r13], rax # fx[i] <- 0
mov QWORD PTR [r8+r14], rax # fy[i] <- 0
mov QWORD PTR [r8+rdi], rax # fz[i] <- 0
mov QWORD PTR [8+r8+r13], rax # fx[i] <- 0
mov QWORD PTR [8+r8+r14], rax # fy[i] <- 0
mov QWORD PTR [8+r8+rdi], rax # fz[i] <- 0
add r8, 16 # i++
inc r10 # i++
cmp r10, rcx # i < Nlocal
jb ..init_force_loop
# Trick to make r11d contain value of last element to be zeroed plus 1
# Maybe we can directly put r10+10 here and zero r11d above, then remove the -1 below
lea r11d, DWORD PTR [1+r10+r10] # r11d <- i * 2 + 1
..zero_last_element:
lea ecx, DWORD PTR [-1+r11] # ecx <- i * 2
cmp ecx, r9d # i >= Nlocal
jae ..before_atom_loop
# Set last element to zero
movsxd r11, r11d # r11 <- i * 2
mov QWORD PTR [-8+r13+r11*8], rax # fx[i] <- 0
mov QWORD PTR [-8+r14+r11*8], rax # fy[i] <- 0
mov QWORD PTR [-8+rdi+r11*8], rax # fz[i] <- 0
# Initialize registers to be used within atom loop
..before_atom_loop:
vmulsd xmm15, xmm2, xmm2 # xmm15 <- cutforcesq
vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] # ymm18 <- [8, ...]
vmulsd xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] # xmm0 <- 48 * epsilon
vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] # ymm17 <- [0..7]
vmovups zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] # zmm7 <- [0.5, ...]
vbroadcastsd zmm16, xmm15 # zmm16 <- [cutforcesq, ...]
vbroadcastsd zmm15, xmm1 # zmm15 <- [param->sigma6, ...]
vbroadcastsd zmm14, xmm0 # zmm14 <- [48 * epsilon, ...]
movsxd r9, r9d # r9 <- atom->Nlocal
xor r10d, r10d # r10d <- 0 (i)
mov rcx, QWORD PTR [24+rdx] # rcx <- neighbor->numneigh
mov r11, QWORD PTR [8+rdx] # r11 <- neighbor->neighbors
movsxd r12, DWORD PTR [16+rdx] # r12 <- neighbor->maxneighs
mov rdx, QWORD PTR [16+rsi] # rdx <- atom->x
### AOS
xor eax, eax
### SOA
#mov rax, QWORD PTR [24+rsi] # rax <- atom->y
#mov rsi, QWORD PTR [32+rsi] # rsi <- atom->z
###
shl r12, 2 # r12 <- neighbor->maxneighs * 4
# Register spilling
mov QWORD PTR [-32+rsp], r9 # [-32+rsp] <- atom->Nlocal
mov QWORD PTR [-24+rsp], rcx # [-24+rsp] <- neighbor->numneigh
mov QWORD PTR [-16+rsp], r14 # [-16+rsp] <- atom->fy
mov QWORD PTR [-8+rsp], r13 # [-8+rsp] <- atom->fx
mov QWORD PTR [-40+rsp], r15 # [-40+rsp] <- r15
mov QWORD PTR [-48+rsp], rbx # [-48+rsp] <- rbx
#sub rsp, 64
#call getTimeStamp # xmm0 <- getTimeStamp()
#vmovsd QWORD PTR [-56+rsp], xmm0 # [-56+rsp] <- xmm0 [spill]
#add rsp, 64
..atom_loop_begin:
mov rcx, QWORD PTR [-24+rsp] # rcx <- neighbor->numneigh
vxorpd xmm25, xmm25, xmm25 # xmm25 <- 0 (fix)
vmovapd xmm20, xmm25 # xmm20 <- 0 (fiy)
mov r13d, DWORD PTR [rcx+r10*4] # r13d <- neighbor->numneigh[i] (numneighs)
vmovapd xmm4, xmm20 # xmm4 <- 0 (fiz)
### AOS
vmovsd xmm8, QWORD PTR[rdx+rax] # xmm8 <- atom->x[i * 3]
vmovsd xmm9, QWORD PTR[8+rdx+rax] # xmm9 <- atom->x[i * 3 + 1]
vmovsd xmm10, QWORD PTR[16+rdx+rax] # xmm10 <- atom->x[i * 3 + 2]
### SOA
#vmovsd xmm8, QWORD PTR [rdx+r10*8] # xmm8 <- atom->x[i]
#vmovsd xmm9, QWORD PTR [rax+r10*8] # xmm9 <- atom->y[i]
#vmovsd xmm10, QWORD PTR [rsi+r10*8] # xmm10 <- atom->z[i]
###
vbroadcastsd zmm0, xmm8 # zmm0 <- atom_x(i)
vbroadcastsd zmm1, xmm9 # zmm1 <- atom_y(i)
vbroadcastsd zmm2, xmm10 # zmm2 <- atom_z(i)
test r13d, r13d # numneighs <= 0
jle ..atom_loop_exit
vpxord zmm13, zmm13, zmm13 # zmm13 <- 0 (fix)
vmovaps zmm12, zmm13 # zmm12 <- 0 (fiy)
vmovaps zmm11, zmm12 # zmm11 <- 0 (fiz)
mov rcx, r12 # rcx <- neighbor->maxneighs * 4
imul rcx, r10 # rcx <- neighbor->maxneighs * 4 * i
add rcx, r11 # rcx <- &neighbor->neighbors[neighbor->maxneighs * i]
xor r9d, r9d # r9d <- 0 (k)
mov r14d, r13d # r14d <- numneighs
cmp r14d, 8
jl ..compute_forces_remainder
..compute_forces:
vpcmpeqb k1, xmm0, xmm0
vpcmpeqb k2, xmm0, xmm0
vpcmpeqb k3, xmm0, xmm0
vmovdqu ymm3, YMMWORD PTR [rcx+r9*4]
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k5}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k5}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k5}, zmm30, zmm31 # fiz += force * delz
sub r14d, 8
add r9, 8
cmp r14d, 8
jge ..compute_forces
# Check if there are remaining neighbors to be computed
..compute_forces_remainder:
test r14d, r14d
jle ..sum_up_forces
vpbroadcastd ymm4, r14d
vpcmpgtd k1, ymm4, ymm17
kmovw r15d, k1
vmovdqu32 ymm3{k1}{z}, YMMWORD PTR [rcx+r9*4]
kmovw k2, k1
kmovw k3, k1
vpxord zmm5, zmm5, zmm5
vpxord zmm6, zmm6, zmm6
### AOS
vpaddd ymm4, ymm3, ymm3
vpaddd ymm3, ymm3, ymm4
vpxord zmm4, zmm4, zmm4
vgatherdpd zmm4{k1}, [rdx+ymm3*8]
vgatherdpd zmm5{k2}, [8+rdx+ymm3*8]
vgatherdpd zmm6{k3}, [16+rdx+ymm3*8]
#### SOA
#vpxord zmm4, zmm4, zmm4
#vgatherdpd zmm5{k2}, [rax+ymm3*8]
#vgatherdpd zmm4{k1}, [rdx+ymm3*8]
#vgatherdpd zmm6{k3}, [rsi+ymm3*8]
###
vsubpd zmm29, zmm1, zmm5 # zmm29 <- atom_y(i) - atom_y(j) -- dely
vsubpd zmm28, zmm0, zmm4 # zmm28 <- atom_x(i) - atom_x(j) -- delx
vsubpd zmm31, zmm2, zmm6 # zmm31 <- atom_z(i) - atom_z(j) -- delz
vmulpd zmm20, zmm29, zmm29 # zmm20 <- dely * dely
vfmadd231pd zmm20, zmm28, zmm28 # zmm20 <- dely * dely + delx * delx
vfmadd231pd zmm20, zmm31, zmm31 # zmm20 <- zmm20 + delz * delz -- rsq
# Cutoff radius condition
vrcp14pd zmm27, zmm20 # zmm27 <- 1.0 / rsq (sr2)
vcmppd k5, zmm20, zmm16, 1 # k5 <- rsq < cutforcesq
kmovw r9d, k5 # r9d <- rsq < cutforcesq
and r15d, r9d # r15d <- rsq < cutforcesq && k < numneighs
kmovw k3, r15d # k3 <- rsq < cutforcesq && k < numneighs
vmulpd zmm22, zmm27, zmm15 # zmm22 <- sr2 * sigma6
vmulpd zmm24, zmm27, zmm14 # zmm24 <- 48.0 * epsilon * sr2
vmulpd zmm25, zmm27, zmm22 # zmm25 <- sr2 * sigma6 * sr2
vmulpd zmm23, zmm27, zmm25 # zmm23 <- sr2 * sigma6 * sr2 * sr2
vfmsub213pd zmm27, zmm25, zmm7 # zmm27 <- sr2 * sigma * sr2 * sr2 - 0.5
vmulpd zmm26, zmm23, zmm24 # zmm26 <- 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2
vmulpd zmm30, zmm26, zmm27 # zmm30 <- force
vfmadd231pd zmm13{k3}, zmm30, zmm28 # fix += force * delx
vfmadd231pd zmm12{k3}, zmm30, zmm29 # fiy += force * dely
vfmadd231pd zmm11{k3}, zmm30, zmm31 # fiz += force * delz
# Forces are currently separated in different lanes of zmm registers, hence it is necessary to permutate
# and add them (reduction) to obtain the final contribution for the current atom
..sum_up_forces:
vmovups zmm10, ZMMWORD PTR .L_2il0floatpacket.6[rip]
vpermd zmm0, zmm10, zmm11
vpermd zmm5, zmm10, zmm12
vpermd zmm21, zmm10, zmm13
vaddpd zmm11, zmm0, zmm11
vaddpd zmm12, zmm5, zmm12
vaddpd zmm13, zmm21, zmm13
vpermpd zmm1, zmm11, 78
vpermpd zmm6, zmm12, 78
vpermpd zmm22, zmm13, 78
vaddpd zmm2, zmm11, zmm1
vaddpd zmm8, zmm12, zmm6
vaddpd zmm23, zmm13, zmm22
vpermpd zmm3, zmm2, 177
vpermpd zmm9, zmm8, 177
vpermpd zmm24, zmm23, 177
vaddpd zmm4, zmm2, zmm3
vaddpd zmm20, zmm8, zmm9
vaddpd zmm25, zmm23, zmm24
..atom_loop_exit:
mov rcx, QWORD PTR [-8+rsp] #84.9[spill]
mov rbx, QWORD PTR [-16+rsp] #85.9[spill]
### AOS
add rax, 24
###
vaddsd xmm0, xmm25, QWORD PTR [rcx+r10*8] #84.9
vmovsd QWORD PTR [rcx+r10*8], xmm0 #84.9
vaddsd xmm1, xmm20, QWORD PTR [rbx+r10*8] #85.9
vmovsd QWORD PTR [rbx+r10*8], xmm1 #85.9
vaddsd xmm2, xmm4, QWORD PTR [rdi+r10*8] #86.9
vmovsd QWORD PTR [rdi+r10*8], xmm2 #86.9
inc r10 #55.5
cmp r10, QWORD PTR [-32+rsp] #55.5[spill]
jb ..atom_loop_begin
vzeroupper #93.12
vxorpd xmm0, xmm0, xmm0 #93.12
#call getTimeStamp # xmm0 <- getTimeStamp()
#vsubsd xmm0, xmm0, QWORD PTR [-56+rsp] # xmm0 <- E-S
pop rbx
pop r15
pop r14 #93.12
pop r13 #93.12
pop r12 #93.12
pop rbp #93.12
ret #93.12
.type computeForceLJ,@function
.size computeForceLJ,.-computeForceLJ
..LNcomputeForce.0:
.data
# -- End computeForceLJ
.section .rodata, "a"
.align 64
.align 64
.L_2il0floatpacket.2:
.long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000
.type .L_2il0floatpacket.2,@object
.size .L_2il0floatpacket.2,64
.align 64
.L_2il0floatpacket.4:
.long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000
.type .L_2il0floatpacket.4,@object
.size .L_2il0floatpacket.4,64
.align 64
.L_2il0floatpacket.6:
.long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f
.type .L_2il0floatpacket.6,@object
.size .L_2il0floatpacket.6,64
.align 32
.L_2il0floatpacket.0:
.long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,32
.align 32
.L_2il0floatpacket.1:
.long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007
.type .L_2il0floatpacket.1,@object
.size .L_2il0floatpacket.1,32
.align 8
.L_2il0floatpacket.3:
.long 0x00000000,0x40480000
.type .L_2il0floatpacket.3,@object
.size .L_2il0floatpacket.3,8
.align 8
.L_2il0floatpacket.5:
.long 0x00000000,0x3ff00000
.type .L_2il0floatpacket.5,@object
.size .L_2il0floatpacket.5,8
.data
.section .note.GNU-stack, ""
# End

44
common/allocate.c Normal file
View File

@@ -0,0 +1,44 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <util.h>
void *allocate(int alignment, size_t bytesize) {
void *ptr;
int errorCode;
errorCode = posix_memalign(&ptr, alignment, bytesize);
if(errorCode == EINVAL) {
fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
exit(EXIT_FAILURE);
}
if(errorCode == ENOMEM) {
fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
exit(EXIT_FAILURE);
}
if(ptr == NULL) {
fprintf(stderr, "Error: posix_memalign failed!\n");
exit(EXIT_FAILURE);
}
return ptr;
}
void *reallocate(void* ptr, int alignment, size_t new_bytesize, size_t old_bytesize) {
void *newarray = allocate(alignment, new_bytesize);
if(ptr != NULL) {
memcpy(newarray, ptr, old_bytesize);
free(ptr);
}
return newarray;
}

68
common/device.c Normal file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
//---
#include <device.h>
#ifdef CUDA_TARGET
#include <cuda_runtime.h>
void cuda_assert(const char *label, cudaError_t err) {
if (err != cudaSuccess) {
printf("[CUDA Error]: %s: %s\r\n", label, cudaGetErrorString(err));
exit(-1);
}
}
void *allocateGPU(size_t bytesize) {
void *ptr;
#ifdef CUDA_HOST_MEMORY
cuda_assert("allocateGPU", cudaMallocHost((void **) &ptr, bytesize));
#else
cuda_assert("allocateGPU", cudaMalloc((void **) &ptr, bytesize));
#endif
return ptr;
}
// Data is not preserved
void *reallocateGPU(void *ptr, size_t new_bytesize) {
if(ptr != NULL) {
#ifdef CUDA_HOST_MEMORY
cudaFreeHost(ptr);
#else
cudaFree(ptr);
#endif
}
return allocateGPU(new_bytesize);
}
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {
#ifndef CUDA_HOST_MEMORY
cuda_assert("memcpyToGPU", cudaMemcpy(d_ptr, h_ptr, bytesize, cudaMemcpyHostToDevice));
#endif
}
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {
#ifndef CUDA_HOST_MEMORY
cuda_assert("memcpyFromGPU", cudaMemcpy(h_ptr, d_ptr, bytesize, cudaMemcpyDeviceToHost));
#endif
}
void memsetGPU(void *d_ptr, int value, size_t bytesize) {
cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
}
#else
void initDevice(Atom *atom, Neighbor *neighbor) {}
void *allocateGPU(size_t bytesize) { return NULL; }
void *reallocateGPU(void *ptr, size_t new_bytesize) { return NULL; }
void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize) {}
void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize) {}
void memsetGPU(void *d_ptr, int value, size_t bytesize) {}
#endif

View File

@@ -1,24 +1,8 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
@@ -43,7 +27,7 @@ void initEam(Eam* eam, Parameter* param) {
}
void coeff(Eam* eam, Parameter* param) {
read_file(&eam->file, param->input_file);
read_eam_file(&eam->file, param->eam_file);
param->mass = eam->file.mass;
param->cutforce = eam->file.cut;
param->cutneigh = param->cutforce + 1.0;
@@ -59,7 +43,7 @@ void init_style(Eam* eam, Parameter* param) {
array2spline(eam, param);
}
void read_file(Funcfl* file, const char* filename) {
void read_eam_file(Funcfl* file, const char* filename) {
FILE* fptr;
char line[MAXLINE];
@@ -70,10 +54,10 @@ void read_file(Funcfl* file, const char* filename) {
}
int tmp;
fgets(line, MAXLINE, fptr);
fgets(line, MAXLINE, fptr);
readline(line, fptr);
readline(line, fptr);
sscanf(line, "%d %lg", &tmp, &(file->mass));
fgets(line, MAXLINE, fptr);
readline(line, fptr);
sscanf(line, "%d %lg %d %lg %lg", &file->nrho, &file->drho, &file->nr, &file->dr, &file->cut);
//printf("Read: %lf %i %lf %i %lf %lf\n",file->mass,file->nrho,file->drho,file->nr,file->dr,file->cut);
@@ -277,9 +261,9 @@ void grab(FILE* fptr, int n, MD_FLOAT* list) {
int i = 0;
while(i < n) {
fgets(line, MAXLINE, fptr);
readline(line, fptr);
ptr = strtok(line, " \t\n\r\f");
list[i++] = atof(ptr);
while(ptr = strtok(NULL, " \t\n\r\f")) list[i++] = atof(ptr);
while((ptr = strtok(NULL, " \t\n\r\f"))) list[i++] = atof(ptr);
}
}

View File

@@ -0,0 +1,13 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#ifndef __ALLOCATE_H_
#define __ALLOCATE_H_
extern void* allocate (int alignment, size_t bytesize);
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
#endif

26
common/includes/device.h Normal file
View File

@@ -0,0 +1,26 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stddef.h>
//---
#include <atom.h>
#include <neighbor.h>
#ifndef __DEVICE_H_
#define __DEVICE_H_
#ifdef CUDA_TARGET
#include <cuda_runtime.h>
extern void cuda_assert(const char *msg, cudaError_t err);
#endif
extern void initDevice(Atom*, Neighbor*);
extern void *allocateGPU(size_t bytesize);
extern void *reallocateGPU(void *ptr, size_t new_bytesize);
extern void memcpyToGPU(void *d_ptr, void *h_ptr, size_t bytesize);
extern void memcpyFromGPU(void *h_ptr, void *d_ptr, size_t bytesize);
extern void memsetGPU(void *d_ptr, int value, size_t bytesize);
#endif

39
common/includes/eam.h Normal file
View File

@@ -0,0 +1,39 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <atom.h>
#include <parameter.h>
#ifndef __EAM_H_
#define __EAM_H_
typedef struct {
int nrho, nr;
MD_FLOAT drho, dr, cut, mass;
MD_FLOAT *frho, *rhor, *zr;
} Funcfl;
typedef struct {
MD_FLOAT* fp;
int nmax;
int nrho, nr;
int nrho_tot, nr_tot;
MD_FLOAT dr, rdr, drho, rdrho;
MD_FLOAT *frho, *rhor, *z2r;
MD_FLOAT *rhor_spline, *frho_spline, *z2r_spline;
Funcfl file;
} Eam;
void initEam(Eam* eam, Parameter* param);
void coeff(Eam* eam, Parameter* param);
void init_style(Eam* eam, Parameter *param);
void read_eam_file(Funcfl* file, const char* filename);
void file2array(Eam* eam);
void array2spline(Eam* eam, Parameter* param);
void interpolate(int n, MD_FLOAT delta, MD_FLOAT* f, MD_FLOAT* spline);
void grab(FILE* fptr, int n, MD_FLOAT* list);
#endif

View File

@@ -1,32 +1,8 @@
/*
* =======================================================================================
*
* Filename: likwid-marker.h
*
* Description: Header File of likwid Marker API
*
* Version: <VERSION>
* Released: <DATE>
*
* Authors: Thomas Gruber (tg), thomas.roehl@googlemail.com
*
* Project: likwid
*
* Copyright (C) 2016 RRZE, University Erlangen-Nuremberg
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
* =======================================================================================
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef LIKWID_MARKER_H
#define LIKWID_MARKER_H

View File

@@ -0,0 +1,59 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __PARAMETER_H_
#define __PARAMETER_H_
#if PRECISION == 1
#define MD_FLOAT float
#else
#define MD_FLOAT double
#endif
typedef struct {
int force_field;
char* param_file;
char* input_file;
char* vtk_file;
char* xtc_file;
MD_FLOAT epsilon;
MD_FLOAT sigma;
MD_FLOAT sigma6;
MD_FLOAT temp;
MD_FLOAT rho;
MD_FLOAT mass;
int ntypes;
int ntimes;
int nstat;
int reneigh_every;
int prune_every;
int x_out_every;
int v_out_every;
int half_neigh;
MD_FLOAT dt;
MD_FLOAT dtforce;
MD_FLOAT skin;
MD_FLOAT cutforce;
MD_FLOAT cutneigh;
int nx, ny, nz;
int pbc_x, pbc_y, pbc_z;
MD_FLOAT lattice;
MD_FLOAT xlo, xhi, ylo, yhi, zlo, zhi;
MD_FLOAT xprd, yprd, zprd;
double proc_freq;
char* eam_file;
// DEM
MD_FLOAT k_s;
MD_FLOAT k_dn;
MD_FLOAT gx, gy, gz;
MD_FLOAT reflect_x, reflect_y, reflect_z;
} Parameter;
void initParameter(Parameter*);
void readParameter(Parameter*, const char*);
void printParameter(Parameter*);
#endif

68
common/includes/simd.h Normal file
View File

@@ -0,0 +1,68 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __SIMD_H__
#define __SIMD_H__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#ifndef NO_ZMM_INTRIN
# include <zmmintrin.h>
#endif
#ifndef CLUSTER_M
# define CLUSTER_M 1
#endif
#ifndef CLUSTER_N
# define CLUSTER_N 1
#endif
#if defined(__ISA_AVX512__)
# if PRECISION == 2
# include "simd/avx512_double.h"
# else
# include "simd/avx512_float.h"
# endif
#endif
#if defined(__ISA_AVX2__)
# if PRECISION == 2
# include "simd/avx2_double.h"
# else
# include "simd/avx2_float.h"
# endif
#endif
#if defined(__ISA_AVX__)
# if PRECISION == 2
# include "simd/avx_double.h"
# else
# include "simd/avx_float.h"
# endif
#endif
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
double x[VECTOR_WIDTH];
memcpy(x, &a, sizeof(x));
fprintf(stdout, "%s: ", ref);
for(int i = 0; i < VECTOR_WIDTH; i++) {
fprintf(stdout, "%f ", x[i]);
}
fprintf(stdout, "\n");
}
static inline void simd_print_mask(const char *ref, MD_SIMD_MASK a) { fprintf(stdout, "%s: %x\n", ref, simd_mask_to_u32(a)); }
#endif // __SIMD_H__

View File

@@ -0,0 +1,101 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#define MD_SIMD_FLOAT __m256d
#define MD_SIMD_INT __m128i
#define MD_SIMD_MASK __m256d
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
MD_SIMD_FLOAT ret;
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX2 with double precision!");
exit(-1);
return ret;
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
MD_SIMD_FLOAT ret;
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX2 with double precision!");
exit(-1);
return ret;
}
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX2 with double precision!");
exit(-1);
return 0.0;
}
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m256d t0, t1, t2;
__m128d a0, a1;
t0 = _mm256_hadd_pd(v0, v1);
t1 = _mm256_hadd_pd(v2, v3);
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
//static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cvtepi32_pd(_mm_cmplt_epi32(a, b)); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
const unsigned long long int none = 0x0;
return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
}
// TODO: Implement this, althrough it is just required for debugging
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
// test with shuffle & add as an alternative to hadd later
a = _mm256_hadd_pd(a, a);
a0 = _mm256_castpd256_pd128(a);
a1 = _mm256_extractf128_pd(a, 0x1);
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
fprintf(stderr, "simd_h_decr3(): Not implemented for AVX2 with double precision!");
exit(-1);
}
// Functions used in LAMMPS kernel
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mul_epi32(a, b); }
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return simd_int_load(m) & _mm256_cvtpd_epi32(k); }

View File

@@ -0,0 +1,84 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <immintrin.h>
#include <zmmintrin.h>
#define MD_SIMD_FLOAT __m256
#define MD_SIMD_MASK __mmask8
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m128 t0, t2;
v0 = _mm256_hadd_ps(v0, v1);
v2 = _mm256_hadd_ps(v2, v3);
v0 = _mm256_hadd_ps(v0, v2);
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
t2 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t2);
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
return _mm256_broadcast_ps((const __m128 *)(m));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
__m128 t0, t1;
t0 = _mm_broadcast_ss(m);
t1 = _mm_broadcast_ss(m + 1);
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
}
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m128 t0, t1;
v0 = _mm256_hadd_ps(v0, v1);
t0 = _mm256_extractf128_ps(v0, 0x1);
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
t1 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t1);
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
_mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
}
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}

View File

@@ -0,0 +1,111 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <immintrin.h>
#ifndef NO_ZMM_INTRIN
# include <zmmintrin.h>
#endif
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
#define MD_SIMD_INT __m256i
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_pd(p, a); }
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_pd(_mm512_setzero_pd(), m, a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
return *((MD_FLOAT *) &x);
}
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m512d t0, t2;
__m256d t3, t4;
t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
t3 = _mm512_castpd512_pd256(t0);
t4 = _mm256_load_pd(m);
t4 = _mm256_add_pd(t4, t3);
_mm256_store_pd(m, t4);
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
}
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512d t0;
__m256d t2, t3;
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
t2 = _mm512_castpd512_pd256(t0);
t3 = _mm256_load_pd(m);
t3 = _mm256_add_pd(t3, t2);
_mm256_store_pd(m, t3);
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m256d t;
a = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
t = _mm256_load_pd(m);
t = _mm256_sub_pd(t, _mm512_castpd512_pd256(a));
_mm256_store_pd(m, t);
}
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}
// Functions used in LAMMPS kernel
//static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm512_i32gather_pd(vidx, m, s); }
#define simd_gather(vidx,m,s) (_mm512_i32gather_pd(vidx, m, s))
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm256_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm256_setzero_si256(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); }
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_si256((const MD_SIMD_INT *) m); }
//static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm256_load_epi32(m); }
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_add_epi32(a, b); }
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_mul_epi32(a, b); }
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return _mm256_mask_load_epi32(simd_int_zero(), k, m); }
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cmp_epi32_mask(a, b, _MM_CMPINT_LT); }

View File

@@ -0,0 +1,84 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <zmmintrin.h>
#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm512_store_ps(p, a); }
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm512_mask_mov_ps(_mm512_setzero_ps(), m, a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
exit(-1);
return 0.0;
}
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
exit(-1);
return 0.0;
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
}
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512 t0, t1;
__m128 t2, t3;
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
t0 = _mm512_add_ps(t0, t1);
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
t3 = _mm512_castps512_ps128(t0);
t2 = _mm_load_ps(m);
t2 = _mm_add_ps(t2, t3);
_mm_store_ps(m, t2);
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
return _mm_cvtss_f32(t3);
}
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m256 t;
a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
t = _mm256_load_ps(m);
t = _mm256_sub_ps(t, _mm512_castps512_ps256(a));
_mm256_store_ps(m, t);
}
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}

View File

@@ -0,0 +1,103 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#define MD_SIMD_FLOAT __m256d
#define MD_SIMD_INT __m128i
#define MD_SIMD_MASK __m256d
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_pd(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_pd(p, a); }
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
MD_SIMD_FLOAT ret;
fprintf(stderr, "simd_load_h_duplicate(): Not implemented for AVX with double precision!");
exit(-1);
return ret;
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
MD_SIMD_FLOAT ret;
fprintf(stderr, "simd_load_h_dual(): Not implemented for AVX with double precision!");
exit(-1);
return ret;
}
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
fprintf(stderr, "simd_h_dual_incr_reduced_sum(): Not implemented for AVX with double precision!");
exit(-1);
return 0.0;
}
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m256d t0, t1, t2;
__m128d a0, a1;
t0 = _mm256_hadd_pd(v0, v1);
t1 = _mm256_hadd_pd(v2, v3);
t2 = _mm256_permute2f128_pd(t0, t1, 0x21);
t0 = _mm256_add_pd(t0, t2);
t1 = _mm256_add_pd(t1, t2);
t0 = _mm256_blend_pd(t0, t1, 0b1100);
t1 = _mm256_add_pd(t0, _mm256_load_pd(m));
_mm256_store_pd(m, t1);
t0 = _mm256_add_pd(t0, _mm256_permute_pd(t0, 0b0101));
a0 = _mm256_castpd256_pd128(t0);
a1 = _mm256_extractf128_pd(t0, 0x1);
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_and_pd(a, m); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
#ifdef __ISA_AVX_FMA__
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
#else
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
#endif
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_int_cond_lt(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm256_cvtepi32_pd(_mm_cmplt_epi32(a, b)); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _mm256_and_pd(a, b); }
// TODO: Initialize all diagonal cases and just select the proper one (all bits set or diagonal) based on cond0
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
const unsigned long long int all = 0xFFFFFFFFFFFFFFFF;
const unsigned long long int none = 0x0;
return _mm256_castsi256_pd(_mm256_set_epi64x((a & 0x8) ? all : none, (a & 0x4) ? all : none, (a & 0x2) ? all : none, (a & 0x1) ? all : none));
}
// TODO: Implement this, althrough it is just required for debugging
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
a0 = _mm256_castpd256_pd128(a);
a1 = _mm256_extractf128_pd(a, 0x1);
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
fprintf(stderr, "simd_h_decr3(): Not implemented for AVX with double precision!");
exit(-1);
}
// Functions used in LAMMPS kernel
static inline MD_SIMD_FLOAT simd_gather(MD_SIMD_INT vidx, const MD_FLOAT *m, int s) { return _mm256_i32gather_pd(m, vidx, s); }
static inline MD_SIMD_INT simd_int_broadcast(int scalar) { return _mm_set1_epi32(scalar); }
static inline MD_SIMD_INT simd_int_zero() { return _mm_setzero_si128(); }
static inline MD_SIMD_INT simd_int_seq() { return _mm_set_epi32(3, 2, 1, 0); }
static inline MD_SIMD_INT simd_int_load(const int *m) { return _mm_load_si128((__m128i const *) m); }
static inline MD_SIMD_INT simd_int_add(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_add_epi32(a, b); }
static inline MD_SIMD_INT simd_int_mul(MD_SIMD_INT a, MD_SIMD_INT b) { return _mm_mul_epi32(a, b); }
static inline MD_SIMD_INT simd_int_mask_load(const int *m, MD_SIMD_MASK k) { return simd_int_load(m) & _mm256_cvtpd_epi32(k); }

View File

@@ -0,0 +1,84 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <immintrin.h>
#include <zmmintrin.h>
#define MD_SIMD_FLOAT __m256
#define MD_SIMD_MASK __mmask8
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
static inline void simd_store(MD_FLOAT *p, MD_SIMD_FLOAT a) { _mm256_store_ps(p, a); }
static inline MD_SIMD_FLOAT select_by_mask(MD_SIMD_FLOAT a, MD_SIMD_MASK m) { return _mm256_mask_mov_ps(_mm256_setzero_ps(), m, a); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
__m128 t0, t2;
v0 = _mm256_hadd_ps(v0, v1);
v2 = _mm256_hadd_ps(v2, v3);
v0 = _mm256_hadd_ps(v0, v2);
t0 = _mm_add_ps(_mm256_castps256_ps128(v0), _mm256_extractf128_ps(v0, 0x1));
t2 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t2);
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
return _mm256_broadcast_ps((const __m128 *)(m));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
__m128 t0, t1;
t0 = _mm_broadcast_ss(m);
t1 = _mm_broadcast_ss(m + 1);
return _mm256_insertf128_ps(_mm256_castps128_ps256(t0), t1, 0x1);
}
static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m128 t0, t1;
v0 = _mm256_hadd_ps(v0, v1);
t0 = _mm256_extractf128_ps(v0, 0x1);
t0 = _mm_hadd_ps(_mm256_castps256_ps128(v0), t0);
t0 = _mm_permute_ps(t0, _MM_SHUFFLE(3, 1, 2, 0));
t1 = _mm_add_ps(t0, _mm_load_ps(m));
_mm_store_ps(m, t1);
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
__m128 asum = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
_mm_store_ps(m, _mm_sub_ps(_mm_load_ps(m), asum));
}
static inline void simd_h_decr3(MD_FLOAT *m, MD_SIMD_FLOAT a0, MD_SIMD_FLOAT a1, MD_SIMD_FLOAT a2) {
simd_h_decr(m, a0);
simd_h_decr(m + CLUSTER_N, a1);
simd_h_decr(m + CLUSTER_N * 2, a2);
}

15
common/includes/thermo.h Normal file
View File

@@ -0,0 +1,15 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#include <atom.h>
#ifndef __THERMO_H_
#define __THERMO_H_
extern void setupThermo(Parameter*, int);
extern void computeThermo(int, Parameter*, Atom*);
extern void adjustThermo(Parameter*, Atom*);
#endif

17
common/includes/timers.h Normal file
View File

@@ -0,0 +1,17 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __TIMERS_H_
#define __TIMERS_H_
typedef enum {
TOTAL = 0,
NEIGH,
FORCE,
NUMTIMER
} timertype;
#endif

14
common/includes/timing.h Normal file
View File

@@ -0,0 +1,14 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __TIMING_H_
#define __TIMING_H_
extern double getTimeStamp();
extern double getTimeResolution();
extern double getTimeStamp_();
#endif

46
common/includes/util.h Normal file
View File

@@ -0,0 +1,46 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#ifndef __UTIL_H_
#define __UTIL_H_
#ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
#endif
#ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
#endif
#ifndef ABS
# define ABS(a) ((a) >= 0 ? (a) : -(a))
#endif
#define DEBUG_MESSAGE debug_printf
#ifndef MAXLINE
# define MAXLINE 4096
#endif
#define FF_LJ 0
#define FF_EAM 1
#define FF_DEM 2
#if PRECISION == 1
# define PRECISION_STRING "single"
#else
# define PRECISION_STRING "double"
#endif
extern double myrandom(int*);
extern void random_reset(int *seed, int ibase, double *coord);
extern int str2ff(const char *string);
extern const char* ff2str(int ff);
extern int get_num_threads();
extern void readline(char *line, FILE *fp);
extern void debug_printf(const char *format, ...);
#endif

180
common/parameter.c Normal file
View File

@@ -0,0 +1,180 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//---
#include <atom.h>
#include <parameter.h>
#include <util.h>
void initParameter(Parameter *param) {
param->input_file = NULL;
param->vtk_file = NULL;
param->xtc_file = NULL;
param->eam_file = NULL;
param->force_field = FF_LJ;
param->epsilon = 1.0;
param->sigma = 1.0;
param->sigma6 = 1.0;
param->rho = 0.8442;
param->ntypes = 4;
param->ntimes = 200;
param->dt = 0.005;
param->nx = 32;
param->ny = 32;
param->nz = 32;
param->pbc_x = 1;
param->pbc_y = 1;
param->pbc_z = 1;
param->cutforce = 2.5;
param->skin = 0.3;
param->cutneigh = param->cutforce + param->skin;
param->temp = 1.44;
param->nstat = 100;
param->mass = 1.0;
param->dtforce = 0.5 * param->dt;
param->reneigh_every = 20;
param->prune_every = 1000;
param->x_out_every = 20;
param->v_out_every = 5;
param->half_neigh = 0;
param->proc_freq = 2.4;
// DEM
param->k_s = 1.0;
param->k_dn = 1.0;
param->gx = 0.0;
param->gy = 0.0;
param->gz = 0.0;
param->reflect_x = 0.0;
param->reflect_y = 0.0;
param->reflect_z = 0.0;
}
void readParameter(Parameter *param, const char *filename) {
FILE *fp = fopen(filename, "r");
char line[MAXLINE];
int i;
if(!fp) {
fprintf(stderr, "Could not open parameter file: %s\n", filename);
exit(-1);
}
while(!feof(fp)) {
line[0] = '\0';
readline(line, fp);
for(i = 0; line[i] != '\0' && line[i] != '#'; i++);
line[i] = '\0';
char *tok = strtok(line, " ");
char *val = strtok(NULL, " ");
#define PARSE_PARAM(p,f) if(strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) { param->p = f(val); }
#define PARSE_STRING(p) PARSE_PARAM(p, strdup)
#define PARSE_INT(p) PARSE_PARAM(p, atoi)
#define PARSE_REAL(p) PARSE_PARAM(p, atof)
if(tok != NULL && val != NULL) {
PARSE_PARAM(force_field, str2ff);
PARSE_STRING(input_file);
PARSE_STRING(eam_file);
PARSE_STRING(vtk_file);
PARSE_STRING(xtc_file);
PARSE_REAL(epsilon);
PARSE_REAL(sigma);
PARSE_REAL(k_s);
PARSE_REAL(k_dn);
PARSE_REAL(reflect_x);
PARSE_REAL(reflect_y);
PARSE_REAL(reflect_z);
PARSE_REAL(gx);
PARSE_REAL(gy);
PARSE_REAL(gz);
PARSE_REAL(rho);
PARSE_REAL(dt);
PARSE_REAL(cutforce);
PARSE_REAL(skin);
PARSE_REAL(temp);
PARSE_REAL(mass);
PARSE_REAL(proc_freq);
PARSE_INT(ntypes);
PARSE_INT(ntimes);
PARSE_INT(nx);
PARSE_INT(ny);
PARSE_INT(nz);
PARSE_INT(pbc_x);
PARSE_INT(pbc_y);
PARSE_INT(pbc_z);
PARSE_INT(nstat);
PARSE_INT(reneigh_every);
PARSE_INT(prune_every);
PARSE_INT(x_out_every);
PARSE_INT(v_out_every);
PARSE_INT(half_neigh);
}
}
// Update dtforce
param->dtforce = 0.5 * param->dt;
// Update sigma6 parameter
MD_FLOAT s2 = param->sigma * param->sigma;
param->sigma6 = s2 * s2 * s2;
fclose(fp);
}
void printParameter(Parameter *param) {
printf("Parameters:\n");
if(param->input_file != NULL) {
printf("Input file: %s\n", param->input_file);
}
if(param->vtk_file != NULL) {
printf("VTK file: %s\n", param->vtk_file);
}
if(param->xtc_file != NULL) {
printf("XTC file: %s\n", param->xtc_file);
}
if(param->eam_file != NULL) {
printf("EAM file: %s\n", param->eam_file);
}
printf("\tForce field: %s\n", ff2str(param->force_field));
#ifdef CLUSTER_M
printf("\tKernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
#else
printf("\tKernel: %s\n", KERNEL_NAME);
#endif
printf("\tData layout: %s\n", POS_DATA_LAYOUT);
printf("\tFloating-point precision: %s\n", PRECISION_STRING);
printf("\tUnit cells (nx, ny, nz): %d, %d, %d\n", param->nx, param->ny, param->nz);
printf("\tDomain box sizes (x, y, z): %e, %e, %e\n", param->xprd, param->yprd, param->zprd);
printf("\tPeriodic (x, y, z): %d, %d, %d\n", param->pbc_x, param->pbc_y, param->pbc_z);
printf("\tLattice size: %e\n", param->lattice);
printf("\tEpsilon: %e\n", param->epsilon);
printf("\tSigma: %e\n", param->sigma);
printf("\tSpring constant: %e\n", param->k_s);
printf("\tDamping constant: %e\n", param->k_dn);
printf("\tTemperature: %e\n", param->temp);
printf("\tRHO: %e\n", param->rho);
printf("\tMass: %e\n", param->mass);
printf("\tNumber of types: %d\n", param->ntypes);
printf("\tNumber of timesteps: %d\n", param->ntimes);
printf("\tReport stats every (timesteps): %d\n", param->nstat);
printf("\tReneighbor every (timesteps): %d\n", param->reneigh_every);
printf("\tPrune every (timesteps): %d\n", param->prune_every);
printf("\tOutput positions every (timesteps): %d\n", param->x_out_every);
printf("\tOutput velocities every (timesteps): %d\n", param->v_out_every);
printf("\tDelta time (dt): %e\n", param->dt);
printf("\tCutoff radius: %e\n", param->cutforce);
printf("\tSkin: %e\n", param->skin);
printf("\tHalf neighbor lists: %d\n", param->half_neigh);
printf("\tProcessor frequency (GHz): %.4f\n", param->proc_freq);
}

View File

@@ -1,30 +1,15 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <thermo.h>
#include <util.h>
static int *steparr;
static MD_FLOAT *tmparr;
@@ -56,7 +41,7 @@ void setupThermo(Parameter *param, int natoms)
t_scale = mvv2e / dof_boltz;
p_scale = 1.0 / 3 / param->xprd / param->yprd / param->zprd;
e_scale = 0.5;
} else {
} else if(param->force_field == FF_EAM) {
mvv2e = 1.036427e-04;
dof_boltz = (natoms * 3 - 3) * 8.617343e-05;
t_scale = mvv2e / dof_boltz;
@@ -64,19 +49,13 @@ void setupThermo(Parameter *param, int natoms)
e_scale = 524287.985533;//16.0;
param->dtforce /= mvv2e;
}
printf("step\ttemp\t\tpressure\n");
}
void computeThermo(int iflag, Parameter *param, Atom *atom)
{
MD_FLOAT t = 0.0, p;
MD_FLOAT* vx = atom->vx;
MD_FLOAT* vy = atom->vy;
MD_FLOAT* vz = atom->vz;
for(int i = 0; i < atom->Nlocal; i++) {
t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
}
t = t * t_scale;
@@ -101,12 +80,11 @@ void adjustThermo(Parameter *param, Atom *atom)
{
/* zero center-of-mass motion */
MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
for(int i = 0; i < atom->Nlocal; i++) {
vxtot += vx[i];
vytot += vy[i];
vztot += vz[i];
vxtot += atom_vx(i);
vytot += atom_vy(i);
vztot += atom_vz(i);
}
vxtot = vxtot / atom->Natoms;
@@ -114,24 +92,24 @@ void adjustThermo(Parameter *param, Atom *atom)
vztot = vztot / atom->Natoms;
for(int i = 0; i < atom->Nlocal; i++) {
vx[i] -= vxtot;
vy[i] -= vytot;
vz[i] -= vztot;
atom_vx(i) -= vxtot;
atom_vy(i) -= vytot;
atom_vz(i) -= vztot;
}
t_act = 0;
MD_FLOAT t = 0.0;
for(int i = 0; i < atom->Nlocal; i++) {
t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
}
t *= t_scale;
MD_FLOAT factor = sqrt(param->temp / t);
for(int i = 0; i < atom->Nlocal; i++) {
vx[i] *= factor;
vy[i] *= factor;
vz[i] *= factor;
atom_vx(i) *= factor;
atom_vy(i) *= factor;
atom_vz(i) *= factor;
}
}

27
common/timing.c Normal file
View File

@@ -0,0 +1,27 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <time.h>
double getTimeStamp()
{
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
}
double getTimeResolution()
{
struct timespec ts;
clock_getres(CLOCK_MONOTONIC, &ts);
return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
}
double getTimeStamp_()
{
return getTimeStamp();
}

105
common/util.c Normal file
View File

@@ -0,0 +1,105 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <util.h>
/* Park/Miller RNG w/out MASKING, so as to be like f90s version */
#define IA 16807
#define IM 2147483647
#define AM (1.0/IM)
#define IQ 127773
#define IR 2836
#define MASK 123459876
double myrandom(int* seed) {
int k= (*seed) / IQ;
double ans;
*seed = IA * (*seed - k * IQ) - IR * k;
if(*seed < 0) *seed += IM;
ans = AM * (*seed);
return ans;
}
void random_reset(int *seed, int ibase, double *coord) {
int i;
char *str = (char *) &ibase;
int n = sizeof(int);
unsigned int hash = 0;
for (i = 0; i < n; i++) {
hash += str[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
str = (char *) coord;
n = 3 * sizeof(double);
for (i = 0; i < n; i++) {
hash += str[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
// keep 31 bits of unsigned int as new seed
// do not allow seed = 0, since will cause hang in gaussian()
*seed = hash & 0x7ffffff;
if (!(*seed)) *seed = 1;
// warm up the RNG
for (i = 0; i < 5; i++) myrandom(seed);
//save = 0;
}
int str2ff(const char *string) {
if(strncmp(string, "lj", 2) == 0) return FF_LJ;
if(strncmp(string, "eam", 3) == 0) return FF_EAM;
if(strncmp(string, "dem", 3) == 0) return FF_DEM;
return -1;
}
const char* ff2str(int ff) {
if(ff == FF_LJ) { return "lj"; }
if(ff == FF_EAM) { return "eam"; }
if(ff == FF_DEM) { return "dem"; }
return "invalid";
}
int get_num_threads() {
const char *num_threads_env = getenv("NUM_THREADS");
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
}
void readline(char *line, FILE *fp) {
if(fgets(line, MAXLINE, fp) == NULL) {
if(errno != 0) {
perror("readline()");
exit(-1);
}
}
}
void debug_printf(const char *format, ...) {
#ifdef DEBUG
va_list arg;
int ret;
va_start(arg, format);
if((vfprintf(stdout, format, arg)) < 0) { perror("debug_printf()"); }
va_end(arg);
#endif
}

View File

@@ -1,29 +1,48 @@
# Compiler tag (GCC/CLANG/ICC)
TAG ?= CLANG
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TAG ?= NVCC
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= false
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= true
# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
NEIGHBORS_LOOP_RUNS ?= 1
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
INDEX_TRACER ?= false
# Vector width (elements) for index and distance tracer
VECTOR_WIDTH ?= 8
# Compute statistics
COMPUTE_STATS ?= true
# Configurations for lammps optimization scheme
# Use omp simd pragma when running with half neighbor-lists
ENABLE_OMP_SIMD ?= true
# Use kernel with explicit SIMD intrinsics
USE_SIMD_KERNEL ?= false
# Configurations for gromacs optimization scheme
# Use reference version
USE_REFERENCE_VERSION ?= false
# Enable XTC output
XTC_OUTPUT ?= false
# Check if cj is local when decreasing reaction force
HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
# Configurations for CUDA
# Use CUDA host memory to optimize transfers
USE_CUDA_HOST_MEMORY ?= false
USE_SUPER_CLUSTERS ?= true
#Feature options
OPTIONS = -DALIGNMENT=64
#OPTIONS += More options

File diff suppressed because it is too large Load Diff

1003
data/argon_1000/conf.gro Normal file

File diff suppressed because it is too large Load Diff

244
data/argon_1000/grompp.mdp Normal file
View File

@@ -0,0 +1,244 @@
;
; Generated by:
; Vitaly V. Chaban
; School of Chemistry
; University of Kharkiv
; Ukraine, Kharkiv-61077, Svoboda sq., 4
; email: chaban@univer.kharkov.ua, vvchaban@gmail.com
; skype: vvchaban
; System: Liquid argon (1000 atoms) at 80 K. Equilibrated for 500ps.
; VARIOUS PREPROCESSING OPTIONS
title = Yo
cpp = /usr/bin/cpp
include =
define =
; RUN CONTROL PARAMETERS
integrator = md
; Start time and timestep in ps
tinit = 0
dt = 0.001
nsteps = 250000
; For exact run continuation or redoing part of a run
init_step = 0
; mode for center of mass motion removal
comm-mode = Linear
; number of steps for center of mass motion removal
nstcomm = 1
; group(s) for center of mass motion removal
comm-grps =
; LANGEVIN DYNAMICS OPTIONS
; Temperature, friction coefficient (amu/ps) and random seed
bd-temp = 300
bd-fric = 0
ld-seed = 1993
; ENERGY MINIMIZATION OPTIONS
; Force tolerance and initial step-size
emtol = 100
emstep = 0.01
; Max number of iterations in relax_shells
niter = 20
; Step size (1/ps^2) for minimization of flexible constraints
fcstep = 0
; Frequency of steepest descents steps when doing CG
nstcgsteep = 1000
nbfgscorr = 10
; OUTPUT CONTROL OPTIONS
; Output frequency for coords (x), velocities (v) and forces (f)
nstxout = 500
nstvout = 5
nstfout = 0
; Checkpointing helps you continue after crashes
nstcheckpoint = 1000
; Output frequency for energies to log file and energy file
nstlog = 50
nstenergy = 50
; Output frequency and precision for xtc file
nstxtcout = 5
xtc-precision = 1000
; This selects the subset of atoms for the xtc file. You can
; select multiple groups. By default all atoms will be written.
xtc-grps =
; Selection of energy groups
energygrps =
; NEIGHBORSEARCHING PARAMETERS
; nblist update frequency
nstlist = 5
; ns algorithm (simple or grid)
ns_type = grid
; Periodic boundary conditions: xyz (default), no (vacuum)
; or full (infinite systems only)
pbc = xyz
; nblist cut-off
rlist = 0.9
domain-decomposition = no
; OPTIONS FOR ELECTROSTATICS AND VDW
; Method for doing electrostatics
coulombtype = Cut-off
rcoulomb-switch = 0
rcoulomb = 0.9
; Dielectric constant (DC) for cut-off or DC of reaction field
epsilon-r = 1
; Method for doing Van der Waals
vdw-type = Cut-off
; cut-off lengths
rvdw-switch = 0
rvdw = 0.9
; Apply long range dispersion corrections for Energy and Pressure
DispCorr = EnerPres
; Extension of the potential lookup tables beyond the cut-off
table-extension = 1
; Spacing for the PME/PPPM FFT grid
fourierspacing = 0.12
; FFT grid size, when a value is 0 fourierspacing will be used
fourier_nx = 0
fourier_ny = 0
fourier_nz = 0
; EWALD/PME/PPPM parameters
pme_order = 4
ewald_rtol = 1e-05
ewald_geometry = 3d
epsilon_surface = 0
optimize_fft = no
; GENERALIZED BORN ELECTROSTATICS
; Algorithm for calculating Born radii
gb_algorithm = Still
; Frequency of calculating the Born radii inside rlist
nstgbradii = 1
; Cutoff for Born radii calculation; the contribution from atoms
; between rlist and rgbradii is updated every nstlist steps
rgbradii = 2
; Salt concentration in M for Generalized Born models
gb_saltconc = 0
; IMPLICIT SOLVENT (for use with Generalized Born electrostatics)
implicit_solvent = No
; OPTIONS FOR WEAK COUPLING ALGORITHMS
; Temperature coupling
Tcoupl = berendsen
; Groups to couple separately
tc-grps = System
; Time constant (ps) and reference temperature (K)
tau_t = 0.1
ref_t = 80
; Pressure coupling
Pcoupl = no
Pcoupltype = isotropic
; Time constant (ps), compressibility (1/bar) and reference P (bar)
tau_p = 1.0
compressibility = 4.5e-5
ref_p = 1.0
; Random seed for Andersen thermostat
andersen_seed = 815131
; SIMULATED ANNEALING
; Type of annealing for each temperature group (no/single/periodic)
annealing = no
; Number of time points to use for specifying annealing in each group
annealing_npoints =
; List of times at the annealing points for each group
annealing_time =
; Temp. at each annealing point, for each group.
annealing_temp =
; GENERATE VELOCITIES FOR STARTUP RUN
gen_vel = yes
gen_temp = 80
gen_seed = 1993
; OPTIONS FOR BONDS
constraints = all-bonds
; Type of constraint algorithm
constraint-algorithm = Lincs
; Do not constrain the start configuration
unconstrained-start = no
; Use successive overrelaxation to reduce the number of shake iterations
Shake-SOR = no
; Relative tolerance of shake
shake-tol = 1e-04
; Highest order in the expansion of the constraint coupling matrix
lincs-order = 4
; Number of iterations in the final step of LINCS. 1 is fine for
; normal simulations, but use 2 to conserve energy in NVE runs.
; For energy minimization with constraints it should be 4 to 8.
lincs-iter = 1
; Lincs will write a warning to the stderr if in one step a bond
; rotates over more degrees than
lincs-warnangle = 30
; Convert harmonic bonds to morse potentials
morse = no
; ENERGY GROUP EXCLUSIONS
; Pairs of energy groups for which all non-bonded interactions are excluded
energygrp_excl =
; NMR refinement stuff
; Distance restraints type: No, Simple or Ensemble
disre = No
; Force weighting of pairs in one distance restraint: Conservative or Equal
disre-weighting = Conservative
; Use sqrt of the time averaged times the instantaneous violation
disre-mixed = no
disre-fc = 1000
disre-tau = 0
; Output frequency for pair distances to energy file
nstdisreout = 100
; Orientation restraints: No or Yes
orire = no
; Orientation restraints force constant and tau for time averaging
orire-fc = 0
orire-tau = 0
orire-fitgrp =
; Output frequency for trace(SD) to energy file
nstorireout = 100
; Dihedral angle restraints: No, Simple or Ensemble
dihre = No
dihre-fc = 1000
dihre-tau = 0
; Output frequency for dihedral values to energy file
nstdihreout = 100
; Free energy control stuff
free-energy = no
init-lambda = 0
delta-lambda = 0
sc-alpha = 0
sc-sigma = 0.3
; Non-equilibrium MD stuff
acc-grps =
accelerate =
freezegrps =
freezedim =
cos-acceleration = 0
; Electric fields
; Format is number of terms (int) and for all terms an amplitude (real)
; and a phase angle (real)
E-x =
E-xt =
E-y =
E-yt =
E-z =
E-zt =
; User defined thingies
user1-grps =
user2-grps =
userint1 = 0
userint2 = 0
userint3 = 0
userint4 = 0
userreal1 = 0
userreal2 = 0
userreal3 = 0
userreal4 = 0

View File

@@ -0,0 +1,12 @@
mass 39.94
sigma 0.0062220
epsilon 0.0000096960
ntimes 250000
dt 0.001
temp 80
x_out_freq 500
v_out_freq 5
cutforce 0.9
skin 0.0
reneigh_every 100
nstat 125000

1003
data/argon_1000/tprout.gro Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,304 @@
DATE: 2007-06-11 UNITS: metal CONTRIBUTOR: Stephen Foiles, foiles@sandia.gov CITATION: Foiles et al, Phys Rev B, 33, 7983 (1986) COMMENT: Cu functions (universal 3), SM Foiles et al, PRB, 33, 7983 (1986)
29 63.550 3.6150 FCC
500 5.0100200400801306e-04 500 1.0000000000000009e-02 4.9499999999999886e+00
0. -3.1561636903424350e-01 -5.2324876182494506e-01 -6.9740831416804383e-01 -8.5202525457518519e-01
-9.9329216586042435e-01 -1.1246331970890324e+00 -1.2481882647347859e+00 -1.3654054700363645e+00 -1.4773214276236644e+00
-1.5847099936904741e+00 -1.6865851873526410e+00 -1.7843534091637920e+00 -1.8790616476576076e+00 -1.9710188604521761e+00
-2.0604838665854572e+00 -2.1476762477372944e+00 -2.2327843595560068e+00 -2.3159713409697673e+00 -2.3973797031286352e+00
-2.4771348895887826e+00 -2.5553480773272810e+00 -2.6321184083774227e+00 -2.7075347880408458e+00 -2.7816773487592030e+00
-2.8546186529652005e+00 -2.9264246898861899e+00 -2.9971557080624507e+00 -3.0668669157065978e+00 -3.1356090736776849e+00
-3.2034290008357829e+00 -3.2703700069757247e+00 -3.3364722658277230e+00 -3.4017731379735778e+00 -3.4663074517059016e+00
-3.5301077484029122e+00 -3.5932044977085980e+00 -3.6556262870729199e+00 -3.7173999892229403e+00 -3.7785509106421671e+00
-3.8391029237823773e+00 -3.8990785849196925e+00 -3.9584992397079333e+00 -4.0173851179270912e+00 -4.0744518500210916e+00
-4.1306733564032641e+00 -4.1864034067843932e+00 -4.2416582335814326e+00 -4.2964533268445280e+00 -4.3508034838872618e+00
-4.4047228547107977e+00 -4.4582249835318351e+00 -4.5113228468570128e+00 -4.5640288884490872e+00 -4.6163550514904443e+00
-4.6683128082199232e+00 -4.7199131872767452e+00 -4.7711667990036801e+00 -4.8220838587683374e+00 -4.8726742087289665e+00
-4.9229473379113813e+00 -4.9729124009208192e+00 -5.0225782353423369e+00 -5.0719533779533492e+00 -5.1210460798461668e+00
-5.1698643205481289e+00 -5.2184158212228908e+00 -5.2667080570261362e+00 -5.3147482686812282e+00 -5.3625434733324937e+00
-5.4101004747367369e+00 -5.4574258728391953e+00 -5.5045260727784751e+00 -5.5514072933650311e+00 -5.5980755750691458e+00
-5.6445367875538750e+00 -5.6907966367860183e+00 -5.7368606717507191e+00 -5.7827342908000219e+00 -5.8284227476608805e+00
-5.8739311571204382e+00 -5.9192645004390272e+00 -5.9644276303605182e+00 -6.0094252761103064e+00 -6.0542620478988169e+00
-6.0989424413057520e+00 -6.1434708414539330e+00 -6.1878515269578429e+00 -6.2320886736884802e+00 -6.2761863583589275e+00
-6.3201485619430571e+00 -6.3639791729330000e+00 -6.4076819904493902e+00 -6.4512607272098990e+00 -6.4947190123648113e+00
-6.5380603942065250e+00 -6.5812883427622069e+00 -6.6243939095620874e+00 -6.6670830925929181e+00 -6.7096660473058591e+00
-6.7521459135001862e+00 -6.7945257643836499e+00 -6.8368086085521611e+00 -6.8789973918942735e+00 -6.9210949994162263e+00
-6.9631042569970703e+00 -7.0050279330721992e+00 -7.0468687402560874e+00 -7.0886293368973554e+00 -7.1303123285804020e+00
-7.1719202695651916e+00 -7.2134556641788095e+00 -7.2549209681507421e+00 -7.2963185899023415e+00 -7.3376508917899628e+00
-7.3789201913012903e+00 -7.4201287622117036e+00 -7.4612788356982946e+00 -7.5023726014152032e+00 -7.5434122085331978e+00
-7.5843997667427345e+00 -7.6253373472216595e+00 -7.6662269835740062e+00 -7.7070706727342895e+00 -7.7478703758424388e+00
-7.7886280190928119e+00 -7.8293454945503811e+00 -7.8700246609474789e+00 -7.9106673444489104e+00 -7.9512753393968865e+00
-7.9918504090315139e+00 -8.0323942861870705e+00 -8.0729086739704030e+00 -8.1133952464140293e+00 -8.1538556491162808e+00
-8.1942914998523975e+00 -8.2347043891773524e+00 -8.2750958810033808e+00 -8.3154675131659701e+00 -8.3558207979692725e+00
-8.3961572227176475e+00 -8.4364782502312892e+00 -8.4767853193496308e+00 -8.5170798454139458e+00 -8.5573632207473906e+00
-8.5976368151087286e+00 -8.6379019761436666e+00 -8.6781600298199919e+00 -8.7184122808490656e+00 -8.7586600130993020e+00
-8.7989044899963460e+00 -8.8391469549140993e+00 -8.8793886315543773e+00 -8.9196307243150841e+00 -8.9598744186541239e+00
-9.0001208814363167e+00 -9.0403712612778122e+00 -9.0806266888772029e+00 -9.1208882773446476e+00 -9.1611571225108719e+00
-9.2014343032440138e+00 -9.2417208817437881e+00 -9.2820179038447463e+00 -9.3223263992829857e+00 -9.3626473819958278e+00
-9.4029818503831279e+00 -9.4433307875392529e+00 -9.4836951616705960e+00 -9.5237840547885071e+00 -9.5637918926951784e+00
-9.6038142178817338e+00 -9.6438519061474608e+00 -9.6839058194810832e+00 -9.7239768064614509e+00 -9.7640657024289226e+00
-9.8041733297054634e+00 -9.8443004978059889e+00 -9.8844480036373170e+00 -9.9246166317080906e+00 -9.9648071543198853e+00
-1.0005020331762637e+01 -1.0045256912501884e+01 -1.0085517633366123e+01 -1.0125803219723423e+01 -1.0166114385662183e+01
-1.0206451834160134e+01 -1.0246816257258331e+01 -1.0287208336224353e+01 -1.0327628741713852e+01 -1.0368078133934148e+01
-1.0408557162795717e+01 -1.0449066468066974e+01 -1.0489606679525650e+01 -1.0530178417100558e+01 -1.0570782291022510e+01
-1.0611418901960292e+01 -1.0652088841158786e+01 -1.0692792690577562e+01 -1.0733531023022920e+01 -1.0774304402276016e+01
-1.0815113383222808e+01 -1.0855958511980305e+01 -1.0896840326017184e+01 -1.0937759354276295e+01 -1.0978716117290730e+01
-1.1019711127305925e+01 -1.1060744888386239e+01 -1.1101817896531486e+01 -1.1142930639787664e+01 -1.1184083598352004e+01
-1.1225277244679319e+01 -1.1266512043589387e+01 -1.1307788452364719e+01 -1.1349106920870327e+01 -1.1390467891550486e+01
-1.1431871799781504e+01 -1.1473319073642074e+01 -1.1514810134213008e+01 -1.1556345395619132e+01 -1.1597925265115521e+01
-1.1639550143177303e+01 -1.1681220423591583e+01 -1.1722936493536452e+01 -1.1764698733669888e+01 -1.1806507518187232e+01
-1.1848363215029394e+01 -1.1890266185706139e+01 -1.1932216785634637e+01 -1.1974215364086319e+01 -1.2016262264291129e+01
-1.2058357823507606e+01 -1.2100502373105996e+01 -1.2142696238631970e+01 -1.2184939739884385e+01 -1.2227233190982815e+01
-1.2269576900438324e+01 -1.2311971171220080e+01 -1.2354416300827552e+01 -1.2396912581348374e+01 -1.2439460299532641e+01
-1.2482059736851909e+01 -1.2524711169562636e+01 -1.2567414868772744e+01 -1.2610171100495961e+01 -1.2652980125719694e+01
-1.2695842200459083e+01 -1.2738757575819193e+01 -1.2781726498053729e+01 -1.2824749208615117e+01 -1.2867825944219817e+01
-1.2910956936899197e+01 -1.2954142414054047e+01 -1.2997382598508125e+01 -1.3040677708563408e+01 -1.3084027958052218e+01
-1.3127433556386677e+01 -1.3170894708610035e+01 -1.3214411615448739e+01 -1.3257984473359954e+01 -1.3301613474583519e+01
-1.3345298807190659e+01 -1.3389040655121903e+01 -1.3432839198243016e+01 -1.3476694612386723e+01 -1.3520607069407617e+01
-1.3564576737214225e+01 -1.3608603779754390e+01 -1.3652688357330362e+01 -1.3696830626228689e+01 -1.3741030739041094e+01
-1.3785288844633044e+01 -1.3829605088192579e+01 -1.3873979611263849e+01 -1.3918412551792358e+01 -1.3962904044165157e+01
-1.4007454219246995e+01 -1.4052063204422609e+01 -1.4096731123636516e+01 -1.4141458097424390e+01 -1.4186244242962175e+01
-1.4231089674089560e+01 -1.4275994501358696e+01 -1.4320958832063411e+01 -1.4365982770278379e+01 -1.4411066416893846e+01
-1.4456209869649911e+01 -1.4501413223171539e+01 -1.4546676569005058e+01 -1.4591999995647598e+01 -1.4637383588581656e+01
-1.4682827430315228e+01 -1.4728331600403862e+01 -1.4773896175488971e+01 -1.4819521229330235e+01 -1.4865206832833337e+01
-1.4910953054084985e+01 -1.4956759958383259e+01 -1.5002627608264334e+01 -1.5048556063539081e+01 -1.5094545381317744e+01
-1.5140595616041765e+01 -1.5186706819511983e+01 -1.5232879040916600e+01 -1.5279112326867676e+01 -1.5325406721414765e+01
-1.5371762266086876e+01 -1.5418178999911675e+01 -1.5464656959446415e+01 -1.5511196178805903e+01 -1.5557796689685119e+01
-1.5604458521389688e+01 -1.5651181700861002e+01 -1.5697966252703509e+01 -1.5744812199205967e+01 -1.5791719560374304e+01
-1.5838688353945599e+01 -1.5885718595428898e+01 -1.5932810298111235e+01 -1.5979963473102316e+01 -1.6027178129340314e+01
-1.6074454273625634e+01 -1.6121791910645470e+01 -1.6169191042992907e+01 -1.6216651671189425e+01 -1.6264173793714576e+01
-1.6311757407021901e+01 -1.6359402505566209e+01 -1.6407109081822910e+01 -1.6454877126310635e+01 -1.6502706627614998e+01
-1.6550597572407241e+01 -1.6598549945469813e+01 -1.6646563729715353e+01 -1.6694638906205682e+01 -1.6742775454176012e+01
-1.6790973351056778e+01 -1.6839232572488413e+01 -1.6887553092348412e+01 -1.6935934882766333e+01 -1.6984377914146876e+01
-1.7032882155186826e+01 -1.7081447572897673e+01 -1.7130074132623690e+01 -1.7178761798061373e+01 -1.7227510531275698e+01
-1.7276320292724563e+01 -1.7325191041271864e+01 -1.7374122734215121e+01 -1.7423115327299456e+01 -1.7472168774711918e+01
-1.7521283029136725e+01 -1.7570458041655343e+01 -1.7619693762170868e+01 -1.7668990138814479e+01 -1.7718347118374936e+01
-1.7767764646209685e+01 -1.7817242666259403e+01 -1.7866781121071881e+01 -1.7916379951810882e+01 -1.7966039098283659e+01
-1.8015758498943796e+01 -1.8065538090918608e+01 -1.8115377810021755e+01 -1.8165277590764617e+01 -1.8215237366381530e+01
-1.8265257068836149e+01 -1.8315336628844307e+01 -1.8365475975885602e+01 -1.8415675038220570e+01 -1.8465933742903644e+01
-1.8516252015799409e+01 -1.8566629781600568e+01 -1.8617066963838965e+01 -1.8667563484898778e+01 -1.8718119266039025e+01
-1.8768734227397317e+01 -1.8819408288014415e+01 -1.8870141365839345e+01 -1.8920933377750998e+01 -1.8971784239569388e+01
-1.9022693866067016e+01 -1.9073662170983084e+01 -1.9124689067045438e+01 -1.9175774465969539e+01 -1.9226918278483254e+01
-1.9278120414338218e+01 -1.9329380782317116e+01 -1.9380699290257098e+01 -1.9432075845048644e+01 -1.9483510352663075e+01
-1.9535002718153464e+01 -1.9586552845676124e+01 -1.9638160638497766e+01 -1.9689825999008235e+01 -1.9741548828738019e+01
-1.9793329028359494e+01 -1.9845166497711489e+01 -1.9897061135804051e+01 -1.9949012840833348e+01 -2.0001021510188707e+01
-2.0053087040468540e+01 -2.0105209327494322e+01 -2.0157388266314911e+01 -2.0209623751249865e+01 -2.0261915675825890e+01
-2.0314263932714312e+01 -2.0366668414255741e+01 -2.0419129011700647e+01 -2.0471645615726288e+01 -2.0524218116314501e+01
-2.0576846402769888e+01 -2.0629530363722893e+01 -2.0682269887147754e+01 -2.0735064860369221e+01 -2.0787915170073120e+01
-2.0840820702317274e+01 -2.0893781342541502e+01 -2.0946796975575580e+01 -2.0999867485656864e+01 -2.1052992756428125e+01
-2.1106172670961428e+01 -2.1159407111702421e+01 -2.1212695960751944e+01 -2.1266039099329419e+01 -2.1319436408360275e+01
-2.1372887768154328e+01 -2.1426393058473991e+01 -2.1479952158748461e+01 -2.1533564947619766e+01 -2.1587231303431395e+01
-2.1640951103995235e+01 -2.1694724226644553e+01 -2.1748550548245930e+01 -2.1802429945213817e+01 -2.1856362293508028e+01
-2.1910347468648524e+01 -2.1964385345728829e+01 -2.2018475799410339e+01 -2.2072618703948137e+01 -2.2126813933181779e+01
-2.2181061360561898e+01 -2.2235360859143157e+01 -2.2289712301596296e+01 -2.2344115560361388e+01 -2.2398570507087584e+01
-2.2453077013515781e+01 -2.2507634950890292e+01 -2.2562244190064348e+01 -2.2616904601590250e+01 -2.2671616055687764e+01
-2.2726378422261405e+01 -2.2781191570901910e+01 -2.2836055370890790e+01 -2.2890969691219198e+01 -2.2945934400583837e+01
-2.3000949367399926e+01 -2.3056014459808921e+01 -2.3111129545678523e+01 -2.3166294492618363e+01 -2.3221509167983868e+01
-2.3276773438880355e+01 -2.3332087172173260e+01 -2.3387450234495873e+01 -2.3442862492249787e+01 -2.3498323811618320e+01
-2.3553834058571510e+01 -2.3609393098863848e+01 -2.3665000798062465e+01 -2.3720657021526677e+01 -2.3776361634436626e+01
-2.3832114501780552e+01 -2.3887915488378439e+01 -2.3943764458878377e+01 -2.3999661277761106e+01 -2.4055605809352301e+01
-2.4111597917826657e+01 -2.4167637467209488e+01 -2.4223724321393092e+01 -2.4279858344124932e+01 -2.4336039399030597e+01
-2.4392267349614485e+01 -2.4448542059257761e+01 -2.4504863391234494e+01 -2.4561231208711206e+01 -2.4617645374753693e+01
-2.4674105752332935e+01 -2.4730612204329191e+01 -2.4787164593538137e+01 -2.4843762782677913e+01 -2.4900406634392539e+01
-2.4957096011252133e+01 -2.5013830775771112e+01 -2.5070610790396586e+01 -2.5127435917366029e+01 -2.5184306019355063e+01
-2.5241220958503845e+01 -2.5298180597080318e+01 -2.5355184797285347e+01 -2.5412233421340488e+01 -2.5469326331427965e+01
1.0000000000000000e+01 1.0801534951171448e+01 1.0617375158244670e+01 1.0436688151228793e+01 1.0259403283230313e+01
1.0085451405601304e+01 9.9147648356938589e+00 9.7472773253084029e+00 9.5829240298195373e+00 9.4216414779654656e+00
9.2633675422888473e+00 9.1080414102110012e+00 8.9556035557302494e+00 8.8059957117284853e+00 8.6591608428743143e+00
8.5150431191084976e+00 8.3735878897014118e+00 8.2347416578681987e+00 8.0984520559319435e+00 7.9646678210201571e+00
7.8333387712866624e+00 7.7044157826449009e+00 7.5778507660022569e+00 7.4535966449878401e+00 7.3316073341564731e+00
7.2118377176659578e+00 7.0942436284134374e+00 6.9787818276207929e+00 6.8654099848621115e+00 6.7540866585212882e+00
6.6447712766712357e+00 6.5374241183666584e+00 6.4320062953403578e+00 6.3284797340946000e+00 6.2268071583795574e+00
6.1269520720505000e+00 6.0288787422946655e+00 5.9325521832211621e+00 5.8379381398054591e+00 5.7450030721804524e+00
5.6537141402680220e+00 5.5640391887418730e+00 5.4759467323160322e+00 5.3894059413519244e+00 5.3043866277758980e+00
5.2208592313018016e+00 5.1387948059520454e+00 5.0581650068698707e+00 4.9789420774166615e+00 4.9010988365496075e+00
4.8246086664712777e+00 4.7494455005478358e+00 4.6755838114879396e+00 4.6029985997776066e+00 4.5316653823665547e+00
4.4615601815980312e+00 4.3926595143797726e+00 4.3249403815888456e+00 4.2583802577058805e+00 4.1929570806747449e+00
4.1286492419807814e+00 4.0654355769448500e+00 4.0032953552278059e+00 3.9422082715398403e+00 3.8821544365521561e+00
3.8231143680053350e+00 3.7650689820101348e+00 3.7079995845373759e+00 3.6518878630917868e+00 3.5967158785670392e+00
3.5424660572764992e+00 3.4891211831576925e+00 3.4366643901451397e+00 3.3850791547089756e+00 3.3343492885547761e+00
3.2844589314827459e+00 3.2353925444006251e+00 3.1871349024889781e+00 3.1396710885139782e+00 3.0929864862859660e+00
3.0470667742591075e+00 3.0018979192706325e+00 2.9574661704151453e+00 2.9137580530522627e+00 2.8707603629438552e+00
2.8284601605189152e+00 2.7868447652620318e+00 2.7459017502243626e+00 2.7056189366531243e+00 2.6659843887374848e+00
2.6269864084689516e+00 2.5886135306124487e+00 2.5508545177868598e+00 2.5136983556521244e+00 2.4771342482006986e+00
2.4411516131510069e+00 2.4057400774406830e+00 2.3708894728175807e+00 2.3365898315265383e+00 2.3028313820887689e+00
2.2696045451740474e+00 2.2368999295609058e+00 2.2047083281853901e+00 2.1730207142748128e+00 2.1418282375653348e+00
2.1111222206016862e+00 2.0808941551166384e+00 2.0511356984892615e+00 2.0218386702793651e+00 1.9929950488372441e+00
1.9645969679867363e+00 1.9366367137799969e+00 1.9091067213223525e+00 1.8819995716660998e+00 1.8553079887710169e+00
1.8290248365311754e+00 1.8031431158652609e+00 1.7776559618705363e+00 1.7525566410377422e+00 1.7278385485262007e+00
1.7034952054980579e+00 1.6795202565098251e+00 1.6559074669601728e+00 1.6326507205929630e+00 1.6097440170540054e+00
1.5871814695006066e+00 1.5649573022624637e+00 1.5430658485530984e+00 1.5215015482308161e+00 1.5002589456071576e+00
1.4793326873036463e+00 1.4587175201534635e+00 1.4384082891492156e+00 1.4183999354343300e+00 1.3986874943378140e+00
1.3792660934511431e+00 1.3601309507466510e+00 1.3412773727360872e+00 1.3227007526689576e+00 1.3043965687692420e+00
1.2863603825102174e+00 1.2685878369261090e+00 1.2510746549598935e+00 1.2338166378466084e+00 1.2168096635312082e+00
1.2000496851203266e+00 1.1835327293670588e+00 1.1672548951882362e+00 1.1512123522134416e+00 1.1354013393647548e+00
1.1198181634671940e+00 1.1044591978884952e+00 1.0893208812080033e+00 1.0743997159140335e+00 1.0596922671287743e+00
1.0451951613605601e+00 1.0309050852825337e+00 1.0168187845373140e+00 1.0029330625671378e+00 9.8924477946872713e-01
9.7575085087259694e-01 9.6244824684604424e-01 9.4933399081931213e-01 9.3640515853477169e-01 9.2365887701803118e-01
9.1109232357100112e-01 8.9870272478628266e-01 8.8648735558209424e-01 8.7444353825798160e-01 8.6256864157006774e-01
8.5086007982605949e-01 8.3931531199913678e-01 8.2793184086057892e-01 8.1670721213066955e-01 8.0563901364725510e-01
7.9472487455206675e-01 7.8396246449372953e-01 7.7334949284779597e-01 7.6288370795296245e-01 7.5256289636327622e-01
7.4238488211596021e-01 7.3234752601463171e-01 7.2244872492728618e-01 7.1268641109915265e-01 7.0305855147956464e-01
6.9356314706317335e-01 6.8419823224459719e-01 6.7496187418651843e-01 6.6585217220099224e-01 6.5686725714346750e-01
6.4800529081937697e-01 6.3926446540306614e-01 6.3064300286859520e-01 6.2213915443241774e-01 6.1375120000748140e-01
6.0547744766850542e-01 5.9731623312840654e-01 5.8926591922531912e-01 5.8132489542033028e-01 5.7349157730523359e-01
5.6576440612064971e-01 5.5814184828379609e-01 5.5062239492602316e-01 5.4320456143964790e-01 5.3588688703414888e-01
5.2866793430138515e-01 5.2154628878946241e-01 5.1452055858552015e-01 5.0758937390678227e-01 5.0075138669987496e-01
4.9400527024841523e-01 4.8734971878830358e-01 4.8078344713093557e-01 4.7430519029390972e-01 4.6791370313911962e-01
4.6160776001828552e-01 4.5538615442535857e-01 4.4924769865602876e-01 4.4319122347399365e-01 4.3721557778390086e-01
4.3131962831075654e-01 4.2550225928575891e-01 4.1976237213834899e-01 4.1409888519439697e-01 4.0851073338028954e-01
4.0299686793291478e-01 3.9755625611540779e-01 3.9218788093843493e-01 3.8689074088692443e-01 3.8166384965228239e-01
3.7650623586976018e-01 3.7141694286095728e-01 3.6639502838144544e-01 3.6143956437320846e-01 3.5654963672189943e-01
3.5172434501901328e-01 3.4696280232829579e-01 3.4226413495707497e-01 3.3762748223177219e-01 3.3305199627774762e-01
3.2853684180349596e-01 3.2408119588894380e-01 3.1968424777773841e-01 3.1534519867361155e-01 3.1106326154055530e-01
3.0683766090688813e-01 3.0266763267296426e-01 2.9855242392259740e-01 2.9449129273803010e-01 2.9048350801842027e-01
2.8652834930171167e-01 2.8262510658997009e-01 2.7877308017785829e-01 2.7497158048439907e-01 2.7121992788793392e-01
2.6751745256412462e-01 2.6386349432690004e-01 2.6025740247248841e-01 2.5669853562631850e-01 2.5318626159266877e-01
2.4971995720718354e-01 2.4629900819206618e-01 2.4292280901402563e-01 2.3959076274464408e-01 2.3630228092351846e-01
2.3305678342376535e-01 2.2985369832002167e-01 2.2669246175884616e-01 2.2357251783148069e-01 2.2049331844890929e-01
2.1745432321916880e-01 2.1445499932688783e-01 2.1149482141498144e-01 2.0857327146848004e-01 2.0568983870040114e-01
2.0284401943976604e-01 2.0003531702142130e-01 1.9726324167804599e-01 1.9452731043391402e-01 1.9182704700056608e-01
1.8916198167437770e-01 1.8653165123588344e-01 1.8393559885088084e-01 1.8137337397327791e-01 1.7884453224959973e-01
1.7634863542523593e-01 1.7388525125224241e-01 1.7145395339876757e-01 1.6905432136008169e-01 1.6668594037109052e-01
1.6434840132036665e-01 1.6204130066570688e-01 1.5976424035106618e-01 1.5751682772493769e-01 1.5529867546015819e-01
1.5310940147503249e-01 1.5094862885580707e-01 1.4881598578045718e-01 1.4671110544379484e-01 1.4463362598375351e-01
1.4258319040899092e-01 1.4055944652768915e-01 1.3856204687748974e-01 1.3659064865666881e-01 1.3464491365640630e-01
1.3272450819420012e-01 1.3082910304837103e-01 1.2895837339364213e-01 1.2711199873781265e-01 1.2528966285941134e-01
1.2349105374641756e-01 1.2171586353596986e-01 1.1996378845505173e-01 1.1823452876211782e-01 1.1652778868972380e-01
1.1484327638801961e-01 1.1318070386919254e-01 1.1153978695277944e-01 1.0992024521187505e-01 1.0832180192018548e-01
1.0674418399992769e-01 1.0518712197055757e-01 1.0365034989832456e-01 1.0213360534659532e-01 1.0063662932698936e-01
9.9159166251264974e-02 9.7700963883974534e-02 9.6261773295835962e-02 9.4841348817873428e-02 9.3439447996227276e-02
9.2055831547688260e-02 9.0690263315935660e-02 8.9342510228411331e-02 8.8012342253891429e-02 8.6699532360706044e-02
8.5403856475584128e-02 8.4125093443141896e-02 8.2863024985984080e-02 8.1617435665412685e-02 8.0388112842733062e-02
7.9174846641143493e-02 7.7977429908209661e-02 7.6795658178889781e-02 7.5629329639115728e-02 7.4478245089953710e-02
7.3342207912248103e-02 7.2221024031827064e-02 7.1114501885225945e-02 7.0022452385910761e-02 6.8944688890991479e-02
6.7881027168450458e-02 6.6831285364849169e-02 6.5795283973477225e-02 6.4772845803028556e-02 6.3763795946680801e-02
6.2767961751651669e-02 6.1785172789201148e-02 6.0815260825057393e-02 5.9858059790287577e-02 5.8913405752569759e-02
5.7981136887894191e-02 5.7061093452682510e-02 5.6153117756271964e-02 5.5257054133826422e-02 5.4372748919636837e-02
5.3500050420772105e-02 5.2638808891131372e-02 5.1788876505864945e-02 5.0950107336147354e-02 5.0122357324306366e-02
4.9305484259319243e-02 4.8499347752635869e-02 4.7703809214351578e-02 4.6918731829721727e-02 4.6143980535982010e-02
4.5379421999521163e-02 4.4624924593352100e-02 4.3880358374905226e-02 4.3145595064128850e-02 4.2420508021892900e-02
4.1704972228691739e-02 4.0998864263647405e-02 4.0302062283785300e-02 3.9614446003616965e-02 3.8935896674993531e-02
3.8266297067221844e-02 3.7605531447481688e-02 3.6953485561492139e-02 3.6310046614435487e-02 3.5675103252157392e-02
3.5048545542616605e-02 3.4430264957581835e-02 3.3820154354582632e-02 3.3218107959093635e-02 3.2624021346983278e-02
3.2037791427166340e-02 3.1459316424514716e-02 3.0888495862994469e-02 3.0325230549015147e-02 2.9769422555015357e-02
2.9220975203265720e-02 2.8679793049885216e-02 2.8145781869070463e-02 2.7618848637539717e-02 2.7098901519172047e-02
2.6585849849867671e-02 2.6079604122596356e-02 2.5580075972643668e-02 2.5087178163056167e-02 2.4600824570288671e-02
2.4120930170012267e-02 2.3647411023137499e-02 2.3180184262011627e-02 2.2719168076792418e-02 2.2264281702001121e-02
2.1815445403263078e-02 2.1372580464206647e-02 2.0935609173537761e-02 2.0504454812290795e-02 2.0079041641240414e-02
1.9659294888467183e-02 1.9245140737102040e-02 1.8836506313223755e-02 1.8433319673904158e-02 1.8035509795416238e-02
1.7643006561603891e-02 1.7255740752380899e-02 1.6873644032391555e-02 1.6496648939823388e-02 1.6124688875347792e-02
1.5757698091213634e-02 1.5395611680482646e-02 1.5038365566394485e-02 1.4685896491875350e-02 1.4338142009180710e-02
1.3995040469664266e-02 1.3656531013687800e-02 1.3322553560652262e-02 1.2993048799157525e-02 1.2667958177290606e-02
1.2347223893038994e-02 1.2030788884814458e-02 1.1718596822117511e-02 1.1410592096299910e-02 1.1106719811460941e-02
1.0806925775450060e-02 1.0511156490982998e-02 1.0219359146882878e-02 9.9314816094114855e-03 9.6474724137328716e-03
9.3672807554677773e-03 9.0908564823645177e-03 8.8181500860711193e-03 8.5491126940134832e-03 8.2836960613733579e-03
8.0218525631707838e-03 7.7635351864465685e-03 7.5086975225370223e-03 7.2572937594544973e-03 7.0092786743605195e-03
6.7646076261301813e-03 6.5232365480138998e-03 6.2851219403949887e-03 6.0502208636273869e-03 5.8184909309735300e-03
5.5898903016277091e-03 5.3643776738254711e-03 5.1419122780385074e-03 4.9224538702609122e-03 4.7059627253757674e-03
4.4923996305976099e-03 4.2817258790122659e-03 4.0739032631877392e-03 3.8688940688609841e-03 3.6666610687164924e-03
3.4671675162341598e-03 3.2703771396105918e-03 3.0762541357672313e-03 2.8847631644254856e-03 2.6958693422570179e-03
2.5095382371091990e-03 2.3257358623008373e-03 2.1444286709895732e-03 1.9655835506104946e-03 1.7891678173820869e-03
1.6151492108847365e-03 1.4434958887007410e-03 1.2741764211267048e-03 1.1071597859496629e-03 9.4241536328815156e-04
7.7991293049733956e-04 6.1962265713921827e-04 4.6151510001329887e-04 3.0556119825198014e-04 1.5173226847375876e-04
0. 0. 0. 0. 0.
0. 5.4383329664155645e-05 9.3944898415945083e-04 4.3251847212615047e-03 1.2334244035325348e-02
2.7137722173468548e-02 5.0697119791449641e-02 8.4607638668976470e-02 1.3001641279549414e-01 1.8759487452762702e-01
2.5754900895683441e-01 3.3965493779430744e-01 4.3331024634064264e-01 5.3759384878832961e-01 6.5132908316254046e-01
7.7314622535699939e-01 9.0154178511424377e-01 1.0349328562818201e+00 1.1717054897399350e+00 1.3102565818166738e+00
1.4490291582473986e+00 1.5865412121263560e+00 1.7214084470448441e+00 1.8523614026473965e+00 1.9782575145276269e+00
2.0980886961566938e+00 2.2109850373516764e+00 2.3162151996095730e+00 2.4131840597491703e+00 2.5014281146549706e+00
2.5806091153285706e+00 2.6505063508648590e+00 2.7110079545661563e+00 2.7621015568249447e+00 2.8038645637913220e+00
2.8364542979766156e+00 2.8600981973448825e+00 2.8750842333755031e+00 2.8817516761559574e+00 2.8804823057701157e+00
2.8716921439699092e+00 2.8558237581894161e+00 2.8333391711552594e+00 2.8047133934346959e+00 2.7704285829676252e+00
2.7309688247181469e+00 2.6868155147671331e+00 2.6384433262347358e+00 2.5863167291097398e+00 2.5308870321738226e+00
2.4725899125317596e+00 2.4118433966060167e+00 2.3490462556752334e+00 2.2845767789603002e+00 2.2187918877813502e+00
2.1520265552815943e+00 2.0845934975626363e+00 2.0167831036919637e+00 1.9488635738636404e+00 1.8810812369508270e+00
1.8136610207193371e+00 1.7468070500507196e+00 1.6807033505858371e+00 1.6155146372447149e+00 1.5513871690559142e+00
1.4884496536383409e+00 1.4268141864958608e+00 1.3665772120042590e+00 1.3078204945836447e+00 1.2506120900523854e+00
1.1950073085502879e+00 1.1410496616995687e+00 1.0887717878420631e+00 1.0381963502565981e+00 9.8933690422003551e-01
9.4219872964247031e-01 8.9677962677415124e-01 8.5307067316958651e-01 8.1105694069385592e-01 7.7071817188505065e-01
7.3202941544290212e-01 6.9496162100761794e-01 6.5948219372701189e-01 6.2555550939233484e-01 5.9314339115629977e-01
5.6220554903693554e-01 5.3269998356387660e-01 5.0458335504023211e-01 4.7781131998032222e-01 4.5233883634534777e-01
4.2812043923464138e-01 4.0511048870905242e-01 3.8326339142174781e-01 3.6253379771729577e-01 3.4287677583286325e-01
3.2424796479760154e-01 3.0660370758054967e-01 2.8990116598452254e-01 2.7409841872609064e-01 2.5915454407883409e-01
2.4502968839369110e-01 2.3168512174254197e-01 2.1908328186436687e-01 2.0718780752542632e-01 1.9596356233750800e-01
1.8537665001230508e-01 1.7539442196444632e-01 1.6598547811304609e-01 1.5711966166996927e-01 1.4876804864444715e-01
1.4090293273673637e-01 1.3349780623990259e-01 1.2652733751724909e-01 1.1996734557434463e-01 1.1379477219856060e-01
1.0798765209582406e-01 1.0252508141368288e-01 9.7387185001678311e-02 9.2555082724584015e-02 8.8010855111109620e-02
8.3737508589961873e-02 7.9718940536826377e-02 7.5939904329596963e-02 7.2385974585237101e-02 6.9043512729294765e-02
6.5899633029043336e-02 6.2942169202580001e-02 6.0159641699440547e-02 5.7541225732930634e-02 5.5076720130546430e-02
5.2756517056398833e-02 5.0571572648238083e-02 4.8513378601664936e-02 4.6573934725081756e-02 4.4745722480991068e-02
4.3021679522073253e-02 4.1395175224364866e-02 3.9859987214311721e-02 3.8410278881708670e-02 3.7040577866510604e-02
3.5745755503880039e-02 3.4521007208912380e-02 3.3361833779917971e-02 3.2264023597108116e-02 3.1223635691821294e-02
3.0236983660070216e-02 2.9300620393215571e-02 2.8411323597772320e-02 2.7566082075896281e-02 2.6762082737777249e-02
2.5996698317105604e-02 2.5267475760840985e-02 2.4572125264713973e-02 2.3908509926274246e-02 2.3274635987705516e-02
2.2668643641204911e-02 2.2088798370316409e-02 2.1533482801290083e-02 2.1001189039288493e-02 2.0490511464994254e-02
2.0000139967999431e-02 1.9528853594166895e-02 1.9075514584991349e-02 1.8639062787818239e-02 1.8218510416650235e-02
1.7812937144080498e-02 1.7421485505751177e-02 1.7043356599549031e-02 1.6677806062561751e-02 1.6324140309613155e-02
1.5981713017976018e-02 1.5649921843605585e-02 1.5328205354974755e-02 1.5016040171312250e-02 1.4712938292708366e-02
1.4418444610242331e-02 1.4132134584901757e-02 1.3853612084676337e-02 1.3582507369821917e-02 1.3318475216818060e-02
1.3061193172097418e-02 1.2810359927147186e-02 1.2565693807050415e-02 1.2326931365025051e-02 1.2093826075940506e-02
1.1866147122233661e-02 1.1643678266026136e-02 1.1426216801644407e-02 1.1213572583084475e-02 1.1005567121320226e-02
1.0802032746662471e-02 1.0602811831688208e-02 1.0407756070544782e-02 1.0216725810699157e-02 1.0029589433467268e-02
9.8462227798860602e-03 9.6665086187306404e-03 9.4903361536790021e-03 9.3176005668363371e-03 9.1482025960089031e-03
8.9820481433065535e-03 8.8190479128032462e-03 8.6591170751522117e-03 8.5021749571883021e-03 8.3481447546937537e-03
8.1969532666261724e-03 8.0485306492223962e-03 7.9028101885199598e-03 7.7597280899136256e-03 7.6192232834934315e-03
7.4812372439735375e-03 7.3457138241272979e-03 7.2125991007052359e-03 7.0818412319012813e-03 6.9533903254870300e-03
6.8271983168139705e-03 6.7032188559211503e-03 6.5814072030662141e-03 6.4617201320263939e-03 6.3441158405819764e-03
6.2285538676237207e-03 6.1149950163802147e-03 6.0034012832899109e-03 5.8937357920846312e-03 5.7859627326801166e-03
5.6800473044990030e-03 5.5759556638887986e-03 5.4736548753111791e-03 5.3731128660109428e-03 5.2742983838981461e-03
5.1771809583849582e-03 5.0817308639591330e-03 4.9879190862693046e-03 4.8957172905357560e-03 4.8050977921015592e-03
4.7160335289582467e-03 4.6284980360953021e-03 4.5424654215287241e-03 4.4579103438822931e-03 4.3748079913988880e-03
4.2931340622749670e-03 4.2128647462132407e-03 4.1339767071033873e-03 4.0564470667446839e-03 3.9802533895282599e-03
3.9053736680121076e-03 3.8317863093158128e-03 3.7594701222811860e-03 3.6884043053326127e-03 3.6185684349951674e-03
3.5499424550168301e-03 3.4825066660512660e-03 3.4162417158645347e-03 3.3511285900229004e-03 3.2871486030347646e-03
3.2242833899080170e-03 3.1625148980992668e-03 3.1018253798278661e-03 3.0421973847258310e-03 2.9836137528083811e-03
2.9260576077371064e-03 2.8695123503632708e-03 2.8139616525287708e-03 2.7593894511106498e-03 2.7057799422959966e-03
2.6531175760685227e-03 2.6013870509009052e-03 2.5505733086344240e-03 2.5006615295404683e-03 2.4516371275501436e-03
2.4034857456453340e-03 2.3561932514012535e-03 2.3097457326723414e-03 2.2641294934160616e-03 2.2193310496436136e-03
2.1753371254977782e-03 2.1321346494441173e-03 2.0897107505768314e-03 2.0480527550303662e-03 2.0071481824917164e-03
1.9669847428123305e-03 1.9275503327108034e-03 1.8888330325659355e-03 1.8508211032951805e-03 1.8135029833145980e-03
1.7768672855772646e-03 1.7409027946878666e-03 1.7055984640891586e-03 1.6709434133182904e-03 1.6369269253308227e-03
1.6035384438881917e-03 1.5707675710093030e-03 1.5386040644797400e-03 1.5070378354209296e-03 1.4760589459142243e-03
1.4456576066784674e-03 1.4158241748004133e-03 1.3865491515145517e-03 1.3578231800324136e-03 1.3296370434173130e-03
1.3019816625059188e-03 1.2748480938728074e-03 1.2482275278369870e-03 1.2221112865106742e-03 1.1964908218862064e-03
1.1713577139624703e-03 1.1467036689077198e-03 1.1225205172586891e-03 1.0988002121543120e-03 1.0755348276031765e-03
1.0527165567835728e-03 1.0303377103750150e-03 1.0083907149206553e-03 9.8686811121878604e-04 9.6576255274356815e-04
9.4506680409354657e-04 9.2477373946662708e-04 9.0487634116191706e-04 8.8536769810608137e-04 8.6624100440530968e-04
8.4748955791986991e-04 8.2910675886310736e-04 8.1108610842155551e-04 7.9342120739794852e-04 7.7610575487466887e-04
7.5913354689786591e-04 7.4249847518158968e-04 7.2619452583109687e-04 7.1021577808524222e-04 6.9455640307671332e-04
6.7921066261025093e-04 6.6417290795844214e-04 6.4943757867335500e-04 6.3499920141575628e-04 6.2085238879914031e-04
6.0699183824991856e-04 5.9341233088238896e-04 5.8010873038847818e-04 5.6707598194186137e-04 5.5430911111587280e-04
5.4180322281523891e-04 5.2955350022104025e-04 5.1755520374872563e-04 5.0580367001857793e-04 4.9429431083891986e-04
4.8302261220136561e-04 4.7198413328763435e-04 4.6117450548847222e-04 4.5058943143359842e-04 4.4022468403297037e-04
4.3007610552883886e-04 4.2013960655883260e-04 4.1041116522908330e-04 4.0088682619821882e-04 3.9156269977118005e-04
3.8243496100300207e-04 3.7349984881274514e-04 3.6475366510662147e-04 3.5619277391102898e-04 3.4781360051482253e-04
3.3961263062063513e-04 3.3158640950565685e-04 3.2373154119109092e-04 3.1604468762060252e-04 3.0852256784754707e-04
3.0116195723081836e-04 2.9395968663908575e-04 2.8691264166377101e-04 2.8001776184017647e-04 2.7327203987681688e-04
2.6667252089326854e-04 2.6021630166557681e-04 2.5390052988028163e-04 2.4772240339593181e-04 2.4167916951265550e-04
2.3576812424967210e-04 2.2998661163024531e-04 2.2433202297460642e-04 2.1880179620031078e-04 2.1339341513026532e-04
2.0810440880823181e-04 2.0293235082175821e-04 1.9787485863260665e-04 1.9292959291436311e-04 1.8809425689761319e-04
1.8336659572205580e-04 1.7874439579616125e-04 1.7422548416372047e-04 1.6980772787763936e-04 1.6548903338088530e-04
1.6126734589430591e-04 1.5714064881157744e-04 1.5310696310104604e-04 1.4916434671449329e-04 1.4531089400280153e-04
1.4154473513841234e-04 1.3786403554466153e-04 1.3426699533172857e-04 1.3075184873951283e-04 1.2731686358694039e-04
1.2396034072819674e-04 1.2068061351527565e-04 1.1747604726729168e-04 1.1434503874632306e-04 1.1128601563955686e-04
1.0829743604811193e-04 1.0537778798212988e-04 1.0252558886227753e-04 9.9739385027582898e-05 9.7017751249615057e-05
9.4359290252773662e-05 9.1762632240957511e-05 8.9226434430383569e-05 8.6749380588361721e-05 8.4330180578390864e-05
8.1967569911181246e-05 7.9660309301724484e-05 7.7407184232279429e-05 7.5207004521348451e-05 7.3058603898526649e-05
7.0960839585107720e-05 6.8912591880629977e-05 6.6912763755002085e-05 6.4960280446513426e-05 6.3054089065330086e-05
6.1193158202771814e-05 5.9376477546041213e-05 5.7603057498502742e-05 5.5871928805544500e-05 5.4182142185708361e-05
5.2532767967318744e-05 5.0922895730446966e-05 4.9351633954125953e-05 4.7818109668823321e-05 4.6321468114150300e-05
4.4860872401664663e-05 4.3435503182825573e-05 4.2044558321957873e-05 4.0687252574273750e-05 3.9362817268785450e-05
3.8070499996214428e-05 3.6809564301621984e-05 3.5579289382025496e-05 3.4378969788611451e-05 3.3207915133769052e-05
3.2065449802711312e-05 3.0950912669766876e-05 2.9863656819185611e-05 2.8803049270468119e-05 2.7768470708167169e-05
2.6759315216115260e-05 2.5774990015931323e-05 2.4814915209964844e-05 2.3878523528387922e-05 2.2965260080560611e-05
2.2074582110528148e-05 2.1205958756658535e-05 2.0358870815317476e-05 1.9532810508535560e-05 1.8727281255713447e-05
1.7941797449145505e-05 1.7175884233475961e-05 1.6429077288930018e-05 1.5700922618341645e-05 1.4990976337865471e-05
1.4298804471386687e-05 1.3623982748522034e-05 1.2966096406226424e-05 1.2324739993882115e-05 1.1699517181902770e-05
1.1090040573734860e-05 1.0495931521266495e-05 9.9168199435395021e-06 9.3523441487842465e-06 8.8021506596591475e-06
8.2658940417265321e-06 7.7432367350197678e-06 7.2338488887770244e-06 6.7374081991923703e-06 6.2535997501888662e-06
5.7821158571569505e-06 5.3226559136389283e-06 4.8749262408651290e-06 4.4386399401326240e-06 4.0135167480073166e-06
3.5992828942305738e-06 3.1956709623667747e-06 2.8024197531120341e-06 2.4192741502208947e-06 2.0459849890155880e-06
1.6823089274468580e-06 1.3280083196495871e-06 9.8285109196557868e-07 6.4661062138351467e-07 3.1906561636122974e-07
0. 0. 0. 0. 0.

View File

@@ -0,0 +1,303 @@
Cu functions (universal 4) - JB Adams et al J. Mater. Res., 4(1), 102 (1989)
29 63.550 3.6150 FCC
500 5.0100200400801306e-04 500 1.0000000000000009e-02 4.9499999999999886e+00
0. -3.1589719908208558e-01 -5.2405175291223927e-01 -6.9885553834123115e-01 -8.5420409172727574e-01
-9.9627285782417374e-01 -1.1284756487892835e+00 -1.2529454148045645e+00 -1.3711252149943363e+00 -1.4840478277127076e+00
-1.5924840805403662e+00 -1.6954285804552853e+00 -1.7942937845174001e+00 -1.8901318213864968e+00 -1.9832501645057476e+00
-2.0739063371790252e+00 -2.1623187777983759e+00 -2.2486748239473968e+00 -2.3331367007241965e+00 -2.4158460932269890e+00
-2.4969276936450484e+00 -2.5764919917168783e+00 -2.6546374977553171e+00 -2.7314525337618534e+00 -2.8070166913917660e+00
-2.8814020298474361e+00 -2.9546740684873072e+00 -3.0268926157701515e+00 -3.0981124665488835e+00 -3.1683839923973949e+00
-3.2377536446699082e+00 -3.3062643854300688e+00 -3.3739560586949437e+00 -3.4408657118035535e+00 -3.5070278749074930e+00
-3.5724748051301987e+00 -3.6372367006631805e+00 -3.7013418893374563e+00 -3.7648169952241943e+00 -3.8276870863304424e+00
-3.8899758061050136e+00 -3.9517054906525857e+00 -4.0128972737054482e+00 -4.0735711808432313e+00 -4.1324020819066334e+00
-4.1903921077370399e+00 -4.2479051536742531e+00 -4.3049572584920952e+00 -4.3615637243846948e+00 -4.4177391662932166e+00
-4.4734975570518145e+00 -4.5288522686579995e+00 -4.5838161101054880e+00 -4.6384013621277234e+00 -4.6926198091528875e+00
-4.7464827687459206e+00 -4.8000011187718314e+00 -4.8531853225280486e+00 -4.9060454519053280e+00 -4.9585912090057604e+00
-5.0108319460781274e+00 -5.0627766840981678e+00 -5.1144341300432075e+00 -5.1658126929883963e+00 -5.2169204991179470e+00
-5.2677654057399081e+00 -5.3183550143895957e+00 -5.3686966830900360e+00 -5.4187975378393958e+00 -5.4686644833797118e+00
-5.5183042133078573e+00 -5.5677232195759814e+00 -5.6169278014231168e+00 -5.6659240737851633e+00 -5.7147179752168427e+00
-5.7633152753617765e+00 -5.8117215820040258e+00 -5.8599423477251662e+00 -5.9079828761981332e+00 -5.9558483281427925e+00
-6.0035437269616239e+00 -6.0510739641062798e+00 -6.0984438040355258e+00 -6.1456578891786364e+00 -6.1927207443901580e+00
-6.2396367812953599e+00 -6.2864103024253382e+00 -6.3330455051271599e+00 -6.3795464852936732e+00 -6.4259172409138614e+00
-6.4721616754587501e+00 -6.5182836011053098e+00 -6.5642867418169999e+00 -6.6101747362826870e+00 -6.6559511407215268e+00
-6.7016194315664848e+00 -6.7471830080289408e+00 -6.7926321787266488e+00 -6.8376565597087335e+00 -6.8825828851093718e+00
-6.9274142262937062e+00 -6.9721535889762833e+00 -7.0168039151812138e+00 -7.0613680851242009e+00 -7.1058489190182854e+00
-7.1502491788078260e+00 -7.1945715698338404e+00 -7.2388187424386103e+00 -7.2829932935017325e+00 -7.3270977679243288e+00
-7.3711346600521210e+00 -7.4151064150498769e+00 -7.4590154302223652e+00 -7.5028640562861142e+00 -7.5466545985988489e+00
-7.5903893183397599e+00 -7.6340704336532212e+00 -7.6777001207475166e+00 -7.7212805149568453e+00 -7.7648137117691078e+00
-7.8083017678126225e+00 -7.8517467018166371e+00 -7.8951504955327323e+00 -7.9385150946301337e+00 -7.9818424095582259e+00
-8.0251343163853335e+00 -8.0683926576019758e+00 -8.1116192429086027e+00 -8.1548158499697934e+00 -8.1979842251487867e+00
-8.2411260842174556e+00 -8.2842431130446244e+00 -8.3273369682627845e+00 -8.3704092779143480e+00 -8.4134616420805628e+00
-8.4564956334858721e+00 -8.4995127980902225e+00 -8.5425146556591471e+00 -8.5855027003218538e+00 -8.6284784011075430e+00
-8.6714432024708685e+00 -8.7143985247980709e+00 -8.7573457649043576e+00 -8.8002862965079203e+00 -8.8432214707042931e+00
-8.8861526164113798e+00 -8.9290810408153902e+00 -8.9720080297988716e+00 -9.0149348483554377e+00 -9.0578627409975070e+00
-9.1007929321486927e+00 -9.1437266265307358e+00 -9.1866650095359432e+00 -9.2296092475897353e+00 -9.2725604885091570e+00
-9.3155198618426311e+00 -9.3584884792123262e+00 -9.4014674346366292e+00 -9.4444578048540961e+00 -9.4874606496305205e+00
-9.5304770120671378e+00 -9.5735079188909253e+00 -9.6165543807549625e+00 -9.6596173924971254e+00 -9.7026979334374914e+00
-9.7457969676344760e+00 -9.7889154441094774e+00 -9.8320542972711564e+00 -9.8749024758928954e+00 -9.9176676449848173e+00
-9.9604518241702067e+00 -1.0003255855809869e+01 -1.0046080568707225e+01 -1.0088926778514065e+01 -1.0131795287896693e+01
-1.0174686886752681e+01 -1.0217602352416293e+01 -1.0260542449857894e+01 -1.0303507931886486e+01 -1.0346499539340471e+01
-1.0389518001277565e+01 -1.0432564035159942e+01 -1.0475638347037830e+01 -1.0518741631724708e+01 -1.0561874572971817e+01
-1.0605037843637035e+01 -1.0648232105854731e+01 -1.0691458011195323e+01 -1.0734716200826654e+01 -1.0778007305671338e+01
-1.0821331946557279e+01 -1.0864690734371720e+01 -1.0908084270204256e+01 -1.0951513145493493e+01 -1.0994977942169044e+01
-1.1038479232788461e+01 -1.1082017580672300e+01 -1.1125593540039802e+01 -1.1169207656138724e+01 -1.1212860465371193e+01
-1.1256552495422056e+01 -1.1300284265381379e+01 -1.1344056285864497e+01 -1.1387869059131503e+01 -1.1431723079201220e+01
-1.1475618831971758e+01 -1.1519556795323240e+01 -1.1563537439236882e+01 -1.1607561225897086e+01 -1.1651628609799957e+01
-1.1695740037856638e+01 -1.1739895949496542e+01 -1.1784096776765693e+01 -1.1828342944444898e+01 -1.1872634870038326e+01
-1.1916972964140655e+01 -1.1961357630185660e+01 -1.2005789264757027e+01 -1.2050268257623998e+01 -1.2094794991829531e+01
-1.2139369843773920e+01 -1.2183993183309894e+01 -1.2228665373815033e+01 -1.2273386772283743e+01 -1.2318157729378811e+01
-1.2362978589648264e+01 -1.2407849691329261e+01 -1.2452771366701370e+01 -1.2497743942026659e+01 -1.2542767737650308e+01
-1.2587843068072686e+01 -1.2632970242029501e+01 -1.2678149562554552e+01 -1.2723381327055449e+01 -1.2768665827383359e+01
-1.2814003349896723e+01 -1.2859394175532202e+01 -1.2904838579871580e+01 -1.2950336833201561e+01 -1.2995889200586475e+01
-1.3041495941923358e+01 -1.3087157312007946e+01 -1.3132873560597147e+01 -1.3178644932465090e+01 -1.3224471667467185e+01
-1.3270354000597138e+01 -1.3316292162042373e+01 -1.3362286377246335e+01 -1.3408336866957768e+01 -1.3454443847292225e+01
-1.3500607529781746e+01 -1.3546828121432384e+01 -1.3593105824775080e+01 -1.3639440837916936e+01 -1.3685833354596070e+01
-1.3732283564231011e+01 -1.3778791651965662e+01 -1.3825357798727850e+01 -1.3871982181271051e+01 -1.3918664972226225e+01
-1.3965406340150423e+01 -1.4012206449566122e+01 -1.4059065461010562e+01 -1.4105983531084178e+01 -1.4152960812497383e+01
-1.4199997454109450e+01 -1.4247093600900882e+01 -1.4294249394311976e+01 -1.4341464971842868e+01 -1.4388740467389482e+01
-1.4436076011212833e+01 -1.4483471729977452e+01 -1.4530927746798284e+01 -1.4578444181270356e+01 -1.4626021149517328e+01
-1.4673658764226502e+01 -1.4721357134688901e+01 -1.4769116366835874e+01 -1.4816936563275590e+01 -1.4864817823335784e+01
-1.4912760243093658e+01 -1.4960763915414873e+01 -1.5008828929991182e+01 -1.5056955373373171e+01 -1.5105143329006637e+01
-1.5153392877265333e+01 -1.5201704095489163e+01 -1.5250077058012721e+01 -1.5298511836202465e+01 -1.5347008498487980e+01
-1.5395567110395177e+01 -1.5444187734579373e+01 -1.5492870430854509e+01 -1.5541615256228738e+01 -1.5590422264928975e+01
-1.5639291508440465e+01 -1.5688223035530768e+01 -1.5737216892280117e+01 -1.5786273122116540e+01 -1.5835391765839859e+01
-1.5884572861650554e+01 -1.5933816445184789e+01 -1.5983122549535665e+01 -1.6032491205288238e+01 -1.6081922440538847e+01
-1.6131416280932740e+01 -1.6180972749683292e+01 -1.6230591867602584e+01 -1.6280273653129598e+01 -1.6330018122352271e+01
-1.6379825289037512e+01 -1.6429695164657005e+01 -1.6479627758410516e+01 -1.6529623077253063e+01 -1.6579681125920047e+01
-1.6629801906948160e+01 -1.6679985420709272e+01 -1.6730231665424185e+01 -1.6780540637196850e+01 -1.6830912330026536e+01
-1.6881346735842499e+01 -1.6931843844523655e+01 -1.6982403643917451e+01 -1.7033026119869078e+01 -1.7083711256241486e+01
-1.7134459034938232e+01 -1.7185269435924170e+01 -1.7236142437252170e+01 -1.7287078015076759e+01 -1.7338076143685498e+01
-1.7389136795514560e+01 -1.7440259941169757e+01 -1.7491445549452351e+01 -1.7542693587371559e+01 -1.7594004020176158e+01
-1.7645376811364258e+01 -1.7696811922712527e+01 -1.7748309314288690e+01 -1.7799868944476657e+01 -1.7851490769996190e+01
-1.7903174745917113e+01 -1.7954920825684781e+01 -1.8006728961136105e+01 -1.8058599102520361e+01 -1.8110531198513968e+01
-1.8162525196246406e+01 -1.8214581041310112e+01 -1.8266698677789350e+01 -1.8318878048276588e+01 -1.8371119093861239e+01
-1.8423421754189917e+01 -1.8475785967356501e+01 -1.8528211670354381e+01 -1.8580698798442995e+01 -1.8633247285599964e+01
-1.8685857064432412e+01 -1.8738528066186518e+01 -1.8791260220768891e+01 -1.8844053456762026e+01 -1.8896907701446821e+01
-1.8949822880805868e+01 -1.9002798919552447e+01 -1.9055835741138708e+01 -1.9108933267775342e+01 -1.9162091420446473e+01
-1.9215310118921138e+01 -1.9268589281777849e+01 -1.9321928826411295e+01 -1.9375328669053715e+01 -1.9428788724780020e+01
-1.9482308907539277e+01 -1.9535889130155283e+01 -1.9589529304345319e+01 -1.9643229340734365e+01 -1.9696989148876355e+01
-1.9750808637254977e+01 -1.9804687713312433e+01 -1.9858626283450008e+01 -1.9912624253055810e+01 -1.9966681526506250e+01
-2.0020798007185476e+01 -2.0074973597498911e+01 -2.0129208198888136e+01 -2.0183501711838630e+01 -2.0237854035899204e+01
-2.0292265069692348e+01 -2.0346734710925716e+01 -2.0401262856409858e+01 -2.0455849402064473e+01 -2.0510494242935920e+01
-2.0565197273209719e+01 -2.0619958386219423e+01 -2.0674777474463212e+01 -2.0729654429611969e+01 -2.0784589142526556e+01
-2.0839581503263844e+01 -2.0894631401093307e+01 -2.0949738724508393e+01 -2.1004903361235051e+01 -2.1060125198247988e+01
-2.1115404121775100e+01 -2.1170740017318053e+01 -2.1226132769657397e+01 -2.1281582262894972e+01 -2.1337088380382852e+01
-2.1392651004661843e+01 -2.1448270018015251e+01 -2.1503945301662043e+01 -2.1559676736299366e+01 -2.1615464201984196e+01
-2.1671307578140954e+01 -2.1727206743568217e+01 -2.1783161576455427e+01 -2.1839171954393919e+01 -2.1895237754379082e+01
-2.1951358852829799e+01 -2.2007535125591289e+01 -2.2063766447950343e+01 -2.2120052694642595e+01 -2.2176393739860259e+01
-2.2232789457272474e+01 -2.2289239719961643e+01 -2.2345744400729018e+01 -2.2402303371517178e+01 -2.2458916504038370e+01
-2.2515583669430725e+01 -2.2572304738325670e+01 -2.2629079581086330e+01 -2.2685908067306855e+01 -2.2742790066346515e+01
-2.2799725447076526e+01 -2.2856714077931429e+01 -2.2913755826927172e+01 -2.2970850561669295e+01 -2.3027998149359064e+01
-2.3085198456800299e+01 -2.3142451350411534e+01 -2.3199756696229770e+01 -2.3257114359926845e+01 -2.3314524206806482e+01
-2.3371986101824632e+01 -2.3429499909581864e+01 -2.3487065494349963e+01 -2.3544682720213359e+01 -2.3602351450489550e+01
-2.3660071548650194e+01 -2.3717842877714475e+01 -2.3775665300345281e+01 -2.3833538678952209e+01 -2.3891462875653588e+01
-2.3949437752298763e+01 -2.4007463170460369e+01 -2.4065538991453877e+01 -2.4123665076338057e+01 -2.4181841285923610e+01
-2.4240067480782272e+01 -2.4298343521253173e+01 -2.4356669267446705e+01 -2.4415044579252253e+01 -2.4473469316351384e+01
-2.4531943338216706e+01 -2.4590466504118467e+01 -2.4649038673143195e+01 -2.4707659704179378e+01 -2.4766329455946106e+01
-2.4825047786983760e+01 -2.4883814555664912e+01 -2.4942629620207981e+01 -2.5001492838670174e+01 -2.5060404068966136e+01
-2.5119363168864538e+01 -2.5178369996001948e+01 -2.5237424407882145e+01 -2.5296526261886129e+01 -2.5355675415276437e+01
-2.5414871725205558e+01 -2.5474115048716612e+01 -2.5533405242759045e+01 -2.5592742164177480e+01 -2.5652125669732186e+01
-2.5711555616102487e+01 -2.5771031859886762e+01 -2.5830554257610174e+01 -2.5890122665731269e+01 -2.5949736940650155e+01
-2.6009396938703958e+01 -2.6069102516181829e+01 -2.6128853529326307e+01 -2.6188649834339685e+01 -2.6248491287389243e+01
-2.6308377744605878e+01 -2.6368309062100934e+01 -2.6428285095962565e+01 -2.6488305702088610e+01 -2.6548370736885317e+01
-2.6608480056235521e+01 -2.6668633516188947e+01 -2.6728830972768947e+01 -2.6789072282060033e+01 -2.6849357300140582e+01
1.0000000000000000e+01 1.0801250630455797e+01 1.0617301586939504e+01 1.0436833263885262e+01 1.0259774441010791e+01
1.0086055417347552e+01 9.9156079783837754e+00 9.7483653639170598e+00 9.5842622365983630e+00 9.4232346511569745e+00
9.2652200242883396e+00 9.1101571051919450e+00 8.9579859467472716e+00 8.8086478773091699e+00 8.6620854731154395e+00
8.5182425312888768e+00 8.3770640434238430e+00 8.2384961697429731e+00 8.1024862138128810e+00 7.9689825978078090e+00
7.8379348383064951e+00 7.7092935226140753e+00 7.5830102855955488e+00 7.4590377870116811e+00 7.3373296893445286e+00
7.2178406361032330e+00 7.1005262306008490e+00 6.9853430151894997e+00 6.8722484509459889e+00 6.7612008977976643e+00
6.6521595950793255e+00 6.5450846425111706e+00 6.4399369815891703e+00 6.3366783773799114e+00 6.2352714007088537e+00
6.1356794107365431e+00 6.0378665379119241e+00 5.9417976672963562e+00 5.8474384222482740e+00 5.7547551484639143e+00
5.6637148983634233e+00 5.5742854158160355e+00 5.4864351211977294e+00 5.4001330967728620e+00 5.3153490723937580e+00
5.2320534115112594e+00 5.1502170974889339e+00 5.0698117202151138e+00 4.9908094630057178e+00 4.9131830897922271e+00
4.8369059325884791e+00 4.7619518792290876e+00 4.6882953613761629e+00 4.6159113427851821e+00 4.5447753078280471e+00
4.4748632502644341e+00 4.4061516622586225e+00 4.3386175236344116e+00 4.2722382913651700e+00 4.2069918892916007e+00
4.1428566980642358e+00 4.0798115453044659e+00 4.0178356959802954e+00 3.9569088429914245e+00 3.8970110979596200e+00
3.8381229822198435e+00 3.7802254180071628e+00 3.7232997198366320e+00 3.6673275860703995e+00 3.6122910906684780e+00
3.5581726751199767e+00 3.5049551405497539e+00 3.4526216399971048e+00 3.4011556708634600e+00 3.3505410675235936e+00
3.3007619940993322e+00 3.2518029373895416e+00 3.2036486999552665e+00 3.1562843933552358e+00 3.1096954315288059e+00
3.0638675243237259e+00 3.0187866711643210e+00 2.9744391548584872e+00 2.9308115355394193e+00 2.8878906447394712e+00
2.8456635795935767e+00 2.8041176971686212e+00 2.7632406089169876e+00 2.7230201752508236e+00 2.6834445002347138e+00
2.6445019263941703e+00 2.6061810296373835e+00 2.5684706142875200e+00 2.5313597082236612e+00 2.4948375581276281e+00
2.4588936248343458e+00 2.4235175787838017e+00 2.3886992955719677e+00 2.3544288515990814e+00 2.3206965198123726e+00
2.2874927655419270e+00 2.2548082424273730e+00 2.2226337884330718e+00 2.1909604219504502e+00 2.1597793379851566e+00
2.1290819044273803e+00 2.0988596584032564e+00 2.0691043027062364e+00 2.0398077023055379e+00 2.0109618809312195e+00
1.9825590177335428e+00 1.9545914440145751e+00 1.9270516400317490e+00 1.8999322318704799e+00 1.8732259883849522e+00
1.8469258182056052e+00 1.8210247668116040e+00 1.7955160136671395e+00 1.7703928694199718e+00 1.7456487731607453e+00
1.7212772897420763e+00 1.6972721071559391e+00 1.6736270339679251e+00 1.6503359968072218e+00 1.6273930379113750e+00
1.6047923127241077e+00 1.5825280875452847e+00 1.5605947372321296e+00 1.5389867429502644e+00 1.5176986899731588e+00
1.4967252655299603e+00 1.4760612566992890e+00 1.4557015483494027e+00 1.4356411211222593e+00 1.4158750494618957e+00
1.3963984996850130e+00 1.3772067280935900e+00 1.3582950791282968e+00 1.3396589835618542e+00 1.3212939567312461e+00
1.3031955968085143e+00 1.2853595831085869e+00 1.2677816744339125e+00 1.2504577074545153e+00 1.2333835951233212e+00
1.2165553251254906e+00 1.1999689583611897e+00 1.1836206274610817e+00 1.1675065353339207e+00 1.1516229537450258e+00
1.1359662219258198e+00 1.1205327452130547e+00 1.1053189937170771e+00 1.0903215010191687e+00 1.0755368628964561e+00
1.0609617360743258e+00 1.0465928370056616e+00 1.0324269406761744e+00 1.0184608794353309e+00 1.0046915418523596e+00
9.9111587159659820e-01 9.7773086634202144e-01 9.6453357669496853e-01 9.5152110514480626e-01 9.3869060503712376e-01
9.2603927956864140e-01 9.1356438080370239e-01 9.0126320871155485e-01 8.8913311022427521e-01 8.7717147831462938e-01
8.6537575109353426e-01 8.5374341092675010e-01 8.4227198357020328e-01 8.3095903732382581e-01 8.1980218220310874e-01
8.0879906912829824e-01 7.9794738913080465e-01 7.8724487257618136e-01 7.7668928840378371e-01 7.6627844338220541e-01
7.5601018138057441e-01 7.4588238265517859e-01 7.3589296315095609e-01 7.2603987381787150e-01 7.1632109994155613e-01
7.0673466048788924e-01 6.9727860746149162e-01 6.8795102527759866e-01 6.7875003014695068e-01 6.6967376947370028e-01
6.6072042126578623e-01 6.5188819355765304e-01 6.4317532384495735e-01 6.3458007853101606e-01 6.2610075238490026e-01
6.1773566801053903e-01 6.0948317532703555e-01 6.0134165105970538e-01 5.9330949824153834e-01 5.8538514572508404e-01
5.7756704770434197e-01 5.6985368324655994e-01 5.6224355583360364e-01 5.5473519291279416e-01 5.4732714545698968e-01
5.4001798753364838e-01 5.3280631588273764e-01 5.2569074950327632e-01 5.1866992924830413e-01 5.1174251742814292e-01
5.0490719742172274e-01 4.9816267329578068e-01 4.9150766943189694e-01 4.8494093016099704e-01 4.7846121940521869e-01
4.7206732032718257e-01 4.6575803498626733e-01 4.5953218400168616e-01 4.5338860622255339e-01 4.4732615840449164e-01
4.4134371489266400e-01 4.3544016731128998e-01 4.2961442425926322e-01 4.2386541101190822e-01 4.1819206922867025e-01
4.1259335666658892e-01 4.0706824689955035e-01 4.0161572904300691e-01 3.9623480748423034e-01 3.9092450161784775e-01
3.8568384558664448e-01 3.8051188802739233e-01 3.7540769182181144e-01 3.7037033385227858e-01 3.6539890476236891e-01
3.6049250872219041e-01 3.5565026319808801e-01 3.5087129872705347e-01 3.4615475869537526e-01 3.4149979912160333e-01
3.3690558844382679e-01 3.3237130731094133e-01 3.2789614837797743e-01 3.2347931610542879e-01 3.1912002656234151e-01
3.1481750723327195e-01 3.1057099682888989e-01 3.0637974510021770e-01 3.0224301265637266e-01 2.9816007078585649e-01
2.9413020128113843e-01 2.9015269626666473e-01 2.8622685803004089e-01 2.8235199885637741e-01 2.7852744086590420e-01
2.7475251585446614e-01 2.7102656513704559e-01 2.6734893939429405e-01 2.6371899852185088e-01 2.6013611148246873e-01
2.5659965616091007e-01 2.5310901922152595e-01 2.4966359596849230e-01 2.4626279020852770e-01 2.4290601411627044e-01
2.3959268810202516e-01 2.3632224068197960e-01 2.3309410835077227e-01 2.2990773545640408e-01 2.2676257407740064e-01
2.2365808390219311e-01 2.2059373211072231e-01 2.1756899325815038e-01 2.1458334916067301e-01 2.1163628878335583e-01
2.0872730813005891e-01 2.0585591013519533e-01 2.0302160455757168e-01 2.0022390787598532e-01 1.9746234318676770e-01
1.9473644010305513e-01 1.9204573465593366e-01 1.8938976919722261e-01 1.8676809230404867e-01 1.8418025868501697e-01
1.8162582908807767e-01 1.7910437020998504e-01 1.7661545460728956e-01 1.7415866060892160e-01 1.7173357223026997e-01
1.6933977908869480e-01 1.6697687632057967e-01 1.6464446449973735e-01 1.6234214955720816e-01 1.6006954270248031e-01
1.5782626034600167e-01 1.5561192402300605e-01 1.5342616031865575e-01 1.5126860079441595e-01 1.4913888191569047e-01
1.4703664498065105e-01 1.4496153605028095e-01 1.4291320587954814e-01 1.4089130984976528e-01 1.3889550790202332e-01
1.3692546447178433e-01 1.3498084842450275e-01 1.3306133299233025e-01 1.3116659571184286e-01 1.2929631836281086e-01
1.2745018690792786e-01 1.2562789143357200e-01 1.2382912609148811e-01 1.2205358904140517e-01 1.2030098239464682e-01
1.1857101215854371e-01 1.1686338818183373e-01 1.1517782410088362e-01 1.1351403728677267e-01 1.1187174879324324e-01
1.1025068330544618e-01 1.0865056908951143e-01 1.0707113794291789e-01 1.0551212514563968e-01 1.0397326941204543e-01
1.0245431284357487e-01 1.0095500088214582e-01 9.9475082264262937e-02 9.8014308975870268e-02 9.6572436207889467e-02
9.5149222312433945e-02 9.3744428759716225e-02 9.2357820095600562e-02 9.0989163899807046e-02 8.9638230744780500e-02
8.8304794155128263e-02 8.6988630567732095e-02 8.5689519292436067e-02 8.4407242473327759e-02 8.3141585050604316e-02
8.1892334723027815e-02 8.0659281910912206e-02 7.9442219719687568e-02 7.8240943904000826e-02 7.7055252832346710e-02
7.5884947452225848e-02 7.4729831255814450e-02 7.3589710246154016e-02 7.2464392903814900e-02 7.1353690154065674e-02
7.0257415334517681e-02 6.9175384163253639e-02 6.8107414707390124e-02 6.7053327352138758e-02 6.6012944770277748e-02
6.4986091892085707e-02 6.3972595875702698e-02 6.2972286077917161e-02 6.1984994025379159e-02 6.1010553386208866e-02
6.0048799942037157e-02 5.9099571560412123e-02 5.8162708167624810e-02 5.7238051721903105e-02 5.6325446186999528e-02
5.5424737506136967e-02 5.4535773576326108e-02 5.3658404223060785e-02 5.2792481175329975e-02 5.1937858041017471e-02
5.1094390282629742e-02 5.0261935193351093e-02 4.9440351873451860e-02 4.8629501207008818e-02 4.7829245838954204e-02
4.7039450152438045e-02 4.6259980246510013e-02 4.5490703914087494e-02 4.4731490620259606e-02 4.3982211480854794e-02
4.3242739241325268e-02 4.2512948255901239e-02 4.1792714467042247e-02 4.1081915385161816e-02 4.0380430068628126e-02
3.9688139104032016e-02 3.9004924586723888e-02 3.8330670101623943e-02 3.7665260704261794e-02 3.7008582902100740e-02
3.6360524636098734e-02 3.5720975262514720e-02 3.5089825534961649e-02 3.4466967586696873e-02 3.3852294913154113e-02
3.3245702354694373e-02 3.2647086079598653e-02 3.2056343567280043e-02 3.1473373591718756e-02 3.0898076205114089e-02
3.0330352721755771e-02 2.9770105702100036e-02 2.9217238937061962e-02 2.8671657432515429e-02 2.8133267393987360e-02
2.7601976211552692e-02 2.7077692444942292e-02 2.6560325808820506e-02 2.6049787158272331e-02 2.5545988474477754e-02
2.5048842850559749e-02 2.4558264477628988e-02 2.4074168631003978e-02 2.3596471656606832e-02 2.3125090957536454e-02
2.2659944980823798e-02 2.2200953204335683e-02 2.1748036123873327e-02 2.1301115240412782e-02 2.0860113047532769e-02
2.0424953018977066e-02 1.9995559596398205e-02 1.9571858177249157e-02 1.9153775102828896e-02 1.8741237646474174e-02
1.8334174001923720e-02 1.7932513271801676e-02 1.7536185456265119e-02 1.7145121441799471e-02 1.6759252990136364e-02
1.6378512727332817e-02 1.6002834132978205e-02 1.5632151529535232e-02 1.5266400071822006e-02 1.4905515736628239e-02
1.4549435312452008e-02 1.4198096389378967e-02 1.3851437349077567e-02 1.3509397354926733e-02 1.3171916342264223e-02
1.2838935008766206e-02 1.2510394804928215e-02 1.2186237924689647e-02 1.1866407296154180e-02 1.1550846572444651e-02
1.1239500122655843e-02 1.0932313022929629e-02 1.0629231047647014e-02 1.0330200660712663e-02 1.0035169006965661e-02
9.7440839036889715e-03 9.4568938322237006e-03 9.1735479296908284e-03 8.8939959808188584e-03 8.6181884098633921e-03
8.3460762726380588e-03 8.0776112486364848e-03 7.8127456332576228e-03 7.5514323301215658e-03 7.2936248434884998e-03
7.0392772707632556e-03 6.7883442950981143e-03 6.5407811780883174e-03 6.2965437525517309e-03 6.0555884154033235e-03
5.8178721206175732e-03 5.5833523722726985e-03 5.3519872176873706e-03 5.1237352406410253e-03 4.8985555546731119e-03
4.6764077964680517e-03 4.4572521193242398e-03 4.2410491867018174e-03 4.0277601658472717e-03 3.8173467214993595e-03
3.6097710096757996e-03 3.4049956715283547e-03 3.2029838272795708e-03 3.0036990702375643e-03 2.8071054608720947e-03
2.6131675209760674e-03 2.4218502278956500e-03 2.2331190088270558e-03 2.0469397351849938e-03 1.8632787170468346e-03
1.6821026976564513e-03 1.5033788479984489e-03 1.3270747614446132e-03 1.1531584484569257e-03 9.8159833136179930e-04
8.1236323918953968e-04 6.4542240257081662e-04 4.8074544870146951e-04 3.1830239637042901e-04 1.5806365104042985e-04
0. 0. 0. 0. 0.
0. 5.4383329664155645e-05 9.3944898415945083e-04 4.3251847212615047e-03 1.2334244035325348e-02
2.7137722173468548e-02 5.0697119791449641e-02 8.4607638668976470e-02 1.3001641279549414e-01 1.8759487452762702e-01
2.5754900895683441e-01 3.3965493779430744e-01 4.3331024634064264e-01 5.3759384878832961e-01 6.5132908316254046e-01
7.7314622535699939e-01 9.0154178511424377e-01 1.0349328562818201e+00 1.1717054897399350e+00 1.3102565818166738e+00
1.4490291582473986e+00 1.5865412121263560e+00 1.7214084470448441e+00 1.8523614026473965e+00 1.9782575145276269e+00
2.0980886961566938e+00 2.2109850373516764e+00 2.3162151996095730e+00 2.4131840597491703e+00 2.5014281146549706e+00
2.5806091153285706e+00 2.6505063508648590e+00 2.7110079545661563e+00 2.7621015568249447e+00 2.8038645637913220e+00
2.8364542979766156e+00 2.8600981973448825e+00 2.8750842333755031e+00 2.8817516761559574e+00 2.8804823057701157e+00
2.8716921439699092e+00 2.8558237581894161e+00 2.8333391711552594e+00 2.8047133934346959e+00 2.7704285829676252e+00
2.7309688247181469e+00 2.6868155147671331e+00 2.6384433262347358e+00 2.5863167291097398e+00 2.5308870321738226e+00
2.4725899125317596e+00 2.4118433966060167e+00 2.3490462556752334e+00 2.2845767789603002e+00 2.2187918877813502e+00
2.1520265552815943e+00 2.0845934975626363e+00 2.0167831036919637e+00 1.9488635738636404e+00 1.8810812369508270e+00
1.8136610207193371e+00 1.7468070500507196e+00 1.6807033505858371e+00 1.6155146372447149e+00 1.5513871690559142e+00
1.4884496536383409e+00 1.4268141864958608e+00 1.3665772120042590e+00 1.3078204945836447e+00 1.2506120900523854e+00
1.1950073085502879e+00 1.1410496616995687e+00 1.0887717878420631e+00 1.0381963502565981e+00 9.8933690422003551e-01
9.4219872964247031e-01 8.9677962677415124e-01 8.5307067316958651e-01 8.1105694069385592e-01 7.7071817188505065e-01
7.3202941544290212e-01 6.9496162100761794e-01 6.5948219372701189e-01 6.2555550939233484e-01 5.9314339115629977e-01
5.6220554903693554e-01 5.3269998356387660e-01 5.0458335504023211e-01 4.7781131998032222e-01 4.5233883634534777e-01
4.2812043923464138e-01 4.0511048870905242e-01 3.8326339142174781e-01 3.6253379771729577e-01 3.4287677583286325e-01
3.2424796479760154e-01 3.0660370758054967e-01 2.8990116598452254e-01 2.7409841872609064e-01 2.5915454407883409e-01
2.4502968839369110e-01 2.3168512174254197e-01 2.1908328186436687e-01 2.0718780752542632e-01 1.9596356233750800e-01
1.8537665001230508e-01 1.7539442196444632e-01 1.6598547811304609e-01 1.5711966166996927e-01 1.4876804864444715e-01
1.4090293273673637e-01 1.3349780623990259e-01 1.2652733751724909e-01 1.1996734557434463e-01 1.1379477219856060e-01
1.0798765209582406e-01 1.0252508141368288e-01 9.7387185001678311e-02 9.2555082724584015e-02 8.8010855111109620e-02
8.3737508589961873e-02 7.9718940536826377e-02 7.5939904329596963e-02 7.2385974585237101e-02 6.9043512729294765e-02
6.5899633029043336e-02 6.2942169202580001e-02 6.0159641699440547e-02 5.7541225732930634e-02 5.5076720130546430e-02
5.2756517056398833e-02 5.0571572648238083e-02 4.8513378601664936e-02 4.6573934725081756e-02 4.4745722480991068e-02
4.3021679522073253e-02 4.1395175224364866e-02 3.9859987214311721e-02 3.8410278881708670e-02 3.7040577866510604e-02
3.5745755503880039e-02 3.4521007208912380e-02 3.3361833779917971e-02 3.2264023597108116e-02 3.1223635691821294e-02
3.0236983660070216e-02 2.9300620393215571e-02 2.8411323597772320e-02 2.7566082075896281e-02 2.6762082737777249e-02
2.5996698317105604e-02 2.5267475760840985e-02 2.4572125264713973e-02 2.3908509926274246e-02 2.3274635987705516e-02
2.2668643641204911e-02 2.2088798370316409e-02 2.1533482801290083e-02 2.1001189039288493e-02 2.0490511464994254e-02
2.0000139967999431e-02 1.9528853594166895e-02 1.9075514584991349e-02 1.8639062787818239e-02 1.8218510416650235e-02
1.7812937144080498e-02 1.7421485505751177e-02 1.7043356599549031e-02 1.6677806062561751e-02 1.6324140309613155e-02
1.5981713017976018e-02 1.5649921843605585e-02 1.5328205354974755e-02 1.5016040171312250e-02 1.4712938292708366e-02
1.4418444610242331e-02 1.4132134584901757e-02 1.3853612084676337e-02 1.3582507369821917e-02 1.3318475216818060e-02
1.3061193172097418e-02 1.2810359927147186e-02 1.2565693807050415e-02 1.2326931365025051e-02 1.2093826075940506e-02
1.1866147122233661e-02 1.1643678266026136e-02 1.1426216801644407e-02 1.1213572583084475e-02 1.1005567121320226e-02
1.0802032746662471e-02 1.0602811831688208e-02 1.0407756070544782e-02 1.0216725810699157e-02 1.0029589433467268e-02
9.8462227798860602e-03 9.6665086187306404e-03 9.4903361536790021e-03 9.3176005668363371e-03 9.1482025960089031e-03
8.9820481433065535e-03 8.8190479128032462e-03 8.6591170751522117e-03 8.5021749571883021e-03 8.3481447546937537e-03
8.1969532666261724e-03 8.0485306492223962e-03 7.9028101885199598e-03 7.7597280899136256e-03 7.6192232834934315e-03
7.4812372439735375e-03 7.3457138241272979e-03 7.2125991007052359e-03 7.0818412319012813e-03 6.9533903254870300e-03
6.8271983168139705e-03 6.7032188559211503e-03 6.5814072030662141e-03 6.4617201320263939e-03 6.3441158405819764e-03
6.2285538676237207e-03 6.1149950163802147e-03 6.0034012832899109e-03 5.8937357920846312e-03 5.7859627326801166e-03
5.6800473044990030e-03 5.5759556638887986e-03 5.4736548753111791e-03 5.3731128660109428e-03 5.2742983838981461e-03
5.1771809583849582e-03 5.0817308639591330e-03 4.9879190862693046e-03 4.8957172905357560e-03 4.8050977921015592e-03
4.7160335289582467e-03 4.6284980360953021e-03 4.5424654215287241e-03 4.4579103438822931e-03 4.3748079913988880e-03
4.2931340622749670e-03 4.2128647462132407e-03 4.1339767071033873e-03 4.0564470667446839e-03 3.9802533895282599e-03
3.9053736680121076e-03 3.8317863093158128e-03 3.7594701222811860e-03 3.6884043053326127e-03 3.6185684349951674e-03
3.5499424550168301e-03 3.4825066660512660e-03 3.4162417158645347e-03 3.3511285900229004e-03 3.2871486030347646e-03
3.2242833899080170e-03 3.1625148980992668e-03 3.1018253798278661e-03 3.0421973847258310e-03 2.9836137528083811e-03
2.9260576077371064e-03 2.8695123503632708e-03 2.8139616525287708e-03 2.7593894511106498e-03 2.7057799422959966e-03
2.6531175760685227e-03 2.6013870509009052e-03 2.5505733086344240e-03 2.5006615295404683e-03 2.4516371275501436e-03
2.4034857456453340e-03 2.3561932514012535e-03 2.3097457326723414e-03 2.2641294934160616e-03 2.2193310496436136e-03
2.1753371254977782e-03 2.1321346494441173e-03 2.0897107505768314e-03 2.0480527550303662e-03 2.0071481824917164e-03
1.9669847428123305e-03 1.9275503327108034e-03 1.8888330325659355e-03 1.8508211032951805e-03 1.8135029833145980e-03
1.7768672855772646e-03 1.7409027946878666e-03 1.7055984640891586e-03 1.6709434133182904e-03 1.6369269253308227e-03
1.6035384438881917e-03 1.5707675710093030e-03 1.5386040644797400e-03 1.5070378354209296e-03 1.4760589459142243e-03
1.4456576066784674e-03 1.4158241748004133e-03 1.3865491515145517e-03 1.3578231800324136e-03 1.3296370434173130e-03
1.3019816625059188e-03 1.2748480938728074e-03 1.2482275278369870e-03 1.2221112865106742e-03 1.1964908218862064e-03
1.1713577139624703e-03 1.1467036689077198e-03 1.1225205172586891e-03 1.0988002121543120e-03 1.0755348276031765e-03
1.0527165567835728e-03 1.0303377103750150e-03 1.0083907149206553e-03 9.8686811121878604e-04 9.6576255274356815e-04
9.4506680409354657e-04 9.2477373946662708e-04 9.0487634116191706e-04 8.8536769810608137e-04 8.6624100440530968e-04
8.4748955791986991e-04 8.2910675886310736e-04 8.1108610842155551e-04 7.9342120739794852e-04 7.7610575487466887e-04
7.5913354689786591e-04 7.4249847518158968e-04 7.2619452583109687e-04 7.1021577808524222e-04 6.9455640307671332e-04
6.7921066261025093e-04 6.6417290795844214e-04 6.4943757867335500e-04 6.3499920141575628e-04 6.2085238879914031e-04
6.0699183824991856e-04 5.9341233088238896e-04 5.8010873038847818e-04 5.6707598194186137e-04 5.5430911111587280e-04
5.4180322281523891e-04 5.2955350022104025e-04 5.1755520374872563e-04 5.0580367001857793e-04 4.9429431083891986e-04
4.8302261220136561e-04 4.7198413328763435e-04 4.6117450548847222e-04 4.5058943143359842e-04 4.4022468403297037e-04
4.3007610552883886e-04 4.2013960655883260e-04 4.1041116522908330e-04 4.0088682619821882e-04 3.9156269977118005e-04
3.8243496100300207e-04 3.7349984881274514e-04 3.6475366510662147e-04 3.5619277391102898e-04 3.4781360051482253e-04
3.3961263062063513e-04 3.3158640950565685e-04 3.2373154119109092e-04 3.1604468762060252e-04 3.0852256784754707e-04
3.0116195723081836e-04 2.9395968663908575e-04 2.8691264166377101e-04 2.8001776184017647e-04 2.7327203987681688e-04
2.6667252089326854e-04 2.6021630166557681e-04 2.5390052988028163e-04 2.4772240339593181e-04 2.4167916951265550e-04
2.3576812424967210e-04 2.2998661163024531e-04 2.2433202297460642e-04 2.1880179620031078e-04 2.1339341513026532e-04
2.0810440880823181e-04 2.0293235082175821e-04 1.9787485863260665e-04 1.9292959291436311e-04 1.8809425689761319e-04
1.8336659572205580e-04 1.7874439579616125e-04 1.7422548416372047e-04 1.6980772787763936e-04 1.6548903338088530e-04
1.6126734589430591e-04 1.5714064881157744e-04 1.5310696310104604e-04 1.4916434671449329e-04 1.4531089400280153e-04
1.4154473513841234e-04 1.3786403554466153e-04 1.3426699533172857e-04 1.3075184873951283e-04 1.2731686358694039e-04
1.2396034072819674e-04 1.2068061351527565e-04 1.1747604726729168e-04 1.1434503874632306e-04 1.1128601563955686e-04
1.0829743604811193e-04 1.0537778798212988e-04 1.0252558886227753e-04 9.9739385027582898e-05 9.7017751249615057e-05
9.4359290252773662e-05 9.1762632240957511e-05 8.9226434430383569e-05 8.6749380588361721e-05 8.4330180578390864e-05
8.1967569911181246e-05 7.9660309301724484e-05 7.7407184232279429e-05 7.5207004521348451e-05 7.3058603898526649e-05
7.0960839585107720e-05 6.8912591880629977e-05 6.6912763755002085e-05 6.4960280446513426e-05 6.3054089065330086e-05
6.1193158202771814e-05 5.9376477546041213e-05 5.7603057498502742e-05 5.5871928805544500e-05 5.4182142185708361e-05
5.2532767967318744e-05 5.0922895730446966e-05 4.9351633954125953e-05 4.7818109668823321e-05 4.6321468114150300e-05
4.4860872401664663e-05 4.3435503182825573e-05 4.2044558321957873e-05 4.0687252574273750e-05 3.9362817268785450e-05
3.8070499996214428e-05 3.6809564301621984e-05 3.5579289382025496e-05 3.4378969788611451e-05 3.3207915133769052e-05
3.2065449802711312e-05 3.0950912669766876e-05 2.9863656819185611e-05 2.8803049270468119e-05 2.7768470708167169e-05
2.6759315216115260e-05 2.5774990015931323e-05 2.4814915209964844e-05 2.3878523528387922e-05 2.2965260080560611e-05
2.2074582110528148e-05 2.1205958756658535e-05 2.0358870815317476e-05 1.9532810508535560e-05 1.8727281255713447e-05
1.7941797449145505e-05 1.7175884233475961e-05 1.6429077288930018e-05 1.5700922618341645e-05 1.4990976337865471e-05
1.4298804471386687e-05 1.3623982748522034e-05 1.2966096406226424e-05 1.2324739993882115e-05 1.1699517181902770e-05
1.1090040573734860e-05 1.0495931521266495e-05 9.9168199435395021e-06 9.3523441487842465e-06 8.8021506596591475e-06
8.2658940417265321e-06 7.7432367350197678e-06 7.2338488887770244e-06 6.7374081991923703e-06 6.2535997501888662e-06
5.7821158571569505e-06 5.3226559136389283e-06 4.8749262408651290e-06 4.4386399401326240e-06 4.0135167480073166e-06
3.5992828942305738e-06 3.1956709623667747e-06 2.8024197531120341e-06 2.4192741502208947e-06 2.0459849890155880e-06
1.6823089274468580e-06 1.3280083196495871e-06 9.8285109196557868e-07 6.4661062138351467e-07 3.1906561636122974e-07
0. 0. 0. 0. 0.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

3
data/dem/collision.in Normal file
View File

@@ -0,0 +1,3 @@
2 0 1 0 1 0 1
1 0.1 0.166 0.55 0 1 0 0
1 0.1 0.833 0.45 0 -1 0 0

View File

@@ -0,0 +1,9 @@
pbc_x 0
pbc_y 0
pbc_z 0
ntimes 200
dt 0.01
reneigh_every 1
mass 0.1
k_s 100
k_dn 10

BIN
figures/features-v3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 273 KiB

BIN
figures/gather_bench.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 98 KiB

523
figures/gather_bench.svg Normal file
View File

@@ -0,0 +1,523 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
width="297mm"
height="210mm"
viewBox="0 0 297 210"
version="1.1"
id="svg5"
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
sodipodi:docname="gather_bench.svg"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<sodipodi:namedview
id="namedview7"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:document-units="mm"
showgrid="false"
inkscape:zoom="0.73508842"
inkscape:cx="551.63432"
inkscape:cy="348.25743"
inkscape:window-width="1920"
inkscape:window-height="1011"
inkscape:window-x="0"
inkscape:window-y="165"
inkscape:window-maximized="1"
inkscape:current-layer="layer1" />
<defs
id="defs2">
<rect
x="144.01516"
y="304.36604"
width="248.99777"
height="100.91557"
id="rect79475" />
<rect
x="309.01869"
y="43.698615"
width="552.19421"
height="71.390348"
id="rect65238" />
<rect
x="762.55856"
y="341.3838"
width="277.62756"
height="105.0235"
id="rect47632" />
<linearGradient
inkscape:collect="always"
id="linearGradient40704">
<stop
style="stop-color:#ccffaa;stop-opacity:1;"
offset="0"
id="stop40700" />
<stop
style="stop-color:#ccffaa;stop-opacity:0;"
offset="1"
id="stop40702" />
</linearGradient>
<marker
style="overflow:visible;"
id="Arrow2Mend"
refX="0.0"
refY="0.0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(0.6) rotate(180) translate(0,0)"
d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
style="stroke:context-stroke;fill-rule:evenodd;fill:context-stroke;stroke-width:0.62500000;stroke-linejoin:round;"
id="path39486" />
</marker>
<marker
style="overflow:visible;"
id="Arrow1Mend"
refX="0.0"
refY="0.0"
orient="auto"
inkscape:stockid="Arrow1Mend"
inkscape:isstock="true">
<path
transform="scale(0.4) rotate(180) translate(10,0)"
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
id="path39468" />
</marker>
<marker
style="overflow:visible;"
id="Arrow1Lend"
refX="0.0"
refY="0.0"
orient="auto"
inkscape:stockid="Arrow1Lend"
inkscape:isstock="true">
<path
transform="scale(0.8) rotate(180) translate(12.5,0)"
style="fill-rule:evenodd;fill:context-stroke;stroke:context-stroke;stroke-width:1.0pt;"
d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
id="path39462" />
</marker>
<rect
x="707.09731"
y="616.36746"
width="407.71288"
height="417.08306"
id="rect24254" />
<rect
x="47.404365"
y="100.3268"
width="398.49855"
height="110.16514"
id="rect5050" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5-6" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5-6-1" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0-6" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0-6-2" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-0-6-2-8" />
<marker
style="overflow:visible"
id="Arrow2Mend-2"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path39486-3" />
</marker>
<marker
style="overflow:visible"
id="Arrow2Mend-2-5"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path39486-3-9" />
</marker>
<marker
style="overflow:visible"
id="Arrow2Mend-2-5-2"
refX="0"
refY="0"
orient="auto"
inkscape:stockid="Arrow2Mend"
inkscape:isstock="true">
<path
transform="scale(-0.6)"
d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
style="fill:context-stroke;fill-rule:evenodd;stroke:context-stroke;stroke-width:0.625;stroke-linejoin:round"
id="path39486-3-9-8" />
</marker>
<linearGradient
inkscape:collect="always"
xlink:href="#linearGradient40704"
id="linearGradient40706"
x1="324.58157"
y1="127.35331"
x2="363.61096"
y2="98.957848"
gradientUnits="userSpaceOnUse" />
<rect
x="47.404366"
y="100.3268"
width="398.49854"
height="110.16514"
id="rect5050-3-5-6-1-7" />
<rect
x="309.01868"
y="43.698616"
width="552.19421"
height="71.39035"
id="rect65238-1" />
</defs>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1">
<rect
style="fill:#d5d5ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
id="rect55834"
width="250.31726"
height="74.676537"
x="25.257824"
y="97.277718" />
<rect
style="fill:#d5f6ff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0"
id="rect55832"
width="250.35208"
height="64.461151"
x="25.256891"
y="32.817505" />
<rect
style="fill:#ccffaa;stroke:#091600;stroke-width:1.31891"
id="rect6462"
width="82.385742"
height="20.525751"
x="28.355024"
y="48.740646" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,17.244577,26.206534)"
id="text5048"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82948"><tspan
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
id="tspan82946">gather-bench</tspan></tspan></text>
<rect
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9"
width="18.764017"
height="20.965076"
x="39.518955"
y="140.726" />
<text
xml:space="preserve"
transform="matrix(0.33667319,0,0,0.33667319,25.589293,109.42998)"
id="text5048-3"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82950">L1</tspan></text>
<rect
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9-0"
width="21.653919"
height="24.193966"
x="97.687294"
y="138.51564" />
<text
xml:space="preserve"
transform="matrix(0.3885252,0,0,0.3885252,81.212654,102.39964)"
id="text5048-3-6"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82952">L2</tspan></text>
<rect
style="fill:#de87aa;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9-0-6"
width="27.217058"
height="30.409672"
x="149.19933"
y="134.83977" />
<text
xml:space="preserve"
transform="matrix(0.48834178,0,0,0.48834178,128.49215,89.445174)"
id="text5048-3-6-1"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82954">L3</tspan></text>
<rect
style="fill:#eeaaff;stroke:#091600;stroke-width:1.5;stroke-miterlimit:4;stroke-dasharray:none"
id="rect6462-9-0-6-7"
width="61.032539"
height="29.96501"
x="204.01265"
y="135.61238" />
<text
xml:space="preserve"
transform="matrix(0.48834178,0,0,0.48834178,182.37007,89.995434)"
id="text5048-3-6-1-9"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-0-6-2-8);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82956">DRAM</tspan></text>
<rect
style="fill:#ffccaa;stroke:#091600;stroke-width:1.10636"
id="rect6462-6"
width="74.980759"
height="15.869514"
x="126.09525"
y="38.773243" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,115.65481,14.295323)"
id="text5048-7"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82958">Single gather</tspan></text>
<rect
style="fill:#ffccaa;stroke:#091600;stroke-width:1.03971"
id="rect6462-6-3"
width="66.071701"
height="15.904838"
x="126.90776"
y="63.642746" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,116.63325,39.114393)"
id="text5048-7-5"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82960">MD gathers</tspan></text>
<rect
style="fill:#afe9dd;stroke:#091600;stroke-width:1.02848"
id="rect6462-6-3-2"
width="64.479698"
height="15.947394"
x="206.65364"
y="52.98967" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,196.01512,28.482594)"
id="text5048-7-5-9"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82962">Contiguous</tspan></text>
<rect
style="fill:#afe9dd;stroke:#091600;stroke-width:0.987323"
id="rect6462-6-3-2-2"
width="59.269382"
height="15.988551"
x="208.16559"
y="76.856781" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,197.58604,52.220445)"
id="text5048-7-5-9-7"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect5050-3-5-6-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="47.404297"
y="135.7168"
id="tspan82964">&quot;Random&quot;</tspan></text>
<text
xml:space="preserve"
transform="scale(0.26458333)"
id="text24252"
style="fill:black;fill-opacity:1;stroke:none;font-family:sans-serif;font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect24254)" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="M 193.10512,71.273276 206.30683,61.033513"
id="path39049"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="M 193.08841,71.196939 207.86207,84.43804"
id="path39053"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.39816;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 58.548229,151.24436 38.298093,0.25023"
id="path39219"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.24847;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 119.19252,150.09399 29.28333,0.26095"
id="path39219-2"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 177.02022,150.44367 26.36623,0.26095"
id="path39219-2-0"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend)"
d="m 48.145458,92.71788 -0.644819,47.57709"
id="path39377"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2)"
d="M 48.121208,92.873762 106.60807,137.41946"
id="path39377-7"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5)"
d="M 48.073928,92.825143 158.88023,133.04546"
id="path39377-7-2"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:3, 1;stroke-dashoffset:0;stroke-opacity:1;marker-end:url(#Arrow2Mend-2-5-2)"
d="M 48.051946,92.813593 233.0959,134.16596"
id="path39377-7-2-9"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<rect
style="fill:#e9afaf;stroke:#091600;stroke-width:1.34518"
id="rect6462-6-3-2-2-3"
width="65.880661"
height="26.700579"
x="38.104012"
y="80.530182" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
d="m 77.365612,69.678744 h 2e-6"
id="path39808"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 111.64767,59.183009 6.84466,0.03069"
id="path41004"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 119.03378,47.056357 -0.58704,25.198541"
id="path41006"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="display:inline;fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1.02423;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 118.07503,72.254897 7.94998,-0.05784"
id="path41008"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.882836;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
d="m 118.26666,47.054814 7.69322,0.173925"
id="path41112"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<path
style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1"
d="M 68.213642,69.068864 67.910274,80.302728"
id="path55728"
inkscape:connector-type="polyline"
inkscape:connector-curvature="0" />
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,-1.3782637,4.0412367)"
id="text65236"
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="309.01953"
y="90.886691"
id="tspan82968"><tspan
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
id="tspan82966">Application Level</tspan></tspan></text>
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,2.7015103,160.71919)"
id="text65236-2"
style="font-style:normal;font-weight:normal;font-size:53.3333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect65238-1);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="309.01953"
y="90.886691"
id="tspan82972"><tspan
style="font-weight:bold;-inkscape-font-specification:'sans-serif Bold'"
id="tspan82970">Hardware Level</tspan></tspan></text>
<text
xml:space="preserve"
transform="matrix(0.26458333,0,0,0.26458333,2.3490396,0.57331532)"
id="text79473"
style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;white-space:pre;shape-inside:url(#rect79475);fill:#000000;fill-opacity:1;stroke:none"><tspan
x="144.01562"
y="339.75586"
id="tspan82974">vgather </tspan><tspan
x="144.01562"
y="389.75586"
id="tspan82976">instructions</tspan></text>
</g>
</svg>

After

Width:  |  Height:  |  Size: 21 KiB

BIN
figures/gromacs_mxn_v2.pdf Normal file

Binary file not shown.

BIN
figures/gromacs_mxn_v2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 128 KiB

BIN
figures/stub_new_v3.pdf Normal file

Binary file not shown.

BIN
figures/stub_new_v3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

BIN
figures/verlet_v2.pdf Normal file

Binary file not shown.

BIN
figures/verlet_v2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 62 KiB

1
gather-bench Submodule

Submodule gather-bench added at 2f654cb043

456
gromacs/atom.c Normal file
View File

@@ -0,0 +1,456 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <atom.h>
#include <allocate.h>
#include <util.h>
void initAtom(Atom *atom) {
atom->x = NULL; atom->y = NULL; atom->z = NULL;
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
atom->cl_x = NULL;
atom->cl_v = NULL;
atom->cl_f = NULL;
atom->cl_type = NULL;
atom->Natoms = 0;
atom->Nlocal = 0;
atom->Nghost = 0;
atom->Nmax = 0;
atom->Nclusters = 0;
atom->Nclusters_local = 0;
atom->Nclusters_ghost = 0;
atom->Nclusters_max = 0;
atom->type = NULL;
atom->ntypes = 0;
atom->epsilon = NULL;
atom->sigma6 = NULL;
atom->cutforcesq = NULL;
atom->cutneighsq = NULL;
atom->iclusters = NULL;
atom->jclusters = NULL;
atom->icluster_bin = NULL;
#ifdef USE_SUPER_CLUSTERS
atom->scl_x = NULL;
atom->scl_v = NULL;
atom->scl_f = NULL;
atom->Nsclusters = 0;
atom->Nsclusters_local = 0;
atom->Nsclusters_ghost = 0;
atom->Nsclusters_max = 0;
atom->scl_type = NULL;
atom->siclusters = NULL;
atom->icluster_idx = NULL;
atom->sicluster_bin = NULL;
#endif //USE_SUPER_CLUSTERS
}
void createAtom(Atom *atom, Parameter *param) {
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
atom->Natoms = 4 * param->nx * param->ny * param->nz;
atom->Nlocal = 0;
atom->ntypes = param->ntypes;
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
int ilo = (int) (xlo / (0.5 * alat) - 1);
int ihi = (int) (xhi / (0.5 * alat) + 1);
int jlo = (int) (ylo / (0.5 * alat) - 1);
int jhi = (int) (yhi / (0.5 * alat) + 1);
int klo = (int) (zlo / (0.5 * alat) - 1);
int khi = (int) (zhi / (0.5 * alat) + 1);
ilo = MAX(ilo, 0);
ihi = MIN(ihi, 2 * param->nx - 1);
jlo = MAX(jlo, 0);
jhi = MIN(jhi, 2 * param->ny - 1);
klo = MAX(klo, 0);
khi = MIN(khi, 2 * param->nz - 1);
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
int i, j, k, m, n;
int sx = 0; int sy = 0; int sz = 0;
int ox = 0; int oy = 0; int oz = 0;
int subboxdim = 8;
while(oz * subboxdim <= khi) {
k = oz * subboxdim + sz;
j = oy * subboxdim + sy;
i = ox * subboxdim + sx;
if(((i + j + k) % 2 == 0) && (i >= ilo) && (i <= ihi) && (j >= jlo) && (j <= jhi) && (k >= klo) && (k <= khi)) {
xtmp = 0.5 * alat * i;
ytmp = 0.5 * alat * j;
ztmp = 0.5 * alat * k;
if(xtmp >= xlo && xtmp < xhi && ytmp >= ylo && ytmp < yhi && ztmp >= zlo && ztmp < zhi ) {
n = k * (2 * param->ny) * (2 * param->nx) + j * (2 * param->nx) + i + 1;
for(m = 0; m < 5; m++) { myrandom(&n); }
vxtmp = myrandom(&n);
for(m = 0; m < 5; m++){ myrandom(&n); }
vytmp = myrandom(&n);
for(m = 0; m < 5; m++) { myrandom(&n); }
vztmp = myrandom(&n);
if(atom->Nlocal == atom->Nmax) { growAtom(atom); }
atom_x(atom->Nlocal) = xtmp;
atom_y(atom->Nlocal) = ytmp;
atom_z(atom->Nlocal) = ztmp;
atom->vx[atom->Nlocal] = vxtmp;
atom->vy[atom->Nlocal] = vytmp;
atom->vz[atom->Nlocal] = vztmp;
atom->type[atom->Nlocal] = rand() % atom->ntypes;
atom->Nlocal++;
}
}
sx++;
if(sx == subboxdim) { sx = 0; sy++; }
if(sy == subboxdim) { sy = 0; sz++; }
if(sz == subboxdim) { sz = 0; ox++; }
if(ox * subboxdim > ihi) { ox = 0; oy++; }
if(oy * subboxdim > jhi) { oy = 0; oz++; }
}
}
int type_str2int(const char *type) {
if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
fprintf(stderr, "Invalid atom type: %s\n", type);
exit(-1);
return -1;
}
int readAtom(Atom* atom, Parameter* param) {
int len = strlen(param->input_file);
if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp\n", param->input_file);
exit(-1);
return -1;
}
int readAtom_pdb(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int read_atoms = 0;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
while(!feof(fp)) {
readline(line, fp);
char *item = strtok(line, " ");
if(strncmp(item, "CRYST1", 6) == 0) {
param->xlo = 0.0;
param->xhi = atof(strtok(NULL, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
// alpha, beta, gamma, sGroup, z
} else if(strncmp(item, "ATOM", 4) == 0) {
char *label;
int atom_id, comp_id;
MD_FLOAT occupancy, charge;
atom_id = atoi(strtok(NULL, " ")) - 1;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
label = strtok(NULL, " ");
comp_id = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom->vx[atom_id] = 0.0;
atom->vy[atom_id] = 0.0;
atom->vz[atom_id] = 0.0;
occupancy = atof(strtok(NULL, " "));
charge = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
read_atoms++;
} else if(strncmp(item, "HEADER", 6) == 0 ||
strncmp(item, "REMARK", 6) == 0 ||
strncmp(item, "MODEL", 5) == 0 ||
strncmp(item, "TER", 3) == 0 ||
strncmp(item, "ENDMDL", 6) == 0) {
// Do nothing
} else {
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
}
if(!read_atoms) {
fprintf(stderr, "Input error: No atoms read!\n");
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_gro(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
char desc[MAXLINE];
int read_atoms = 0;
int atoms_to_read = 0;
int i = 0;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
readline(desc, fp);
for(i = 0; desc[i] != '\n'; i++);
desc[i] = '\0';
readline(line, fp);
atoms_to_read = atoi(strtok(line, " "));
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
while(!feof(fp) && read_atoms < atoms_to_read) {
readline(line, fp);
char *label = strtok(line, " ");
int type = type_str2int(strtok(NULL, " "));
int atom_id = atoi(strtok(NULL, " ")) - 1;
atom_id = read_atoms;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type;
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom->vx[atom_id] = atof(strtok(NULL, " "));
atom->vy[atom_id] = atof(strtok(NULL, " "));
atom->vz[atom_id] = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
read_atoms++;
}
if(!feof(fp)) {
readline(line, fp);
param->xlo = 0.0;
param->xhi = atof(strtok(line, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
}
if(read_atoms != atoms_to_read) {
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_dmp(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int natoms = 0;
int read_atoms = 0;
int atom_id = -1;
int ts = -1;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
while(!feof(fp) && ts < 1 && !read_atoms) {
readline(line, fp);
if(strncmp(line, "ITEM: ", 6) == 0) {
char *item = &line[6];
if(strncmp(item, "TIMESTEP", 8) == 0) {
readline(line, fp);
ts = atoi(line);
} else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
readline(line, fp);
natoms = atoi(line);
atom->Natoms = natoms;
atom->Nlocal = natoms;
while(atom->Nlocal >= atom->Nmax) {
growAtom(atom);
}
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
readline(line, fp);
param->xlo = atof(strtok(line, " "));
param->xhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
readline(line, fp);
param->ylo = atof(strtok(line, " "));
param->yhi = atof(strtok(NULL, " "));
param->yprd = param->yhi - param->ylo;
readline(line, fp);
param->zlo = atof(strtok(line, " "));
param->zhi = atof(strtok(NULL, " "));
param->zprd = param->zhi - param->zlo;
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
for(int i = 0; i < natoms; i++) {
readline(line, fp);
atom_id = atoi(strtok(line, " ")) - 1;
atom->type[atom_id] = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom->vx[atom_id] = atof(strtok(NULL, " "));
atom->vy[atom_id] = atof(strtok(NULL, " "));
atom->vz[atom_id] = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
read_atoms++;
}
} else {
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
} else {
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
exit(-1);
return -1;
}
}
if(ts < 0 || !natoms || !read_atoms) {
fprintf(stderr, "Input error: atom data was not read!\n");
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
fclose(fp);
return natoms;
}
void growAtom(Atom *atom) {
int nold = atom->Nmax;
atom->Nmax += DELTA;
#ifdef AOS
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
#else
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
#endif
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
}
void growClusters(Atom *atom) {
int nold = atom->Nclusters_max;
int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
atom->Nclusters_max += DELTA;
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
}
#ifdef USE_SUPER_CLUSTERS
void growSuperClusters(Atom *atom) {
int nold = atom->Nsclusters_max;
atom->Nsclusters_max += DELTA;
atom->siclusters = (SuperCluster*) reallocate(atom->siclusters, ALIGNMENT, atom->Nsclusters_max * sizeof(SuperCluster), nold * sizeof(SuperCluster));
atom->icluster_idx = (int*) reallocate(atom->icluster_idx, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int), nold * SCLUSTER_SIZE * sizeof(int));
atom->sicluster_bin = (int*) reallocate(atom->sicluster_bin, ALIGNMENT, atom->Nsclusters_max * sizeof(int), nold * sizeof(int));
//atom->scl_type = (int*) reallocate(atom->scl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * SCLUSTER_SIZE * sizeof(int), nold * CLUSTER_M * SCLUSTER_SIZE * sizeof(int));
atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
#endif //USE_SUPER_CLUSTERS

451
gromacs/cuda/force_lj.cu Normal file
View File

@@ -0,0 +1,451 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
extern "C" {
#include <stdio.h>
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>
}
extern "C" {
MD_FLOAT *cuda_cl_x;
MD_FLOAT *cuda_cl_v;
MD_FLOAT *cuda_cl_f;
int *cuda_neighbors;
int *cuda_numneigh;
int *cuda_natoms;
int *natoms;
int *ngatoms;
int *cuda_border_map;
int *cuda_jclusters_natoms;
MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
int isReneighboured;
int *cuda_iclusters;
int *cuda_nclusters;
int cuda_max_scl;
MD_FLOAT *cuda_scl_x;
MD_FLOAT *cuda_scl_v;
MD_FLOAT *cuda_scl_f;
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
}
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce);
extern "C"
void initDevice(Atom *atom, Neighbor *neighbor) {
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
cuda_cl_x = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_cl_v = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_cl_f = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_jclusters_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_border_map = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_PBCx = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_PBCy = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_PBCz = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_numneigh = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
cuda_neighbors = (int *) allocateGPU(atom->Nclusters_max * neighbor->maxneighs * sizeof(int));
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
isReneighboured = 1;
#ifdef USE_SUPER_CLUSTERS
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
}
extern "C"
void copyDataToCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
natoms[ci] = atom->iclusters[ci].natoms;
}
memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int));
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int cj = ncj + cg;
ngatoms[cg] = atom->jclusters[cj].natoms;
}
memcpyToGPU(cuda_jclusters_natoms, ngatoms, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_border_map, atom->border_map, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
#ifdef USE_SUPER_CLUSTERS
//alignDataToSuperclusters(atom);
if (cuda_max_scl < atom->Nsclusters_max) {
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
}
extern "C"
void copyDataFromCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
#ifdef USE_SUPER_CLUSTERS
memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
//alignDataFromSuperclusters(atom);
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
}
extern "C"
void cudaDeviceFree() {
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_f));
cuda_assert("cudaDeviceFree", cudaFree(cuda_numneigh));
cuda_assert("cudaDeviceFree", cudaFree(cuda_neighbors));
cuda_assert("cudaDeviceFree", cudaFree(cuda_natoms));
cuda_assert("cudaDeviceFree", cudaFree(cuda_border_map));
cuda_assert("cudaDeviceFree", cudaFree(cuda_jclusters_natoms));
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCx));
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCy));
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
free(natoms);
free(ngatoms);
#ifdef USE_SUPER_CLUSTERS
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
#endif //USE_SUPER_CLUSTERS
}
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_natoms,
int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
if (ci_pos >= Nclusters_local) return;
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
ci_x[CL_X_OFFSET + cii] += dt * ci_v[CL_X_OFFSET + cii];
ci_x[CL_Y_OFFSET + cii] += dt * ci_v[CL_Y_OFFSET + cii];
ci_x[CL_Z_OFFSET + cii] += dt * ci_v[CL_Z_OFFSET + cii];
}
}
__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
int *cuda_jclusters_natoms,
int *cuda_PBCx,
int *cuda_PBCy,
int *cuda_PBCz,
int Nclusters_local,
int Nclusters_ghost,
MD_FLOAT param_xprd,
MD_FLOAT param_yprd,
MD_FLOAT param_zprd) {
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
if (cg >= Nclusters_ghost) return;
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = Nclusters_local / jfac;
MD_FLOAT xprd = param_xprd;
MD_FLOAT yprd = param_yprd;
MD_FLOAT zprd = param_zprd;
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
}
}
__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
int *cuda_jclusters_natoms,
int *cuda_PBCx,
int *cuda_PBCy,
int *cuda_PBCz,
int Nsclusters_local,
int Nclusters_ghost,
MD_FLOAT param_xprd,
MD_FLOAT param_yprd,
MD_FLOAT param_zprd) {
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
if (cg >= Nclusters_ghost) return;
//int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int jfac = SCLUSTER_SIZE / CLUSTER_M;
int ncj = Nsclusters_local / jfac;
MD_FLOAT xprd = param_xprd;
MD_FLOAT yprd = param_yprd;
MD_FLOAT zprd = param_zprd;
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
}
}
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
int Nclusters_local, int Nclusters_max,
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
int ci_cj0 = CJ0_FROM_CI(ci_pos);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
int numneighs = cuda_numneigh[ci_pos];
for(int k = 0; k < numneighs; k++) {
int cj = (&cuda_neighs[ci_pos * maxneighs])[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
int cond;
#if CLUSTER_M == CLUSTER_N
cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
(ci_cj0 != cj || cii_pos != cjj_pos);
#elif CLUSTER_M < CLUSTER_N
cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
(ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
#endif
if(cond) {
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
if(half_neigh) {
atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
}
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
}
}
}
}
__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_natoms,
int Nclusters_local, MD_FLOAT dtforce) {
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
if (ci_pos >= Nclusters_local) return;
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
}
}
extern "C"
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_v, cuda_scl_f,
cuda_nclusters,
cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
}
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
extern "C"
void cudaUpdatePbc(Atom *atom, Parameter *param) {
const int threads_num = 512;
dim3 block_size = dim3(threads_num, 1, 1);;
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
#ifdef USE_SUPER_CLUSTERS
cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
atom->Nclusters_local, atom->Nclusters_ghost,
param->xprd, param->yprd, param->zprd);
#else
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
atom->Nclusters_local, atom->Nclusters_ghost,
param->xprd, param->yprd, param->zprd);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
}
extern "C"
double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
if (isReneighboured) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
}
isReneighboured = 0;
}
const int threads_num = 1;
dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
atom->Nclusters_local, atom->Nclusters_max,
cuda_numneigh, cuda_neighbors,
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
sigma6, epsilon);
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
return E-S;
}
extern "C"
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
cuda_nclusters, cuda_natoms,
atom->Nsclusters_local, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
atom->Nclusters_local, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
}

View File

@@ -0,0 +1,288 @@
extern "C" {
#include <stdio.h>
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>
}
extern "C" {
extern MD_FLOAT *cuda_cl_x;
extern MD_FLOAT *cuda_cl_v;
extern MD_FLOAT *cuda_cl_f;
extern int *cuda_neighbors;
extern int *cuda_numneigh;
extern int *cuda_natoms;
extern int *natoms;
extern int *ngatoms;
extern int *cuda_border_map;
extern int *cuda_jclusters_natoms;
extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
extern int isReneighboured;
extern int *cuda_iclusters;
extern int *cuda_nclusters;
extern MD_FLOAT *cuda_scl_x;
extern MD_FLOAT *cuda_scl_v;
extern MD_FLOAT *cuda_scl_f;
}
#ifdef USE_SUPER_CLUSTERS
extern "C"
void alignDataToSuperclusters(Atom *atom) {
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
/*
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
*/
memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
}
extern "C"
void alignDataFromSuperclusters(Atom *atom) {
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
/*
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
*/
memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
}
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
if (sci_pos >= Nsclusters_local) return;
//unsigned int ci_pos = cii_pos / CLUSTER_M;
//unsigned int scii_pos = cii_pos % CLUSTER_M;
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
//if (scii_pos >= cuda_natoms[ci_pos]) return;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];
ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];
ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];
}
}
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
if (sci_pos >= Nsclusters_local) return;
//unsigned int ci_pos = cii_pos / CLUSTER_M;
//unsigned int scii_pos = cii_pos % CLUSTER_M;
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
//if (scii_pos >= cuda_natoms[ci_pos]) return;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
}
}
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_iclusters,
int Nsclusters_local,
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
unsigned int ci_pos = scii_pos / CLUSTER_M;
unsigned int cii_pos = scii_pos % CLUSTER_M;
if (ci_pos >= cuda_nclusters[sci_pos]) return;
int ci_cj0 = CJ0_FROM_CI(ci_pos);
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
//int numneighs = cuda_numneigh[ci_pos];
int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];
for(int k = 0; k < numneighs; k++) {
int glob_j = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
int scj = glob_j / SCLUSTER_SIZE;
// TODO Make cj accessible from super cluster data alignment (not reachable right now)
int cj = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);
int cj_vec_base = cj;
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
//int cond = ci_cj0 != cj || cii_pos != cjj_pos || scj != sci_pos;
int cond = (glob_j != cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] && cii_pos != cjj_pos);
if(cond) {
MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
if(half_neigh) {
atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
}
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);
atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);
atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);
}
}
}
}
extern "C"
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
if (isReneighboured) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
}
for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
//memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);
}
memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
isReneighboured = 0;
}
const int threads_num = 1;
dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
cuda_nclusters, cuda_iclusters,
atom->Nsclusters_local,
cuda_numneigh, cuda_neighbors,
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
sigma6, epsilon);
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
return E-S;
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -1,24 +1,8 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <likwid-marker.h>
#include <math.h>
@@ -32,7 +16,8 @@
#include <eam.h>
#include <util.h>
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
/*
if(eam->nmax < atom->Nmax) {
eam->nmax = atom->Nmax;
if(eam->fp != NULL) { free(eam->fp); }
@@ -43,11 +28,13 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
int* neighs;
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
int rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; int rdrho = eam->rdrho;
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
*/
double S = getTimeStamp();
LIKWID_MARKER_START("force_eam_fp");
/*
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
@@ -207,6 +194,7 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
*/
LIKWID_MARKER_STOP("force_eam");
double E = getTimeStamp();
return E-S;

762
gromacs/force_lj.c Normal file
View File

@@ -0,0 +1,762 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <likwid-marker.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <util.h>
#include <simd.h>
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
for(int cii = 0; cii < CLUSTER_M; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
int cond;
#if CLUSTER_M == CLUSTER_N
cond = neighbor->half_neigh ? (ci_cj0 != cj || cii < cjj) :
(ci_cj0 != cj || cii != cjj);
#elif CLUSTER_M < CLUSTER_N
cond = neighbor->half_neigh ? (ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) < cjj) :
(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj);
#else
cond = neighbor->half_neigh ? (ci_cj0 != cj || cii < cjj) && (ci_cj1 != cj || cii < cjj + CLUSTER_N) :
(ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N);
#endif
if(cond) {
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
if(neighbor->half_neigh) {
cj_f[CL_X_OFFSET + cjj] -= delx * force;
cj_f[CL_Y_OFFSET + cjj] -= dely * force;
cj_f[CL_Z_OFFSET + cjj] -= delz * force;
}
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
any = 1;
addStat(stats->atoms_within_cutoff, 1);
} else {
addStat(stats->atoms_outside_cutoff, 1);
}
}
}
if(any != 0) {
addStat(stats->clusters_within_cutoff, 1);
} else {
addStat(stats->clusters_outside_cutoff, 1);
}
ci_f[CL_X_OFFSET + cii] += fix;
ci_f[CL_Y_OFFSET + cii] += fiy;
ci_f[CL_Z_OFFSET + cii] += fiz;
}
}
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ end\n");
return E-S;
}
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x3 * cond0);
mask2 = (unsigned int)(0xf - 0x7 * cond0);
mask3 = (unsigned int)(0xf - 0xf * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x3 * cond0);
mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
#endif
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
fix0 = simd_add(fix0, tx0);
fiy0 = simd_add(fiy0, ty0);
fiz0 = simd_add(fiz0, tz0);
fix2 = simd_add(fix2, tx2);
fiy2 = simd_add(fiy2, ty2);
fiz2 = simd_add(fiz2, tz2);
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
}
#else
simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
#endif
}
simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S;
}
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
}
simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs));
//addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S;
}
double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(neighbor->half_neigh) {
return computeForceLJ_2xnn_half(param, atom, neighbor, stats);
}
return computeForceLJ_2xnn_full(param, atom, neighbor, stats);
}
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix1 = simd_zero();
MD_SIMD_FLOAT fiy1 = simd_zero();
MD_SIMD_FLOAT fiz1 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
MD_SIMD_FLOAT fix3 = simd_zero();
MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
#endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
fix0 = simd_add(fix0, tx0);
fiy0 = simd_add(fiy0, ty0);
fiz0 = simd_add(fiz0, tz0);
fix1 = simd_add(fix1, tx1);
fiy1 = simd_add(fiy1, ty1);
fiz1 = simd_add(fiz1, tz1);
fix2 = simd_add(fix2, tx2);
fiy2 = simd_add(fiy2, ty2);
fiz2 = simd_add(fiz2, tz2);
fix3 = simd_add(fix3, tx3);
fiy3 = simd_add(fiy3, ty3);
fiz3 = simd_add(fiz3, tz3);
#ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
if(cj < CJ1_FROM_CI(atom->Nlocal)) {
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
}
#else
simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
#endif
}
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
simd_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy1, fiy2, fiy3);
simd_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz1, fiz2, fiz3);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
return E-S;
}
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix1 = simd_zero();
MD_SIMD_FLOAT fiy1 = simd_zero();
MD_SIMD_FLOAT fiz1 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
MD_SIMD_FLOAT fix3 = simd_zero();
MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
#endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}
simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
simd_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy1, fiy2, fiy3);
simd_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz1, fiz2, fiz3);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
return E-S;
}
double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(neighbor->half_neigh) {
return computeForceLJ_4xn_half(param, atom, neighbor, stats);
}
return computeForceLJ_4xn_full(param, atom, neighbor, stats);
}

231
gromacs/includes/atom.h Normal file
View File

@@ -0,0 +1,231 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#ifndef __ATOM_H_
#define __ATOM_H_
#define DELTA 20000
// Nbnxn layouts (as of GROMACS):
// Simd4xN: M=4, N=VECTOR_WIDTH
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
// Cuda: M=8, N=VECTOR_WIDTH
#ifdef CUDA_TARGET
# undef VECTOR_WIDTH
# define VECTOR_WIDTH 8
# define KERNEL_NAME "CUDA"
# define CLUSTER_M 8
# define CLUSTER_N VECTOR_WIDTH
#ifdef USE_SUPER_CLUSTERS
# define XX 0
# define YY 1
# define ZZ 2
# define SCLUSTER_SIZE_X 2
# define SCLUSTER_SIZE_Y 2
# define SCLUSTER_SIZE_Z 2
# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord)))
# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
# define SCLUSTER_M CLUSTER_M * SCLUSTER_SIZE
# define computeForceLJ computeForceLJSup_cuda
#else
# define computeForceLJ computeForceLJ_cuda
#endif //USE_SUPER_CLUSTERS
# define initialIntegrate cudaInitialIntegrate
# define finalIntegrate cudaFinalIntegrate
# define updatePbc cudaUpdatePbc
#else
# define CLUSTER_M 4
// Simd2xNN (here used for single-precision)
# if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2)
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN
# else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH
# define computeForceLJ computeForceLJ_4xn
# endif
# ifdef USE_REFERENCE_VERSION
# undef KERNEL_NAME
# undef computeForceLJ
# define KERNEL_NAME "Reference"
# define computeForceLJ computeForceLJ_ref
# endif
# define initialIntegrate cpuInitialIntegrate
# define finalIntegrate cpuFinalIntegrate
# define updatePbc cpuUpdatePbc
#endif
#if CLUSTER_M == CLUSTER_N
# define CJ0_FROM_CI(a) (a)
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define CJ1_FROM_SCI(a) (a)
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#else
# error "Invalid cluster configuration!"
#endif
#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
# error "Cluster N dimension can be only 2, 4 and 8"
#endif
#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
#ifdef USE_SUPER_CLUSTERS
#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1))
#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3))
#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1))
#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3))
#endif //USE_SUPER_CLUSTERS
#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#ifdef USE_SUPER_CLUSTERS
# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M)
# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M)
# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M)
# define SCL_X_OFFSET (0 * SCLUSTER_M)
# define SCL_Y_OFFSET (1 * SCLUSTER_M)
# define SCL_Z_OFFSET (2 * SCLUSTER_M)
#endif //USE_SUPER_CLUSTERS
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#ifdef USE_SUPER_CLUSTERS
# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N)
#endif //USE_SUPER_CLUSTERS
#endif
typedef struct {
int natoms;
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
} Cluster;
typedef struct {
int nclusters;
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
} SuperCluster;
typedef struct {
int Natoms, Nlocal, Nghost, Nmax;
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
int *border_map;
int *type;
int ntypes;
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
int *PBCx, *PBCy, *PBCz;
// Data in cluster format
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *cl_type;
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
#ifdef USE_SUPER_CLUSTERS
int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
MD_FLOAT *scl_x;
MD_FLOAT *scl_v;
MD_FLOAT *scl_f;
int *scl_type;
int *icluster_idx;
SuperCluster *siclusters;
int *sicluster_bin;
#endif //USE_SUPER_CLUSTERS
} Atom;
extern void initAtom(Atom*);
extern void createAtom(Atom*, Parameter*);
extern int readAtom(Atom*, Parameter*);
extern int readAtom_pdb(Atom*, Parameter*);
extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);
extern void growSuperClusters(Atom*);
#ifdef AOS
# define POS_DATA_LAYOUT "AoS"
# define atom_x(i) atom->x[(i) * 3 + 0]
# define atom_y(i) atom->x[(i) * 3 + 1]
# define atom_z(i) atom->x[(i) * 3 + 2]
/*
# define atom_vx(i) atom->vx[(i) * 3 + 0]
# define atom_vy(i) atom->vx[(i) * 3 + 1]
# define atom_vz(i) atom->vx[(i) * 3 + 2]
# define atom_fx(i) atom->fx[(i) * 3 + 0]
# define atom_fy(i) atom->fx[(i) * 3 + 1]
# define atom_fz(i) atom->fx[(i) * 3 + 2]
*/
#else
# define POS_DATA_LAYOUT "SoA"
# define atom_x(i) atom->x[i]
# define atom_y(i) atom->y[i]
# define atom_z(i) atom->z[i]
#endif
// TODO: allow to switch velocites and forces to AoS
# define atom_vx(i) atom->vx[i]
# define atom_vy(i) atom->vy[i]
# define atom_vz(i) atom->vz[i]
# define atom_fx(i) atom->fx[i]
# define atom_fy(i) atom->fy[i]
# define atom_fz(i) atom->fz[i]
#endif

View File

@@ -0,0 +1,56 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdbool.h>
//---
#include <atom.h>
#include <parameter.h>
#include <util.h>
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
}
}
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
}
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
}
}
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
}
#ifdef CUDA_TARGET
void cudaInitialIntegrate(Parameter*, Atom*);
void cudaFinalIntegrate(Parameter*, Atom*);
#endif

View File

@@ -0,0 +1,32 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_
typedef struct {
int every;
int ncalls;
int* neighbors;
int maxneighs;
int* numneigh;
int half_neigh;
} Neighbor;
extern void initNeighbor(Neighbor*, Parameter*);
extern void setupNeighbor(Parameter*, Atom*);
extern void binatoms(Atom*);
extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
extern void buildClusters(Atom*);
extern void buildClustersGPU(Atom*);
extern void defineJClusters(Atom*);
extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*);
#endif

23
gromacs/includes/pbc.h Normal file
View File

@@ -0,0 +1,23 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __PBC_H_
#define __PBC_H_
extern void initPbc();
extern void cpuUpdatePbc(Atom*, Parameter*, int);
extern void updateAtomsPbc(Atom*, Parameter*);
extern void setupPbc(Atom*, Parameter*);
#ifdef CUDA_TARGET
extern void cudaUpdatePbc(Atom*, Parameter*, int);
#if defined(USE_SUPER_CLUSTERS)
extern void setupPbcGPU(Atom*, Parameter*);
#endif //defined(USE_SUPER_CLUSTERS)
#endif
#endif

35
gromacs/includes/stats.h Normal file
View File

@@ -0,0 +1,35 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __STATS_H_
#define __STATS_H_
typedef struct {
long long int calculated_forces;
long long int num_neighs;
long long int force_iters;
long long int atoms_within_cutoff;
long long int atoms_outside_cutoff;
long long int clusters_within_cutoff;
long long int clusters_outside_cutoff;
} Stats;
void initStats(Stats *s);
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
#ifdef COMPUTE_STATS
# define addStat(stat, value) stat += value;
# define beginStatTimer() double Si = getTimeStamp();
# define endStatTimer(stat) stat += getTimeStamp() - Si;
#else
# define addStat(stat, value)
# define beginStatTimer()
# define endStatTimer(stat)
#endif
#endif

View File

@@ -1,32 +1,12 @@
/*
* =======================================================================================
*
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
*
* This file is part of MD-Bench.
*
* MD-Bench is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License along
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
* =======================================================================================
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <likwid-marker.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
#include <stdio.h>
@@ -119,114 +99,4 @@
# define DIST_TRACE(l, e)
#endif
double computeForceTracing(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep) {
MEM_TRACER_INIT;
INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
for(int i = 0; i < Nlocal; i++) {
fx[i] = 0.0;
fy[i] = 0.0;
fz[i] = 0.0;
}
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
MEM_TRACE(atom_x(i), 'R');
MEM_TRACE(atom_y(i), 'R');
MEM_TRACE(atom_z(i), 'R');
INDEX_TRACE_ATOM(i);
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
MEM_TRACE(atom->type(i), 'R');
#endif
#if defined(VARIANT) && VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
#define REPEAT_NEIGHBORS_LOOP
int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS;
for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
#endif
//DIST_TRACE_SORT(neighs, numneighs);
INDEX_TRACE(neighs, numneighs);
//DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
MEM_TRACE(neighs[k], 'R');
MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R');
MEM_TRACE(atom_z(j), 'R');
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
MEM_TRACE(atom->type(j), 'R');
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
}
}
#ifdef REPEAT_NEIGHBORS_LOOP
}
#endif
fx[i] += fix;
fy[i] += fiy;
fz[i] += fiz;
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
MEM_TRACE(fx[i], 'R');
MEM_TRACE(fx[i], 'W');
MEM_TRACE(fy[i], 'R');
MEM_TRACE(fy[i], 'W');
MEM_TRACE(fz[i], 'R');
MEM_TRACE(fz[i], 'W');
}
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
INDEX_TRACER_END;
MEM_TRACER_END;
return E-S;
}
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);

19
gromacs/includes/utils.h Normal file
View File

@@ -0,0 +1,19 @@
/*
* Temporal functions for debugging, remove before proceeding with pull request
*/
#ifndef MD_BENCH_UTILS_H
#define MD_BENCH_UTILS_H
#include <atom.h>
#include <neighbor.h>
#ifdef USE_SUPER_CLUSTERS
void verifyClusters(Atom *atom);
void verifyLayout(Atom *atom);
void checkAlignment(Atom *atom);
void showSuperclusters(Atom *atom);
void printNeighs(Atom *atom, Neighbor *neighbor);
#endif //USE_SUPER_CLUSTERS
#endif //MD_BENCH_UTILS_H

17
gromacs/includes/vtk.h Normal file
View File

@@ -0,0 +1,17 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#ifndef __VTK_H_
#define __VTK_H_
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
#endif

21
gromacs/includes/xtc.h Normal file
View File

@@ -0,0 +1,21 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#ifndef __XTC_H_
#define __XTC_H_
#ifdef XTC_OUTPUT
void xtc_init(const char *, Atom*, int);
void xtc_write(Atom*, int, int, int);
void xtc_end();
#else
#define xtc_init(a,b,c)
#define xtc_write(a,b,c,d)
#define xtc_end()
#endif
#endif

337
gromacs/main-stub.c Normal file
View File

@@ -0,0 +1,337 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
//---
#include <likwid-marker.h>
//---
#include <timing.h>
#include <allocate.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <thermo.h>
#include <eam.h>
#include <pbc.h>
#include <timers.h>
#include <util.h>
#define HLINE "----------------------------------------------------------------------------\n"
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
// Patterns
#define P_SEQ 0
#define P_FIX 1
#define P_RAND 2
void init(Parameter *param) {
param->input_file = NULL;
param->force_field = FF_LJ;
param->epsilon = 1.0;
param->sigma6 = 1.0;
param->rho = 0.8442;
param->ntypes = 4;
param->ntimes = 200;
param->nx = 1;
param->ny = 1;
param->nz = 1;
param->lattice = 1.0;
param->cutforce = 1000000.0;
param->cutneigh = param->cutforce;
param->mass = 1.0;
param->half_neigh = 0;
// Unused
param->dt = 0.005;
param->dtforce = 0.5 * param->dt;
param->nstat = 100;
param->temp = 1.44;
param->reneigh_every = 20;
param->proc_freq = 2.4;
param->eam_file = NULL;
}
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;
neighbor->numneigh = (int*) malloc(atom->Nclusters_max * sizeof(int));
neighbor->neighbors = (int*) malloc(atom->Nclusters_max * maxneighs * sizeof(int));
if(pattern == P_RAND && ncj <= nneighs) {
fprintf(stderr, "Error: P_RAND: Number of j-clusters should be higher than number of j-cluster neighbors per i-cluster!\n");
exit(-1);
}
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int j = (pattern == P_SEQ) ? CJ0_FROM_CI(ci) : 0;
int m = (pattern == P_SEQ) ? ncj : nneighs;
int k = 0;
for(int k = 0; k < nneighs; k++) {
if(pattern == P_RAND) {
int found = 0;
do {
int cj = rand() % ncj;
neighptr[k] = cj;
found = 0;
for(int l = 0; l < k; l++) {
if(neighptr[l] == cj) {
found = 1;
}
}
} while(found == 1);
} else {
neighptr[k] = j;
j = (j + 1) % m;
}
}
for(int r = 1; r < nreps; r++) {
for(int k = 0; k < nneighs; k++) {
neighptr[r * nneighs + k] = neighptr[k];
}
}
neighbor->numneigh[ci] = nneighs * nreps;
}
}
int main(int argc, const char *argv[]) {
Eam eam;
Atom atom_data;
Atom *atom = (Atom *)(&atom_data);
Neighbor neighbor;
Stats stats;
Parameter param;
char *pattern_str = NULL;
int pattern = P_SEQ;
int niclusters = 256; // Number of local i-clusters
int iclusters_natoms = CLUSTER_M; // Number of valid atoms within i-clusters
int nneighs = 9; // Number of j-cluster neighbors per i-cluster
int nreps = 1;
int csv = 0;
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-f") == 0)) {
if((param.force_field = str2ff(argv[++i])) < 0) {
fprintf(stderr, "Invalid force field!\n");
exit(-1);
}
continue;
}
if((strcmp(argv[i], "-p") == 0)) {
pattern_str = strdup(argv[++i]);
if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
else {
fprintf(stderr, "Invalid pattern!\n");
exit(-1);
}
continue;
}
if((strcmp(argv[i], "-e") == 0)) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-ni") == 0)) {
niclusters = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-na") == 0)) {
iclusters_natoms = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nn") == 0)) {
nneighs = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nr") == 0)) {
nreps = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "--freq") == 0)) {
param.proc_freq = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "--csv") == 0)) {
csv = 1;
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-f <string>: force field (lj or eam), default lj\n");
printf("-p <string>: pattern for data accesses (seq, fix or rand)\n");
printf("-n / --nsteps <int>: number of timesteps for simulation\n");
printf("-ni <int>: number of i-clusters (default 256)\n");
printf("-na <int>: number of atoms per i-cluster (default %d)\n", CLUSTER_M);
printf("-nn <int>: number of j-cluster neighbors per i-cluster (default 9)\n");
printf("-nr <int>: number of times neighbor lists should be replicated (default 1)\n");
printf("--freq <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
printf("--csv: set output as CSV style\n");
printf(HLINE);
exit(EXIT_SUCCESS);
}
}
if(pattern_str == NULL) {
pattern_str = strdup("seq\0");
}
if(param.force_field == FF_EAM) {
DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
atom->ntypes = param.ntypes;
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param.epsilon;
atom->sigma6[i] = param.sigma6;
atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG("Creating atoms...\n");
while(atom->Nmax < niclusters * iclusters_natoms) {
growAtom(atom);
}
while(atom->Nclusters_max < niclusters) {
growClusters(atom);
}
for(int ci = 0; ci < niclusters; ++ci) {
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
int *ci_type = &atom->cl_type[ci_sca_base];
for(int cii = 0; cii < iclusters_natoms; ++cii) {
ci_x[CL_X_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
ci_x[CL_Y_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
ci_x[CL_Z_OFFSET + cii] = (MD_FLOAT)(ci * iclusters_natoms + cii) * 0.00001;
ci_v[CL_X_OFFSET + cii] = 0.0;
ci_v[CL_Y_OFFSET + cii] = 0.0;
ci_v[CL_Z_OFFSET + cii] = 0.0;
ci_type[cii] = rand() % atom->ntypes;
atom->Nlocal++;
}
for(int cii = iclusters_natoms; cii < CLUSTER_M; cii++) {
ci_x[CL_X_OFFSET + cii] = INFINITY;
ci_x[CL_Y_OFFSET + cii] = INFINITY;
ci_x[CL_Z_OFFSET + cii] = INFINITY;
}
atom->iclusters[ci].natoms = iclusters_natoms;
atom->Nclusters_local++;
}
const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
if(!csv) {
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
printf("Floating-point precision: %s\n", PRECISION_STRING);
printf("Pattern: %s\n", pattern_str);
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of i-clusters: %d\n", niclusters);
printf("Number of atoms per i-cluster: %d\n", iclusters_natoms);
printf("Number of j-cluster neighbors per i-cluster: %d\n", nneighs);
printf("Number of times to replicate neighbor lists: %d\n", nreps);
printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG("Defining j-clusters...\n");
defineJClusters(atom);
DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, atom, &neighbor, i + 1);
#endif
if(param.force_field == FF_EAM) {
T_accum += computeForceEam(&eam, &param, atom, &neighbor, &stats);
} else {
T_accum += computeForceLJ(&param, atom, &neighbor, &stats);
}
}
double freq_hz = param.proc_freq * 1.e9;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
if(!csv) {
printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
if(param.proc_freq > 0.0) {
printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
}
} else {
printf("steps,pattern,niclusters,iclusters_natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
if(param.proc_freq > 0.0) {
printf(",cy/atom,cy/neigh");
}
printf("\n");
printf("%d,%s,%d,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
param.ntimes, pattern_str, niclusters, iclusters_natoms, nneighs, nreps,
estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
if(param.proc_freq > 0.0) {
printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
}
printf("\n");
}
double timer[NUMTIMER];
timer[FORCE] = T_accum;
displayStatistics(atom, &param, &stats, timer);
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

385
gromacs/main.c Normal file
View File

@@ -0,0 +1,385 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <math.h>
//--
#include <likwid-marker.h>
//--
#include <atom.h>
#include <allocate.h>
#include <device.h>
#include <eam.h>
#include <integrate.h>
#include <neighbor.h>
#include <parameter.h>
#include <pbc.h>
#include <stats.h>
#include <thermo.h>
#include <timers.h>
#include <timing.h>
#include <util.h>
#include <vtk.h>
#include <xtc.h>
#define HLINE "----------------------------------------------------------------------------\n"
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
#ifdef CUDA_TARGET
extern int isReneighboured;
extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
extern void copyDataToCUDADevice(Atom *atom);
extern void copyDataFromCUDADevice(Atom *atom);
extern void cudaDeviceFree();
#ifdef USE_SUPER_CLUSTERS
#include <utils.h>
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
#endif //USE_SUPER_CLUSTERS
#endif //CUDA_TARGET
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
param->xprd = param->nx * param->lattice;
param->yprd = param->ny * param->lattice;
param->zprd = param->nz * param->lattice;
S = getTimeStamp();
initAtom(atom);
initPbc(atom);
initStats(stats);
initNeighbor(neighbor, param);
if(param->input_file == NULL) {
createAtom(atom, param);
} else {
readAtom(atom, param);
}
setupNeighbor(param, atom);
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
setupPbcGPU(atom, param);
//setupPbc(atom, param);
#else
setupPbc(atom, param);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
initDevice(atom, neighbor);
E = getTimeStamp();
return E-S;
}
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
double S, E;
S = getTimeStamp();
LIKWID_MARKER_START("reneighbour");
updateSingleAtoms(atom);
updateAtomsPbc(atom, param);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
//setupPbcGPU(atom, param);
setupPbc(atom, param);
#else
setupPbc(atom, param);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
return E-S;
}
void printAtomState(Atom *atom) {
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
/* int nall = atom->Nlocal + atom->Nghost; */
/* for (int i=0; i<nall; i++) { */
/* printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
/* } */
}
int main(int argc, char** argv) {
double timer[NUMTIMER];
Eam eam;
Atom atom;
Neighbor neighbor;
Stats stats;
Parameter param;
LIKWID_MARKER_INIT;
#pragma omp parallel
{
LIKWID_MARKER_REGISTER("force");
//LIKWID_MARKER_REGISTER("reneighbour");
//LIKWID_MARKER_REGISTER("pbc");
}
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0)) {
readParameter(&param, argv[++i]);
continue;
}
if((strcmp(argv[i], "-f") == 0)) {
if((param.force_field = str2ff(argv[++i])) < 0) {
fprintf(stderr, "Invalid force field!\n");
exit(-1);
}
continue;
}
if((strcmp(argv[i], "-i") == 0)) {
param.input_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-e") == 0)) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nx") == 0)) {
param.nx = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-ny") == 0)) {
param.ny = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nz") == 0)) {
param.nz = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-half") == 0)) {
param.half_neigh = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-m") == 0) || (strcmp(argv[i], "--mass") == 0)) {
param.mass = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
param.cutforce = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
param.skin = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "--freq") == 0)) {
param.proc_freq = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "--vtk") == 0)) {
param.vtk_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "--xtc") == 0)) {
#ifndef XTC_OUTPUT
fprintf(stderr, "XTC not available, set XTC_OUTPUT option in config.mk file and recompile MD-Bench!");
exit(-1);
#else
param.xtc_file = strdup(argv[++i]);
#endif
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj or eam), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n");
printf("--xtc <string>: XTC file for visualization\n");
printf(HLINE);
exit(EXIT_SUCCESS);
}
}
param.cutneigh = param.cutforce + param.skin;
setup(&param, &eam, &atom, &neighbor, &stats);
printParameter(&param);
printf(HLINE);
//verifyNeigh(&atom, &neighbor);
printf("step\ttemp\t\tpressure\n");
computeThermo(0, &param, &atom);
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
#ifdef CUDA_TARGET
copyDataToCUDADevice(&atom);
#endif
if(param.force_field == FF_EAM) {
timer[FORCE] = computeForceEam(&eam, &param, &atom, &neighbor, &stats);
} else {
timer[FORCE] = computeForceLJ(&param, &atom, &neighbor, &stats);
}
timer[NEIGH] = 0.0;
timer[TOTAL] = getTimeStamp();
if(param.vtk_file != NULL) {
write_data_to_vtk_file(param.vtk_file, &atom, 0);
}
if(param.xtc_file != NULL) {
xtc_init(param.xtc_file, &atom, 0);
}
for(int n = 0; n < param.ntimes; n++) {
//printf("Step:\t%d\r\n", n);
initialIntegrate(&param, &atom);
if((n + 1) % param.reneigh_every) {
if(!((n + 1) % param.prune_every)) {
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
pruneNeighborGPU(&param, &atom, &neighbor);
#else
pruneNeighbor(&param, &atom, &neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
}
copyDataFromCUDADevice(&atom);
updatePbc(&atom, &param, 0);
copyDataToCUDADevice(&atom);
} else {
#ifdef CUDA_TARGET
copyDataFromCUDADevice(&atom);
#endif
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
#ifdef CUDA_TARGET
copyDataToCUDADevice(&atom);
isReneighboured = 1;
#endif
}
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
/*
printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
copyDataToCUDADevice(&atom);
verifyLayout(&atom);
//printClusterIndices(&atom);
*/
if(param.force_field == FF_EAM) {
timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
} else {
timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
}
/*
copyDataFromCUDADevice(&atom);
verifyLayout(&atom);
getchar();
*/
finalIntegrate(&param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
computeThermo(n + 1, &param, &atom);
}
int write_pos = !((n + 1) % param.x_out_every);
int write_vel = !((n + 1) % param.v_out_every);
if(write_pos || write_vel) {
if(param.vtk_file != NULL) {
write_data_to_vtk_file(param.vtk_file, &atom, n + 1);
}
if(param.xtc_file != NULL) {
xtc_write(&atom, n + 1, write_pos, write_vel);
}
}
}
#ifdef CUDA_TARGET
copyDataFromCUDADevice(&atom);
#endif
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
updateSingleAtoms(&atom);
computeThermo(-1, &param, &atom);
if(param.xtc_file != NULL) {
xtc_end();
}
#ifdef CUDA_TARGET
cudaDeviceFree();
#endif
printf(HLINE);
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
displayStatistics(&atom, &param, &stats, timer);
#endif
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

1326
gromacs/neighbor.c Normal file

File diff suppressed because it is too large Load Diff

411
gromacs/pbc.c Normal file
View File

@@ -0,0 +1,411 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <pbc.h>
#include <atom.h>
#include <allocate.h>
#include <neighbor.h>
#include <util.h>
#define DELTA 20000
static int NmaxGhost;
static void growPbc(Atom*);
/* exported subroutines */
void initPbc(Atom* atom) {
NmaxGhost = 0;
atom->border_map = NULL;
atom->PBCx = NULL; atom->PBCy = NULL; atom->PBCz = NULL;
}
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
DEBUG_MESSAGE("updatePbc start\n");
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
cj_x[CL_X_OFFSET + cjj] = xtmp;
cj_x[CL_Y_OFFSET + cjj] = ytmp;
cj_x[CL_Z_OFFSET + cjj] = ztmp;
if(firstUpdate) {
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}
}
if(firstUpdate) {
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
atom->jclusters[cj].bbminx = bbminx;
atom->jclusters[cj].bbmaxx = bbmaxx;
atom->jclusters[cj].bbminy = bbminy;
atom->jclusters[cj].bbmaxy = bbmaxy;
atom->jclusters[cj].bbminz = bbminz;
atom->jclusters[cj].bbmaxz = bbmaxz;
}
}
DEBUG_MESSAGE("updatePbc end\n");
}
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
DEBUG_MESSAGE("gpuUpdatePbc start\n");
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
MD_FLOAT sbbminx = INFINITY, sbbmaxx = -INFINITY;
MD_FLOAT sbbminy = INFINITY, sbbmaxy = -INFINITY;
MD_FLOAT sbbminz = INFINITY, sbbmaxz = -INFINITY;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
cj_x[CL_X_OFFSET + cjj] = xtmp;
cj_x[CL_Y_OFFSET + cjj] = ytmp;
cj_x[CL_Z_OFFSET + cjj] = ztmp;
scj_x[SCL_X_OFFSET + cjj] = sxtmp;
scj_x[SCL_Y_OFFSET + cjj] = sytmp;
scj_x[SCL_Z_OFFSET + cjj] = sztmp;
if(firstUpdate) {
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
if(sbbminx > sxtmp) { sbbminx = sxtmp; }
if(sbbmaxx < sxtmp) { sbbmaxx = sxtmp; }
if(sbbminy > sytmp) { sbbminy = sytmp; }
if(sbbmaxy < sytmp) { sbbmaxy = sytmp; }
if(sbbminz > sztmp) { sbbminz = sztmp; }
if(sbbmaxz < sztmp) { sbbmaxz = sztmp; }
}
}
if(firstUpdate) {
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
scj_x[SCL_X_OFFSET + cjj] = INFINITY;
scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
}
atom->jclusters[cj].bbminx = bbminx;
atom->jclusters[cj].bbmaxx = bbmaxx;
atom->jclusters[cj].bbminy = bbminy;
atom->jclusters[cj].bbmaxy = bbmaxy;
atom->jclusters[cj].bbminz = bbminz;
atom->jclusters[cj].bbmaxz = bbmaxz;
}
}
DEBUG_MESSAGE("gpuUpdatePbc end\n");
}
/* relocate atoms that have left domain according
* to periodic boundary conditions */
void updateAtomsPbc(Atom *atom, Parameter *param) {
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int i = 0; i < atom->Nlocal; i++) {
if(atom_x(i) < 0.0) {
atom_x(i) += xprd;
} else if(atom_x(i) >= xprd) {
atom_x(i) -= xprd;
}
if(atom_y(i) < 0.0) {
atom_y(i) += yprd;
} else if(atom_y(i) >= yprd) {
atom_y(i) -= yprd;
}
if(atom_z(i) < 0.0) {
atom_z(i) += zprd;
} else if(atom_z(i) >= zprd) {
atom_z(i) -= zprd;
}
}
}
/* setup periodic boundary conditions by
* defining ghost atoms around domain
* only creates mapping and coordinate corrections
* that are then enforced in updatePbc */
#define ADDGHOST(dx,dy,dz); \
Nghost++; \
const int cg = ncj + Nghost; \
const int cj_natoms = atom->jclusters[cj].natoms; \
atom->border_map[Nghost] = cj; \
atom->PBCx[Nghost] = dx; \
atom->PBCy[Nghost] = dy; \
atom->PBCz[Nghost] = dz; \
atom->jclusters[cg].natoms = cj_natoms; \
Nghost_atoms += cj_natoms; \
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \
int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \
for(int cjj = 0; cjj < cj_natoms; cjj++) { \
atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \
}
/* internal subroutines */
void growPbc(Atom* atom) {
int nold = NmaxGhost;
NmaxGhost += DELTA;
atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
atom->PBCx = (int*) reallocate(atom->PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
atom->PBCy = (int*) reallocate(atom->PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
atom->PBCz = (int*) reallocate(atom->PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
}
void setupPbc(Atom *atom, Parameter *param) {
DEBUG_MESSAGE("setupPbc start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
int Nghost = -1;
int Nghost_atoms = 0;
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
if((Nghost + 7) * jfac >= NmaxGhost) {
growPbc(atom);
}
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
/* 8 corners */
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
/* 12 edges */
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
}
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
// Add dummy cluster at the end
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
// increase by one to make it the ghost atom count
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
// Update created ghost clusters positions
cpuUpdatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
}
void setupPbcGPU(Atom *atom, Parameter *param) {
DEBUG_MESSAGE("setupPbcGPU start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
//int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int jfac = SCLUSTER_M / CLUSTER_M;
int ncj = atom->Nsclusters_local * jfac;
int Nghost = -1;
int Nghost_atoms = 0;
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
growClusters(atom);
//growSuperClusters(atom);
}
if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
growPbc(atom);
}
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); }
if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); }
if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); }
if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
/* 8 corners */
if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); }
if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
/* 12 edges */
if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); }
if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); }
if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); }
if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); }
if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
}
if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
growClusters(atom);
//growSuperClusters(atom);
}
// Add dummy cluster at the end
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
// increase by one to make it the ghost atom count
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
// Update created ghost clusters positions
gpuUpdatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbcGPU end\n");
}

58
gromacs/stats.c Normal file
View File

@@ -0,0 +1,58 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <atom.h>
#include <parameter.h>
#include <stats.h>
#include <timers.h>
void initStats(Stats *s) {
s->calculated_forces = 0;
s->num_neighs = 0;
s->force_iters = 0;
s->atoms_within_cutoff = 0;
s->atoms_outside_cutoff = 0;
s->clusters_within_cutoff = 0;
s->clusters_outside_cutoff = 0;
}
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
#ifdef COMPUTE_STATS
const int MxN = CLUSTER_M * CLUSTER_N;
double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
(double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
#ifdef EXPLICIT_TYPES
force_useful_volume += 1e-9 * (double)((atom->Nlocal * (param->ntimes + 1)) + stats->num_neighs) * sizeof(int);
#endif
printf("Statistics:\n");
printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
printf("\tAverage atoms per cluster: %.4f\n", avg_atoms_cluster);
printf("\tAverage neighbors per atom: %.4f\n", avg_neigh_atom);
printf("\tAverage neighbors per cluster: %.4f\n", avg_neigh_cluster);
printf("\tAverage SIMD iterations per atom: %.4f\n", avg_simd);
printf("\tTotal number of computed pair interactions: %lld\n", stats->num_neighs * MxN);
printf("\tTotal number of SIMD iterations: %lld\n", stats->force_iters);
printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->force_iters);
#ifdef USE_REFERENCE_VERSION
const double atoms_eff = (double)stats->atoms_within_cutoff / (double)(stats->atoms_within_cutoff + stats->atoms_outside_cutoff) * 100.0;
printf("\tAtoms within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->atoms_within_cutoff, stats->atoms_outside_cutoff, atoms_eff);
const double clusters_eff = (double)stats->clusters_within_cutoff / (double)(stats->clusters_within_cutoff + stats->clusters_outside_cutoff) * 100.0;
printf("\tClusters within/outside cutoff radius: %lld/%lld (%.2f%%)\n", stats->clusters_within_cutoff, stats->clusters_outside_cutoff, clusters_eff);
#endif
#endif
}

59
gromacs/tracing.c Normal file
View File

@@ -0,0 +1,59 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <tracing.h>
void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep) {
MEM_TRACER_INIT;
INDEX_TRACER_INIT;
int Nlocal = atom->Nlocal;
int* neighs;
//MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MEM_TRACE(atom_x(i), 'R');
MEM_TRACE(atom_y(i), 'R');
MEM_TRACE(atom_z(i), 'R');
INDEX_TRACE_ATOM(i);
#ifdef EXPLICIT_TYPES
MEM_TRACE(atom->type[i], 'R');
#endif
DIST_TRACE_SORT(neighs, numneighs);
INDEX_TRACE(neighs, numneighs);
DIST_TRACE(neighs, numneighs);
for(int k = 0; k < numneighs; k++) {
MEM_TRACE(neighs[k], 'R');
MEM_TRACE(atom_x(j), 'R');
MEM_TRACE(atom_y(j), 'R');
MEM_TRACE(atom_z(j), 'R');
#ifdef EXPLICIT_TYPES
MEM_TRACE(atom->type[j], 'R');
#endif
}
/*
MEM_TRACE(fx[i], 'R');
MEM_TRACE(fx[i], 'W');
MEM_TRACE(fy[i], 'R');
MEM_TRACE(fy[i], 'W');
MEM_TRACE(fz[i], 'R');
MEM_TRACE(fz[i], 'W');
*/
}
INDEX_TRACER_END;
MEM_TRACER_END;
}

332
gromacs/utils.c Normal file
View File

@@ -0,0 +1,332 @@
/*
* Temporal functions for debugging, remove before proceeding with pull request
*/
#include <stdio.h>
#include <stdlib.h>
#include <utils.h>
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
#ifdef USE_SUPER_CLUSTERS
/*
void verifyClusters(Atom *atom) {
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
for(int cii = 0; cii < CLUSTER_M; cii++, count++);
}
}
MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
count = 0;
unsigned int diffs = 0;
printf("######### %d #########\r\n", atom->Nsclusters_local);
for (int i = 0; i < atom->Nsclusters_local; i++) {
printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
//printf("%d\t", atom.siclusters[i].iclusters[j]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
x[count] = ci_x[CL_X_OFFSET + cii];
y[count] = ci_x[CL_Y_OFFSET + cii];
z[count] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
printf("######### \t #########\r\n");
}
printf("######### Diffs: %d\t #########\r\n", diffs);
printf("\r\n");
count = 0;
diffs = 0;
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
printf("######### Diffs: %d\t #########\r\n", diffs);
}
*/
void verifyLayout(Atom *atom) {
printf("verifyLayout start\r\n");
/*
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++, count++);
}
MD_FLOAT *scl_x = malloc(atom->Nsclusters_local * SCLUSTER_SIZE * 3 * CLUSTER_M * sizeof(MD_FLOAT));
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
const unsigned int atom_offset = scci;
/*
for(int cii = 0, scii = atom_offset; cii < CLUSTER_M; cii++, scii += 3) {
scl_x[CL_X_OFFSET + scii] = ci_x[CL_X_OFFSET + cii];
scl_x[CL_Y_OFFSET + scii] = ci_x[CL_Y_OFFSET + cii];
scl_x[CL_Z_OFFSET + scii] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
memcpy(&scl_x[atom_offset], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
*/
//alignDataToSuperclusters(atom);
//for (int sci = 0; sci < 2; sci++) {
for (int sci = 4; sci < 6; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE;
MD_FLOAT *sci_x = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[cii],
sci_x[cii + SCLUSTER_SIZE * CLUSTER_M], sci_x[cii + 2 * SCLUSTER_SIZE * CLUSTER_M]);
*/
printf("%d\t%d\t%f\t%f\t%f\r\n", atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx], cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
/*
//for (int cii = 0; cii < SCLUSTER_M; ++cii) {
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + cii],
sci_x[SCL_Y_OFFSET(cl_idx) + cii], sci_x[SCL_Z_OFFSET(cl_idx) + cii]);
*/
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_Z_OFFSET(cl_idx) + ciii]);
}
*/
/*
for (int scii = scl_offset; scii < scl_offset + SCLUSTER_SIZE; scii++) {
for (int cii = 0; cii < CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[SCL_X_OFFSET(scii) + cii],
sci_x[SCL_Y_OFFSET(scii) + cii], sci_x[SCL_Z_OFFSET(scii) + cii]);
}
/*
const unsigned int cl_offset = scii * 3 * CLUSTER_M;
//MD_FLOAT *sci_x = &scl_x[CI_VECTOR_BASE_INDEX(scii)];
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[CL_X_OFFSET + cii],
sci_x[CL_Y_OFFSET + cii], sci_x[CL_Z_OFFSET + cii]);
}
*/
/*
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", scl_x[CL_X_OFFSET + cii],
scl_x[CL_Y_OFFSET + cii], scl_x[CL_Z_OFFSET + cii]);
}
*/
//}
printf("##########\t##########\r\n");
}
printf("\r\n");
//for (int ci = 0; ci < 16; ci++) {
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];
//for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
printf("verifyLayout end\r\n");
/*
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
*/
}
void checkAlignment(Atom *atom) {
alignDataToSuperclusters(atom);
for (int sci = 4; sci < 6; sci++) {
MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
}
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
}
void showSuperclusters(Atom *atom) {
for (int sci = 4; sci < 6; sci++) {
MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
}
}
void printNeighs(Atom *atom, Neighbor *neighbor) {
for (int i = 0; i < atom->Nclusters_local; ++i) {
int neigh_num = neighbor->numneigh[i];
for (int j = 0; j < neigh_num; j++) {
printf("%d ", neighbor->neighbors[ i * neighbor->maxneighs + j]);
}
printf("\r\n");
}
}
void printClusterIndices(Atom *atom) {
for (int i = 0; i < atom->Nsclusters_local; ++i) {
int clusters_num = atom->siclusters[i].nclusters;
for (int j = 0; j < clusters_num; j++) {
printf("%d ", atom->icluster_idx[j + SCLUSTER_SIZE * i]);
}
printf("\r\n");
}
}
void verifyNeigh(Atom *atom, Neighbor *neighbor) {
buildNeighbor(atom, neighbor);
int *numneigh = (int*) malloc(atom->Nclusters_local * sizeof(int));
int *neighbors = (int*) malloc(atom->Nclusters_local * neighbor->maxneighs * sizeof(int*));
for (int i = 0; i < atom->Nclusters_local; ++i) {
int neigh_num = neighbor->numneigh[i];
numneigh[i] = neighbor->numneigh[i];
neighbor->numneigh[i] = 0;
for (int j = 0; j < neigh_num; j++) {
neighbors[i * neighbor->maxneighs + j] = neighbor->neighbors[i * neighbor->maxneighs + j];
neighbor->neighbors[i * neighbor->maxneighs + j] = 0;
}
}
buildNeighborGPU(atom, neighbor);
unsigned int num_diff = 0;
unsigned int neigh_diff = 0;
for (int i = 0; i < atom->Nclusters_local; ++i) {
int neigh_num = neighbor->numneigh[i];
if (numneigh[i] != neigh_num) num_diff++;
for (int j = 0; j < neigh_num; j++) {
if (neighbors[i * neighbor->maxneighs + j] !=
neighbor->neighbors[ i * neighbor->maxneighs + j]) neigh_diff++;
}
}
printf("%d\t%d\r\n", num_diff, neigh_diff);
}
#endif //USE_SUPER_CLUSTERS

243
gromacs/vtk.c Normal file
View File

@@ -0,0 +1,243 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <atom.h>
#include <vtk.h>
void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
write_local_atoms_to_vtk_file(filename, atom, timestep);
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
#ifdef USE_SUPER_CLUSTERS
write_super_clusters_to_vtk_file(filename, atom, timestep);
#endif //#ifdef USE_SUPER_CLUSTERS
}
#ifdef USE_SUPER_CLUSTERS
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nsclusters_local * SCLUSTER_M);
for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {
int factor = (rand() % 1000) + 1;
//double factor = ci * 10;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
for(int cii = 0; cii < SCLUSTER_M; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
}
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
#endif //USE_SUPER_CLUSTERS
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nghost, atom->Nghost * 2);
for(int i = 0; i < atom->Nghost; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nghost);
for(int i = 0; i < atom->Nghost; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nghost);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = 0; i < atom->Nghost; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_edges_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
int N = atom->Nclusters_local;
int tot_lines = 0;
int i = 0;
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < N; ++ci) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", N, N + tot_lines);
for(int ci = 0; ci < N; ++ci) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}
fprintf(fp, "\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_ghost_edges_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
int N = atom->Nclusters_local + atom->Nclusters_ghost;
int tot_lines = 0;
int i = 0;
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}
fprintf(fp, "\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}

56
gromacs/xtc.c Normal file
View File

@@ -0,0 +1,56 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
//---
#include <atom.h>
#include <allocate.h>
#include <xtc.h>
#ifdef XTC_OUTPUT
#include <gromacs/fileio/xtcio.h>
static struct t_fileio *xtc_file = NULL;
static rvec *x_buf = NULL;
static rvec basis[3];
void xtc_init(const char *filename, Atom *atom, int timestep) {
basis[0][XX] = 1.0;
basis[0][YY] = 0.0;
basis[0][ZZ] = 0.0;
basis[1][XX] = 0.0;
basis[1][YY] = 1.0;
basis[1][ZZ] = 0.0;
basis[2][XX] = 0.0;
basis[2][YY] = 0.0;
basis[2][ZZ] = 1.0;
xtc_file = open_xtc(filename, "w");
x_buf = (rvec *) allocate(ALIGNMENT, sizeof(rvec) * (atom->Nlocal + 1));
xtc_write(atom, timestep, 1, 1);
}
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
int i = 0;
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
i++;
}
}
write_xtc(xtc_file, atom->Nlocal, timestep, 0.0, (const rvec *) basis, (const rvec *) x_buf, 1000);
}
void xtc_end() {
free(x_buf);
close_xtc(xtc_file);
}
#endif

View File

@@ -1,4 +1,4 @@
CC = cc
CC = clang
LINKER = $(CC)
ANSI_CFLAGS = -ansi
@@ -6,7 +6,9 @@ ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
CFLAGS = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
#CFLAGS = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
ASFLAGS = -masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE

View File

@@ -7,9 +7,14 @@ ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
#CFLAGS = -O0 -g -std=c99 -fargument-noalias
CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
ASFLAGS = -masm=intel
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops # -fopenmp
CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -march=native -mavx2 -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=native -ffast-math -funroll-loops # -fopenmp
#CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp
ASFLAGS = #-masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE -DLIKWID_PERFMON
DEFINES = -D_GNU_SOURCE -DNO_ZMM_INTRIN
INCLUDES = $(LIKWID_INC)
LIBS = -lm $(LIKWID_LIB) -llikwid
LIBS = -lm

11
include_GROMACS.mk Normal file
View File

@@ -0,0 +1,11 @@
GROMACS_PATH=/apps/Gromacs/2018.1-mkl
GROMACS_INC ?= -I${GROMACS_PATH}/include
GROMACS_DEFINES ?=
GROMACS_LIB ?= -L${GROMACS_PATH}/lib64
ifeq ($(strip $(XTC_OUTPUT)),true)
INCLUDES += ${GROMACS_INC}
DEFINES += ${GROMACS_DEFINES}
LIBS += -lgromacs
LFLAGS += ${GROMACS_LIB}
endif

View File

@@ -4,14 +4,15 @@ LINKER = $(CC)
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
#OPTS = -fast -xCORE-AVX2 $(PROFILE)
#OPTS = -fast -xAVX $(PROFILE)
#OPTS = -fast -xSSE4.2 $(PROFILE)
#OPTS = -fast -no-vec $(PROFILE)
#OPTS = -fast -xHost $(PROFILE)
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
#OPTS = -Ofast -no-vec $(PROFILE)
#OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON
INCLUDES = #$(LIKWID_INC)
LIBS = -lm #$(LIKWID_LIB) -llikwid
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE
INCLUDES =
LIBS = -lm

18
include_ICX.mk Normal file
View File

@@ -0,0 +1,18 @@
CC = icx
LINKER = $(CC)
OPENMP = #-qopenmp
PROFILE = #-profile-functions -g -pg
#OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
#OPTS = -Ofast -xCORE-AVX2 $(PROFILE)
#OPTS = -Ofast -xAVX $(PROFILE)
#OPTS = -Ofast -xAVX2 $(PROFILE)
#OPTS = -Ofast -xSSE4.2 $(PROFILE)
#OPTS = -Ofast -no-vec $(PROFILE)
OPTS = -Ofast -xHost $(PROFILE)
CFLAGS = $(PROFILE) $(OPENMP) $(OPTS)
ASFLAGS = #-masm=intel
LFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
DEFINES = -std=c11 -pedantic-errors -D_GNU_SOURCE -DNO_ZMM_INTRIN
INCLUDES =
LIBS = -lm

27
include_ISA.mk Normal file
View File

@@ -0,0 +1,27 @@
ifeq ($(strip $(ISA)), SSE)
__ISA_SSE__=true
__SIMD_WIDTH_DBL__=2
else ifeq ($(strip $(ISA)), AVX)
__ISA_AVX__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX_FMA)
__ISA_AVX__=true
__ISA_AVX_FMA__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX2)
__ISA_AVX2__=true
#__SIMD_KERNEL__=true
__SIMD_WIDTH_DBL__=4
else ifeq ($(strip $(ISA)), AVX512)
__ISA_AVX512__=true
__SIMD_KERNEL__=true
__SIMD_WIDTH_DBL__=8
endif
# SIMD width is specified in double-precision, hence it may
# need to be adjusted for single-precision
ifeq ($(strip $(DATA_TYPE)), SP)
VECTOR_WIDTH=$(shell echo $$(( $(__SIMD_WIDTH_DBL__) * 2 )))
else
VECTOR_WIDTH=$(__SIMD_WIDTH_DBL__)
endif

24
include_NVCC.mk Normal file
View File

@@ -0,0 +1,24 @@
CC = nvcc
LINKER = $(CC)
ANSI_CFLAGS = -ansi
ANSI_CFLAGS += -std=c99
ANSI_CFLAGS += -pedantic
ANSI_CFLAGS += -Wextra
#
# A100 + Native
#CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# A40 + Native
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# Cascade Lake
#CFLAGS = -O3 -march=cascadelake -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# For GROMACS kernels, we need at least sm_61 due to atomicAdd with doubles
# TODO: Check if this is required for full neighbor-lists and just compile kernel for that case if not
#CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
ASFLAGS = -masm=intel
LFLAGS =
DEFINES = -D_GNU_SOURCE -DCUDA_TARGET -DNO_ZMM_INTRIN #-DLIKWID_PERFMON
INCLUDES = $(LIKWID_INC)
LIBS = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid

17
include_ONEAPI.mk Normal file
View File

@@ -0,0 +1,17 @@
CC = icx
LINKER = $(CC)
OPENMP = -qopenmp-simd
PROFILE = #-g -pg
#OPTS = -Ofast -no-vec
#OPTS = -Ofast -xSSE4.2
#OPTS = -Ofast -xAVX
#OPTS = -Ofast -xCORE-AVX2
OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high
#OPTS = -Ofast -xHost
CFLAGS = $(PROFILE) $(OPTS) $(OPENMP)
ASFLAGS = -masm=intel
LFLAGS = $(PROFILE) $(OPTS)
DEFINES = -D_GNU_SOURCE -DNOCHUNK
INCLUDES =
LIBS = -lm

536
lammps/atom.c Normal file
View File

@@ -0,0 +1,536 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <atom.h>
#include <allocate.h>
#include <device.h>
#include <util.h>
#define DELTA 20000
#ifndef MAXLINE
#define MAXLINE 4096
#endif
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
void initAtom(Atom *atom) {
atom->x = NULL; atom->y = NULL; atom->z = NULL;
atom->vx = NULL; atom->vy = NULL; atom->vz = NULL;
atom->fx = NULL; atom->fy = NULL; atom->fz = NULL;
atom->Natoms = 0;
atom->Nlocal = 0;
atom->Nghost = 0;
atom->Nmax = 0;
atom->type = NULL;
atom->ntypes = 0;
atom->epsilon = NULL;
atom->sigma6 = NULL;
atom->cutforcesq = NULL;
atom->cutneighsq = NULL;
atom->radius = NULL;
atom->av = NULL;
atom->r = NULL;
DeviceAtom *d_atom = &(atom->d_atom);
d_atom->x = NULL; d_atom->y = NULL; d_atom->z = NULL;
d_atom->vx = NULL; d_atom->vy = NULL; d_atom->vz = NULL;
d_atom->fx = NULL; d_atom->fy = NULL; d_atom->fz = NULL;
d_atom->border_map = NULL;
d_atom->type = NULL;
d_atom->epsilon = NULL;
d_atom->sigma6 = NULL;
d_atom->cutforcesq = NULL;
d_atom->cutneighsq = NULL;
}
void createAtom(Atom *atom, Parameter *param) {
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = param->xprd;
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = param->yprd;
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = param->zprd;
atom->Natoms = 4 * param->nx * param->ny * param->nz;
atom->Nlocal = 0;
atom->ntypes = param->ntypes;
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0));
int ilo = (int) (xlo / (0.5 * alat) - 1);
int ihi = (int) (xhi / (0.5 * alat) + 1);
int jlo = (int) (ylo / (0.5 * alat) - 1);
int jhi = (int) (yhi / (0.5 * alat) + 1);
int klo = (int) (zlo / (0.5 * alat) - 1);
int khi = (int) (zhi / (0.5 * alat) + 1);
ilo = MAX(ilo, 0);
ihi = MIN(ihi, 2 * param->nx - 1);
jlo = MAX(jlo, 0);
jhi = MIN(jhi, 2 * param->ny - 1);
klo = MAX(klo, 0);
khi = MIN(khi, 2 * param->nz - 1);
MD_FLOAT xtmp, ytmp, ztmp, vxtmp, vytmp, vztmp;
int i, j, k, m, n;
int sx = 0; int sy = 0; int sz = 0;
int ox = 0; int oy = 0; int oz = 0;
int subboxdim = 8;
while(oz * subboxdim <= khi) {
k = oz * subboxdim + sz;
j = oy * subboxdim + sy;
i = ox * subboxdim + sx;
if(((i + j + k) % 2 == 0) &&
(i >= ilo) && (i <= ihi) &&
(j >= jlo) && (j <= jhi) &&
(k >= klo) && (k <= khi)) {
xtmp = 0.5 * alat * i;
ytmp = 0.5 * alat * j;
ztmp = 0.5 * alat * k;
if( xtmp >= xlo && xtmp < xhi &&
ytmp >= ylo && ytmp < yhi &&
ztmp >= zlo && ztmp < zhi ) {
n = k * (2 * param->ny) * (2 * param->nx) +
j * (2 * param->nx) +
i + 1;
for(m = 0; m < 5; m++) {
myrandom(&n);
}
vxtmp = myrandom(&n);
for(m = 0; m < 5; m++){
myrandom(&n);
}
vytmp = myrandom(&n);
for(m = 0; m < 5; m++) {
myrandom(&n);
}
vztmp = myrandom(&n);
if(atom->Nlocal == atom->Nmax) {
growAtom(atom);
}
atom_x(atom->Nlocal) = xtmp;
atom_y(atom->Nlocal) = ytmp;
atom_z(atom->Nlocal) = ztmp;
atom_vx(atom->Nlocal) = vxtmp;
atom_vy(atom->Nlocal) = vytmp;
atom_vz(atom->Nlocal) = vztmp;
atom->type[atom->Nlocal] = rand() % atom->ntypes;
atom->Nlocal++;
}
}
sx++;
if(sx == subboxdim) { sx = 0; sy++; }
if(sy == subboxdim) { sy = 0; sz++; }
if(sz == subboxdim) { sz = 0; ox++; }
if(ox * subboxdim > ihi) { ox = 0; oy++; }
if(oy * subboxdim > jhi) { oy = 0; oz++; }
}
}
int type_str2int(const char *type) {
if(strncmp(type, "Ar", 2) == 0) { return 0; } // Argon
fprintf(stderr, "Invalid atom type: %s\n", type);
exit(-1);
return -1;
}
int readAtom(Atom* atom, Parameter* param) {
int len = strlen(param->input_file);
if(strncmp(&param->input_file[len - 4], ".pdb", 4) == 0) { return readAtom_pdb(atom, param); }
if(strncmp(&param->input_file[len - 4], ".gro", 4) == 0) { return readAtom_gro(atom, param); }
if(strncmp(&param->input_file[len - 4], ".dmp", 4) == 0) { return readAtom_dmp(atom, param); }
if(strncmp(&param->input_file[len - 3], ".in", 3) == 0) { return readAtom_in(atom, param); }
fprintf(stderr, "Invalid input file extension: %s\nValid choices are: pdb, gro, dmp, in\n", param->input_file);
exit(-1);
return -1;
}
int readAtom_pdb(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int read_atoms = 0;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
while(!feof(fp)) {
readline(line, fp);
char *item = strtok(line, " ");
if(strncmp(item, "CRYST1", 6) == 0) {
param->xlo = 0.0;
param->xhi = atof(strtok(NULL, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
// alpha, beta, gamma, sGroup, z
} else if(strncmp(item, "ATOM", 4) == 0) {
char *label;
int atom_id, comp_id;
MD_FLOAT occupancy, charge;
atom_id = atoi(strtok(NULL, " ")) - 1;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type_str2int(strtok(NULL, " "));
label = strtok(NULL, " ");
comp_id = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = 0.0;
atom_vy(atom_id) = 0.0;
atom_vz(atom_id) = 0.0;
occupancy = atof(strtok(NULL, " "));
charge = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
read_atoms++;
} else if(strncmp(item, "HEADER", 6) == 0 ||
strncmp(item, "REMARK", 6) == 0 ||
strncmp(item, "MODEL", 5) == 0 ||
strncmp(item, "TER", 3) == 0 ||
strncmp(item, "ENDMDL", 6) == 0) {
// Do nothing
} else {
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
}
if(!read_atoms) {
fprintf(stderr, "Input error: No atoms read!\n");
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_gro(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
char desc[MAXLINE];
int read_atoms = 0;
int atoms_to_read = 0;
int i = 0;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
readline(desc, fp);
for(i = 0; desc[i] != '\n'; i++);
desc[i] = '\0';
readline(line, fp);
atoms_to_read = atoi(strtok(line, " "));
fprintf(stdout, "System: %s with %d atoms\n", desc, atoms_to_read);
while(!feof(fp) && read_atoms < atoms_to_read) {
readline(line, fp);
char *label = strtok(line, " ");
int type = type_str2int(strtok(NULL, " "));
int atom_id = atoi(strtok(NULL, " ")) - 1;
atom_id = read_atoms;
while(atom_id + 1 >= atom->Nmax) {
growAtom(atom);
}
atom->type[atom_id] = type;
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = atof(strtok(NULL, " "));
atom_vy(atom_id) = atof(strtok(NULL, " "));
atom_vz(atom_id) = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id] + 1, atom->ntypes);
atom->Natoms++;
atom->Nlocal++;
read_atoms++;
}
if(!feof(fp)) {
readline(line, fp);
param->xlo = 0.0;
param->xhi = atof(strtok(line, " "));
param->ylo = 0.0;
param->yhi = atof(strtok(NULL, " "));
param->zlo = 0.0;
param->zhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
param->yprd = param->yhi - param->ylo;
param->zprd = param->zhi - param->zlo;
}
if(read_atoms != atoms_to_read) {
fprintf(stderr, "Input error: Number of atoms read do not match (%d/%d).\n", read_atoms, atoms_to_read);
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", read_atoms, param->input_file);
fclose(fp);
return read_atoms;
}
int readAtom_dmp(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int natoms = 0;
int read_atoms = 0;
int atom_id = -1;
int ts = -1;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
while(!feof(fp) && ts < 1 && !read_atoms) {
readline(line, fp);
if(strncmp(line, "ITEM: ", 6) == 0) {
char *item = &line[6];
if(strncmp(item, "TIMESTEP", 8) == 0) {
readline(line, fp);
ts = atoi(line);
} else if(strncmp(item, "NUMBER OF ATOMS", 15) == 0) {
readline(line, fp);
natoms = atoi(line);
atom->Natoms = natoms;
atom->Nlocal = natoms;
while(atom->Nlocal >= atom->Nmax) {
growAtom(atom);
}
} else if(strncmp(item, "BOX BOUNDS pp pp pp", 19) == 0) {
readline(line, fp);
param->xlo = atof(strtok(line, " "));
param->xhi = atof(strtok(NULL, " "));
param->xprd = param->xhi - param->xlo;
readline(line, fp);
param->ylo = atof(strtok(line, " "));
param->yhi = atof(strtok(NULL, " "));
param->yprd = param->yhi - param->ylo;
readline(line, fp);
param->zlo = atof(strtok(line, " "));
param->zhi = atof(strtok(NULL, " "));
param->zprd = param->zhi - param->zlo;
} else if(strncmp(item, "ATOMS id type x y z vx vy vz", 28) == 0) {
for(int i = 0; i < natoms; i++) {
readline(line, fp);
atom_id = atoi(strtok(line, " ")) - 1;
atom->type[atom_id] = atoi(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = atof(strtok(NULL, " "));
atom_vy(atom_id) = atof(strtok(NULL, " "));
atom_vz(atom_id) = atof(strtok(NULL, " "));
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
read_atoms++;
}
} else {
fprintf(stderr, "Invalid item: %s\n", item);
exit(-1);
return -1;
}
} else {
fprintf(stderr, "Invalid input from file, expected item reference but got:\n%s\n", line);
exit(-1);
return -1;
}
}
if(ts < 0 || !natoms || !read_atoms) {
fprintf(stderr, "Input error: atom data was not read!\n");
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
return natoms;
}
int readAtom_in(Atom* atom, Parameter* param) {
FILE *fp = fopen(param->input_file, "r");
char line[MAXLINE];
int natoms = 0;
int atom_id = 0;
if(!fp) {
fprintf(stderr, "Could not open input file: %s\n", param->input_file);
exit(-1);
return -1;
}
readline(line, fp);
natoms = atoi(strtok(line, " "));
param->xlo = atof(strtok(NULL, " "));
param->xhi = atof(strtok(NULL, " "));
param->ylo = atof(strtok(NULL, " "));
param->yhi = atof(strtok(NULL, " "));
param->zlo = atof(strtok(NULL, " "));
param->zhi = atof(strtok(NULL, " "));
atom->Natoms = natoms;
atom->Nlocal = natoms;
atom->ntypes = 1;
while(atom->Nlocal >= atom->Nmax) {
growAtom(atom);
}
for(int i = 0; i < natoms; i++) {
readline(line, fp);
// TODO: store mass per atom
char *s_mass = strtok(line, " ");
if(strncmp(s_mass, "inf", 3) == 0) {
// Set atom's mass to INFINITY
} else {
param->mass = atof(s_mass);
}
atom->radius[atom_id] = atof(strtok(NULL, " "));
atom_x(atom_id) = atof(strtok(NULL, " "));
atom_y(atom_id) = atof(strtok(NULL, " "));
atom_z(atom_id) = atof(strtok(NULL, " "));
atom_vx(atom_id) = atof(strtok(NULL, " "));
atom_vy(atom_id) = atof(strtok(NULL, " "));
atom_vz(atom_id) = atof(strtok(NULL, " "));
atom->type[atom_id] = 0;
atom->ntypes = MAX(atom->type[atom_id], atom->ntypes);
atom_id++;
}
if(!natoms) {
fprintf(stderr, "Input error: atom data was not read!\n");
exit(-1);
return -1;
}
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param->epsilon;
atom->sigma6[i] = param->sigma6;
atom->cutneighsq[i] = param->cutneigh * param->cutneigh;
atom->cutforcesq[i] = param->cutforce * param->cutforce;
}
fprintf(stdout, "Read %d atoms from %s\n", natoms, param->input_file);
return natoms;
}
void growAtom(Atom *atom) {
DeviceAtom *d_atom = &(atom->d_atom);
int nold = atom->Nmax;
atom->Nmax += DELTA;
#undef REALLOC
#define REALLOC(p,t,ns,os); \
atom->p = (t *) reallocate(atom->p, ALIGNMENT, ns, os); \
atom->d_atom.p = (t *) reallocateGPU(atom->d_atom.p, ns);
#ifdef AOS
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
#else
REALLOC(x, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(y, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(z, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(vx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(vy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(vz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(fx, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(fy, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
REALLOC(fz, MD_FLOAT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
#endif
REALLOC(type, int, atom->Nmax * sizeof(int), nold * sizeof(int));
// DEM
atom->radius = (MD_FLOAT *) reallocate(atom->radius, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
atom->av = (MD_FLOAT*) reallocate(atom->av, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
atom->r = (MD_FLOAT*) reallocate(atom->r, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 4, nold * sizeof(MD_FLOAT) * 4);
}

178
lammps/cuda/force.cu Normal file
View File

@@ -0,0 +1,178 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
//---
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//---
#include <likwid-marker.h>
extern "C" {
#include <allocate.h>
#include <atom.h>
#include <allocate.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <timing.h>
#include <util.h>
}
// cuda kernel
__global__ void calc_force(DeviceAtom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= Nlocal) {
return;
}
DeviceAtom *atom = &a;
const int numneighs = neigh_numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
for(int k = 0; k < numneighs; k++) {
int j = neigh_neighbors[Nlocal * k + i];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
}
}
atom_fx(i) = fix;
atom_fy(i) = fiy;
atom_fz(i) = fiz;
}
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, DeviceAtom a) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if( i >= Nlocal ) {
return;
}
DeviceAtom *atom = &a;
atom_vx(i) += dtforce * atom_fx(i);
atom_vy(i) += dtforce * atom_fy(i);
atom_vz(i) += dtforce * atom_fz(i);
atom_x(i) = atom_x(i) + dt * atom_vx(i);
atom_y(i) = atom_y(i) + dt * atom_vy(i);
atom_z(i) = atom_z(i) + dt * atom_vz(i);
}
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, DeviceAtom a) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if( i >= Nlocal ) {
return;
}
DeviceAtom *atom = &a;
atom_vx(i) += dtforce * atom_fx(i);
atom_vy(i) += dtforce * atom_fy(i);
atom_vz(i) += dtforce * atom_fz(i);
}
extern "C" {
void finalIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, atom->d_atom);
cuda_assert("kernel_final_integrate", cudaPeekAtLastError());
cuda_assert("kernel_final_integrate", cudaDeviceSynchronize());
if(reneigh) {
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
}
}
void initialIntegrate_cuda(bool reneigh, Parameter *param, Atom *atom) {
const int Nlocal = atom->Nlocal;
const int num_threads_per_block = get_num_threads();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, atom->d_atom);
cuda_assert("kernel_initial_integrate", cudaPeekAtLastError());
cuda_assert("kernel_initial_integrate", cudaDeviceSynchronize());
if(reneigh) {
memcpyFromGPU(atom->vx, atom->d_atom.vx, sizeof(MD_FLOAT) * atom->Nlocal * 3);
}
}
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor) {
const int num_threads_per_block = get_num_threads();
int Nlocal = atom->Nlocal;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
/*
int nDevices;
cudaGetDeviceCount(&nDevices);
size_t free, total;
for(int i = 0; i < nDevices; ++i) {
cudaMemGetInfo( &free, &total );
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
}
*/
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
// memsetGPU(atom->d_atom.fx, 0, sizeof(MD_FLOAT) * Nlocal * 3);
cudaProfilerStart();
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
double S = getTimeStamp();
LIKWID_MARKER_START("force");
calc_force <<< num_blocks, num_threads_per_block >>> (atom->d_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, neighbor->d_neighbor.neighbors, neighbor->d_neighbor.numneigh);
cuda_assert("calc_force", cudaPeekAtLastError());
cuda_assert("calc_force", cudaDeviceSynchronize());
cudaProfilerStop();
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
return E-S;
}
}

293
lammps/cuda/neighbor.cu Normal file
View File

@@ -0,0 +1,293 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//---
extern "C" {
#include <atom.h>
#include <device.h>
#include <parameter.h>
#include <neighbor.h>
#include <util.h>
}
extern MD_FLOAT xprd, yprd, zprd;
extern MD_FLOAT bininvx, bininvy, bininvz;
extern int mbinxlo, mbinylo, mbinzlo;
extern int nbinx, nbiny, nbinz;
extern int mbinx, mbiny, mbinz; // n bins in x, y, z
extern int mbins; //total number of bins
extern int atoms_per_bin; // max atoms per bin
extern MD_FLOAT cutneighsq; // neighbor cutoff squared
extern int nmax;
extern int nstencil; // # of bins in stencil
extern int* stencil; // stencil list of bin offsets
static int* c_stencil = NULL;
static int* c_resize_needed = NULL;
static int* c_new_maxneighs = NULL;
static Binning c_binning {
.bincount = NULL,
.bins = NULL,
.mbins = 0,
.atoms_per_bin = 0
};
__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
int ix, iy, iz;
if(xin >= np.xprd) {
ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
} else if(xin >= 0.0) {
ix = (int)(xin * np.bininvx) - np.mbinxlo;
} else {
ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
}
if(yin >= np.yprd) {
iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
} else if(yin >= 0.0) {
iy = (int)(yin * np.bininvy) - np.mbinylo;
} else {
iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
}
if(zin >= np.zprd) {
iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
} else if(zin >= 0.0) {
iz = (int)(zin * np.bininvz) - np.mbinzlo;
} else {
iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
}
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
}
/* sorts the contents of a bin to make it comparable to the CPU version */
/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= mbins) {
return;
}
int atoms_in_bin = bincount[i];
int *bin_ptr = &bins[i * atoms_per_bin];
int sorted;
do {
sorted = 1;
int tmp;
for(int index = 0; index < atoms_in_bin - 1; index++){
if (bin_ptr[index] > bin_ptr[index + 1]){
tmp = bin_ptr[index];
bin_ptr[index] = bin_ptr[index + 1];
bin_ptr[index + 1] = tmp;
sorted = 0;
}
}
} while (!sorted);
}
__global__ void binatoms_kernel(DeviceAtom a, int nall, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) {
DeviceAtom* atom = &a;
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nall) {
return;
}
MD_FLOAT x = atom_x(i);
MD_FLOAT y = atom_y(i);
MD_FLOAT z = atom_z(i);
int ibin = coord2bin_device(x, y, z, np);
int ac = atomicAdd(&bincount[ibin], 1);
if(ac < atoms_per_bin){
bins[ibin * atoms_per_bin + ac] = i;
} else {
atomicMax(resize_needed, ac);
}
}
__global__ void compute_neighborhood(
DeviceAtom a, DeviceNeighbor neigh, Neighbor_params np, int nlocal, int maxneighs, int nstencil, int* stencil,
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nlocal) {
return;
}
DeviceAtom *atom = &a;
DeviceNeighbor *neighbor = &neigh;
int* neighptr = &(neighbor->neighbors[i]);
int n = 0;
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
#ifdef EXPLICIT_TYPES
int type_i = atom->type[i];
#endif
for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int* loc_bin = &bins[jbin * atoms_per_bin];
for(int m = 0; m < bincount[jbin]; m++) {
int j = loc_bin[m];
if ( j == i ){
continue;
}
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
int type_j = atom->type[j];
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
#else
const MD_FLOAT cutoff = cutneighsq;
#endif
if( rsq <= cutoff ) {
int idx = nlocal * n;
neighptr[idx] = j;
n += 1;
}
}
}
neighbor->numneigh[i] = n;
if(n > maxneighs) {
atomicMax(new_maxneighs, n);
}
}
void binatoms_cuda(Atom *atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
int nall = atom->Nlocal + atom->Nghost;
int resize = 1;
const int num_blocks = ceil((float) nall / (float) threads_per_block);
while(resize > 0) {
resize = 0;
memsetGPU(c_binning->bincount, 0, c_binning->mbins * sizeof(int));
memsetGPU(c_resize_needed, 0, sizeof(int));
binatoms_kernel<<<num_blocks, threads_per_block>>>(atom->d_atom, atom->Nlocal + atom->Nghost, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
cuda_assert("binatoms", cudaPeekAtLastError());
cuda_assert("binatoms", cudaDeviceSynchronize());
memcpyFromGPU(&resize, c_resize_needed, sizeof(int));
if(resize) {
c_binning->atoms_per_bin *= 2;
c_binning->bins = (int *) reallocateGPU(c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int));
}
}
atoms_per_bin = c_binning->atoms_per_bin;
const int sortBlocks = ceil((float) mbins / (float) threads_per_block);
sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
cuda_assert("sort_bin", cudaPeekAtLastError());
cuda_assert("sort_bin", cudaDeviceSynchronize());
}
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor) {
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
const int num_threads_per_block = get_num_threads();
int nall = atom->Nlocal + atom->Nghost;
cudaProfilerStart();
// TODO move all of this initialization into its own method
if(c_stencil == NULL) {
c_stencil = (int *) allocateGPU(nstencil * sizeof(int));
memcpyToGPU(c_stencil, stencil, nstencil * sizeof(int));
}
if(c_binning.mbins == 0) {
c_binning.mbins = mbins;
c_binning.atoms_per_bin = atoms_per_bin;
c_binning.bincount = (int *) allocateGPU(c_binning.mbins * sizeof(int));
c_binning.bins = (int *) allocateGPU(c_binning.mbins * c_binning.atoms_per_bin * sizeof(int));
}
Neighbor_params np {
.xprd = xprd,
.yprd = yprd,
.zprd = zprd,
.bininvx = bininvx,
.bininvy = bininvy,
.bininvz = bininvz,
.mbinxlo = mbinxlo,
.mbinylo = mbinylo,
.mbinzlo = mbinzlo,
.nbinx = nbinx,
.nbiny = nbiny,
.nbinz = nbinz,
.mbinx = mbinx,
.mbiny = mbiny,
.mbinz = mbinz
};
if(c_resize_needed == NULL) {
c_resize_needed = (int *) allocateGPU(sizeof(int));
}
/* bin local & ghost atoms */
binatoms_cuda(atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
if(c_new_maxneighs == NULL) {
c_new_maxneighs = (int *) allocateGPU(sizeof(int));
}
int resize = 1;
if(nall > nmax) {
nmax = nall;
d_neighbor->neighbors = (int *) reallocateGPU(d_neighbor->neighbors, nmax * neighbor->maxneighs * sizeof(int));
d_neighbor->numneigh = (int *) reallocateGPU(d_neighbor->numneigh, nmax * sizeof(int));
}
/* loop over each atom, storing neighbors */
while(resize) {
resize = 0;
memsetGPU(c_new_maxneighs, 0, sizeof(int));
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(atom->d_atom, *d_neighbor,
np, atom->Nlocal, neighbor->maxneighs, nstencil, c_stencil,
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
c_new_maxneighs,
cutneighsq);
cuda_assert("compute_neighborhood", cudaPeekAtLastError());
cuda_assert("compute_neighborhood", cudaDeviceSynchronize());
int new_maxneighs;
memcpyFromGPU(&new_maxneighs, c_new_maxneighs, sizeof(int));
if(new_maxneighs > neighbor->maxneighs){
resize = 1;
}
if(resize) {
printf("RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2;
printf("NEW SIZE %d\n", neighbor->maxneighs);
neighbor->neighbors = (int *) reallocateGPU(neighbor->neighbors, atom->Nmax * neighbor->maxneighs * sizeof(int));
}
}
cudaProfilerStop();
}

111
lammps/cuda/pbc.cu Normal file
View File

@@ -0,0 +1,111 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
//---
extern "C" {
#include <allocate.h>
#include <atom.h>
#include <device.h>
#include <pbc.h>
#include <util.h>
}
extern int NmaxGhost;
extern int *PBCx, *PBCy, *PBCz;
static int c_NmaxGhost = 0;
static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
__global__ void computeAtomsPbcUpdate(DeviceAtom a, int nlocal, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) {
const int i = blockIdx.x * blockDim.x + threadIdx.x;
DeviceAtom *atom = &a;
if(i >= nlocal) {
return;
}
if (atom_x(i) < 0.0) {
atom_x(i) += xprd;
} else if (atom_x(i) >= xprd) {
atom_x(i) -= xprd;
}
if (atom_y(i) < 0.0) {
atom_y(i) += yprd;
} else if (atom_y(i) >= yprd) {
atom_y(i) -= yprd;
}
if (atom_z(i) < 0.0) {
atom_z(i) += zprd;
} else if (atom_z(i) >= zprd) {
atom_z(i) -= zprd;
}
}
__global__ void computePbcUpdate(DeviceAtom a, int nlocal, int nghost, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
const int i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= nghost) {
return;
}
DeviceAtom* atom = &a;
int *border_map = atom->border_map;
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
}
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc_cuda(Atom *atom, Parameter *param, bool reneigh) {
const int num_threads_per_block = get_num_threads();
if(reneigh) {
memcpyToGPU(atom->d_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(atom->d_atom.type, atom->type, sizeof(int) * atom->Nmax);
if(c_NmaxGhost < NmaxGhost) {
c_NmaxGhost = NmaxGhost;
c_PBCx = (int *) reallocateGPU(c_PBCx, NmaxGhost * sizeof(int));
c_PBCy = (int *) reallocateGPU(c_PBCy, NmaxGhost * sizeof(int));
c_PBCz = (int *) reallocateGPU(c_PBCz, NmaxGhost * sizeof(int));
atom->d_atom.border_map = (int *) reallocateGPU(atom->d_atom.border_map, NmaxGhost * sizeof(int));
}
memcpyToGPU(c_PBCx, PBCx, NmaxGhost * sizeof(int));
memcpyToGPU(c_PBCy, PBCy, NmaxGhost * sizeof(int));
memcpyToGPU(c_PBCz, PBCz, NmaxGhost * sizeof(int));
memcpyToGPU(atom->d_atom.border_map, atom->border_map, NmaxGhost * sizeof(int));
cuda_assert("updatePbc.reneigh", cudaPeekAtLastError());
cuda_assert("updatePbc.reneigh", cudaDeviceSynchronize());
}
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, atom->Nghost, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
cuda_assert("updatePbc", cudaPeekAtLastError());
cuda_assert("updatePbc", cudaDeviceSynchronize());
}
void updateAtomsPbc_cuda(Atom* atom, Parameter *param) {
const int num_threads_per_block = get_num_threads();
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(atom->d_atom, atom->Nlocal, xprd, yprd, zprd);
cuda_assert("computeAtomsPbcUpdate", cudaPeekAtLastError());
cuda_assert("computeAtomsPbcUpdate", cudaDeviceSynchronize());
memcpyFromGPU(atom->x, atom->d_atom.x, sizeof(MD_FLOAT) * atom->Nlocal * 3);
}

29
lammps/device_spec.c Normal file
View File

@@ -0,0 +1,29 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <device.h>
#ifdef CUDA_TARGET
void initDevice(Atom *atom, Neighbor *neighbor) {
DeviceAtom *d_atom = &(atom->d_atom);
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
}
#endif

123
lammps/force_dem.c Normal file
View File

@@ -0,0 +1,123 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <math.h>
//---
#include <atom.h>
#include <likwid-marker.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
double computeForceDemFullNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT k_s = param->k_s;
MD_FLOAT k_dn = param->k_dn;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0;
atom_fy(i) = 0.0;
atom_fz(i) = 0.0;
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT irad = atom->radius[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT jrad = atom->radius[j];
MD_FLOAT xj = atom_x(j);
MD_FLOAT yj = atom_y(j);
MD_FLOAT zj = atom_z(j);
MD_FLOAT delx = xtmp - xj;
MD_FLOAT dely = ytmp - yj;
MD_FLOAT delz = ztmp - zj;
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT r = sqrt(rsq);
MD_FLOAT p = irad + jrad - r;
if(p > 0) {
MD_FLOAT delvx = atom_vx(i) - atom_vx(j);
MD_FLOAT delvy = atom_vy(i) - atom_vy(j);
MD_FLOAT delvz = atom_vz(i) - atom_vz(j);
MD_FLOAT vr = sqrt(delvx * delvx + delvy * delvy + delvz * delvz);
// normal distance
MD_FLOAT nx = delx / r;
MD_FLOAT ny = dely / r;
MD_FLOAT nz = delz / r;
// normal contact velocity
MD_FLOAT nvx = delvx / vr;
MD_FLOAT nvy = delvy / vr;
MD_FLOAT nvz = delvz / vr;
// forces
atom_fx(i) += k_s * p * nx - k_dn * nvx;
atom_fy(i) += k_s * p * ny - k_dn * nvy;
atom_fz(i) += k_s * p * nz - k_dn * nvz;
atom_fx(j) += -k_s * p * nx - k_dn * nvx;
atom_fy(j) += -k_s * p * ny - k_dn * nvy;
atom_fz(j) += -k_s * p * nz - k_dn * nvz;
// contact position
//MD_FLOAT cterm = jrad / (irad + jrad);
//MD_FLOAT cx = xj + cterm * delx;
//MD_FLOAT cy = yj + cterm * dely;
//MD_FLOAT cz = zj + cterm * delz;
// delta contact and particle position
//MD_FLOAT delcx = cx - xtmp;
//MD_FLOAT delcy = cy - ytmp;
//MD_FLOAT delcz = cz - ztmp;
// contact velocity
//MD_FLOAT cvx = (atom_vx(i) + atom_avx(i) * delcx) - (atom_vx(j) + atom_avx(j) * (cx - xj));
//MD_FLOAT cvy = (atom_vy(i) + atom_avy(i) * delcy) - (atom_vy(j) + atom_avy(j) * (cy - yj));
//MD_FLOAT cvz = (atom_vz(i) + atom_avz(i) * delcz) - (atom_vz(j) + atom_avz(j) * (cz - zj));
// tangential force
//fix += MIN(kdt * vtsq, kf * fnx) * tx;
//fiy += MIN(kdt * vtsq, kf * fny) * ty;
//fiz += MIN(kdt * vtsq, kf * fnz) * tz;
// torque
//MD_FLOAT taux = delcx * ftx;
//MD_FLOAT tauy = delcy * fty;
//MD_FLOAT tauz = delcz * ftz;
}
#ifdef USE_REFERENCE_VERSION
addStat(stats->atoms_within_cutoff, 1);
} else {
addStat(stats->atoms_outside_cutoff, 1);
#endif
}
}
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
return E-S;
}

197
lammps/force_eam.c Normal file
View File

@@ -0,0 +1,197 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <likwid-marker.h>
#include <math.h>
#include <allocate.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <eam.h>
#include <util.h>
double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(eam->nmax < atom->Nmax) {
eam->nmax = atom->Nmax;
if(eam->fp != NULL) { free(eam->fp); }
eam->fp = (MD_FLOAT *) allocate(ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT));
}
int Nlocal = atom->Nlocal;
int* neighs;
int ntypes = atom->ntypes; MD_FLOAT* fp = eam->fp;
MD_FLOAT* rhor_spline = eam->rhor_spline; MD_FLOAT* frho_spline = eam->frho_spline; MD_FLOAT* z2r_spline = eam->z2r_spline;
MD_FLOAT rdr = eam->rdr; int nr = eam->nr; int nr_tot = eam->nr_tot; MD_FLOAT rdrho = eam->rdrho;
int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
double S = getTimeStamp();
LIKWID_MARKER_START("force_eam_fp");
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT rhoi = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
#pragma ivdep
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
#else
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
#endif
if(rsq < cutforcesq) {
MD_FLOAT p = sqrt(rsq) * rdr + 1.0;
int m = (int)(p);
m = m < nr - 1 ? m : nr - 1;
p -= m;
p = p < 1.0 ? p : 1.0;
#ifdef EXPLICIT_TYPES
rhoi += ((rhor_spline[type_ij * nr_tot + m * 7 + 3] * p +
rhor_spline[type_ij * nr_tot + m * 7 + 4]) * p +
rhor_spline[type_ij * nr_tot + m * 7 + 5]) * p +
rhor_spline[type_ij * nr_tot + m * 7 + 6];
#else
rhoi += ((rhor_spline[m * 7 + 3] * p +
rhor_spline[m * 7 + 4]) * p +
rhor_spline[m * 7 + 5]) * p +
rhor_spline[m * 7 + 6];
#endif
}
}
#ifdef EXPLICIT_TYPES
const int type_ii = type_i * type_i;
#endif
MD_FLOAT p = 1.0 * rhoi * rdrho + 1.0;
int m = (int)(p);
m = MAX(1, MIN(m, nrho - 1));
p -= m;
p = MIN(p, 1.0);
#ifdef EXPLICIT_TYPES
fp[i] = (frho_spline[type_ii * nrho_tot + m * 7 + 0] * p +
frho_spline[type_ii * nrho_tot + m * 7 + 1]) * p +
frho_spline[type_ii * nrho_tot + m * 7 + 2];
#else
fp[i] = (frho_spline[m * 7 + 0] * p + frho_spline[m * 7 + 1]) * p + frho_spline[m * 7 + 2];
#endif
}
LIKWID_MARKER_STOP("force_eam_fp");
// We still need to update fp for PBC atoms
for(int i = 0; i < atom->Nghost; i++) {
fp[Nlocal + i] = fp[atom->border_map[i]];
}
LIKWID_MARKER_START("force_eam");
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
#pragma ivdep
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
#else
const MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
#endif
if(rsq < cutforcesq) {
MD_FLOAT r = sqrt(rsq);
MD_FLOAT p = r * rdr + 1.0;
int m = (int)(p);
m = m < nr - 1 ? m : nr - 1;
p -= m;
p = p < 1.0 ? p : 1.0;
// rhoip = derivative of (density at atom j due to atom i)
// rhojp = derivative of (density at atom i due to atom j)
// phi = pair potential energy
// phip = phi'
// z2 = phi * r
// z2p = (phi * r)' = (phi' r) + phi
// psip needs both fp[i] and fp[j] terms since r_ij appears in two
// terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
// hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
#ifdef EXPLICIT_TYPES
MD_FLOAT rhoip = (rhor_spline[type_ij * nr_tot + m * 7 + 0] * p +
rhor_spline[type_ij * nr_tot + m * 7 + 1]) * p +
rhor_spline[type_ij * nr_tot + m * 7 + 2];
MD_FLOAT z2p = (z2r_spline[type_ij * nr_tot + m * 7 + 0] * p +
z2r_spline[type_ij * nr_tot + m * 7 + 1]) * p +
z2r_spline[type_ij * nr_tot + m * 7 + 2];
MD_FLOAT z2 = ((z2r_spline[type_ij * nr_tot + m * 7 + 3] * p +
z2r_spline[type_ij * nr_tot + m * 7 + 4]) * p +
z2r_spline[type_ij * nr_tot + m * 7 + 5]) * p +
z2r_spline[type_ij * nr_tot + m * 7 + 6];
#else
MD_FLOAT rhoip = (rhor_spline[m * 7 + 0] * p + rhor_spline[m * 7 + 1]) * p + rhor_spline[m * 7 + 2];
MD_FLOAT z2p = (z2r_spline[m * 7 + 0] * p + z2r_spline[m * 7 + 1]) * p + z2r_spline[m * 7 + 2];
MD_FLOAT z2 = ((z2r_spline[m * 7 + 3] * p +
z2r_spline[m * 7 + 4]) * p +
z2r_spline[m * 7 + 5]) * p +
z2r_spline[m * 7 + 6];
#endif
MD_FLOAT recip = 1.0 / r;
MD_FLOAT phi = z2 * recip;
MD_FLOAT phip = z2p * recip - phi * recip;
MD_FLOAT psip = fp[i] * rhoip + fp[j] * rhoip + phip;
MD_FLOAT fpair = -psip * recip;
fix += delx * fpair;
fiy += dely * fpair;
fiz += delz * fpair;
//fpair *= 0.5;
}
}
atom_fx(i) = fix;
atom_fy(i) = fiy;
atom_fz(i) = fiz;
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
LIKWID_MARKER_STOP("force_eam");
double E = getTimeStamp();
return E-S;
}

250
lammps/force_lj.c Normal file
View File

@@ -0,0 +1,250 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <stdlib.h>
//---
#include <atom.h>
#include <likwid-marker.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#ifdef __SIMD_KERNEL__
#include <simd.h>
#endif
double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
int Nlocal = atom->Nlocal;
int* neighs;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0;
atom_fy(i) = 0.0;
atom_fz(i) = 0.0;
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
#ifdef USE_REFERENCE_VERSION
addStat(stats->atoms_within_cutoff, 1);
} else {
addStat(stats->atoms_outside_cutoff, 1);
#endif
}
}
atom_fx(i) += fix;
atom_fy(i) += fiy;
atom_fz(i) += fiz;
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
return E-S;
}
double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
int Nlocal = atom->Nlocal;
int* neighs;
#ifndef EXPLICIT_TYPES
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
#endif
for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0;
atom_fy(i) = 0.0;
atom_fz(i) = 0.0;
}
double S = getTimeStamp();
LIKWID_MARKER_START("forceLJ-halfneigh");
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#ifdef EXPLICIT_TYPES
const int type_i = atom->type[i];
#endif
// Pragma required to vectorize the inner loop
#ifdef ENABLE_OMP_SIMD
#pragma omp simd reduction(+: fix,fiy,fiz)
#endif
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
#ifdef EXPLICIT_TYPES
const int type_j = atom->type[j];
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
// We do not need to update forces for ghost atoms
if(j < Nlocal) {
atom_fx(j) -= delx * force;
atom_fy(j) -= dely * force;
atom_fz(j) -= delz * force;
}
}
}
atom_fx(i) += fix;
atom_fy(i) += fiy;
atom_fz(i) += fiz;
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
}
LIKWID_MARKER_STOP("forceLJ-halfneigh");
double E = getTimeStamp();
return E-S;
}
double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
for(int i = 0; i < Nlocal; i++) {
atom_fx(i) = 0.0;
atom_fy(i) = 0.0;
atom_fz(i) = 0.0;
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#ifndef __SIMD_KERNEL__
fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
exit(-1);
#else
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_SIMD_INT numneighs_vec = simd_int_broadcast(numneighs);
MD_SIMD_FLOAT xtmp = simd_broadcast(atom_x(i));
MD_SIMD_FLOAT ytmp = simd_broadcast(atom_y(i));
MD_SIMD_FLOAT ztmp = simd_broadcast(atom_z(i));
MD_SIMD_FLOAT fix = simd_zero();
MD_SIMD_FLOAT fiy = simd_zero();
MD_SIMD_FLOAT fiz = simd_zero();
for(int k = 0; k < numneighs; k += VECTOR_WIDTH) {
// If the last iteration of this loop is separated from the rest, this mask can be set only there
MD_SIMD_MASK mask_numneighs = simd_mask_int_cond_lt(simd_int_add(simd_int_broadcast(k), simd_int_seq()), numneighs_vec);
MD_SIMD_INT j = simd_int_mask_load(&neighs[k], mask_numneighs);
#ifdef AOS
MD_SIMD_INT j3 = simd_int_add(simd_int_add(j, j), j); // j * 3
MD_SIMD_FLOAT delx = xtmp - simd_gather(j3, &(atom->x[0]), sizeof(MD_FLOAT));
MD_SIMD_FLOAT dely = ytmp - simd_gather(j3, &(atom->x[1]), sizeof(MD_FLOAT));
MD_SIMD_FLOAT delz = ztmp - simd_gather(j3, &(atom->x[2]), sizeof(MD_FLOAT));
#else
MD_SIMD_FLOAT delx = xtmp - simd_gather(j, atom->x, sizeof(MD_FLOAT));
MD_SIMD_FLOAT dely = ytmp - simd_gather(j, atom->y, sizeof(MD_FLOAT));
MD_SIMD_FLOAT delz = ztmp - simd_gather(j, atom->z, sizeof(MD_FLOAT));
#endif
MD_SIMD_FLOAT rsq = simd_fma(delx, delx, simd_fma(dely, dely, simd_mul(delz, delz)));
MD_SIMD_MASK cutoff_mask = simd_mask_and(mask_numneighs, simd_mask_cond_lt(rsq, cutforcesq_vec));
MD_SIMD_FLOAT sr2 = simd_reciprocal(rsq);
MD_SIMD_FLOAT sr6 = simd_mul(sr2, simd_mul(sr2, simd_mul(sr2, sigma6_vec)));
MD_SIMD_FLOAT force = simd_mul(c48_vec, simd_mul(sr6, simd_mul(simd_sub(sr6, c05_vec), simd_mul(sr2, eps_vec))));
fix = simd_masked_add(fix, simd_mul(delx, force), cutoff_mask);
fiy = simd_masked_add(fiy, simd_mul(dely, force), cutoff_mask);
fiz = simd_masked_add(fiz, simd_mul(delz, force), cutoff_mask);
}
atom_fx(i) += simd_h_reduce_sum(fix);
atom_fy(i) += simd_h_reduce_sum(fiy);
atom_fz(i) += simd_h_reduce_sum(fiz);
}
#endif
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
return E-S;
}

102
lammps/includes/atom.h Normal file
View File

@@ -0,0 +1,102 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <parameter.h>
#ifndef __ATOM_H_
#define __ATOM_H_
#ifdef CUDA_TARGET
# define KERNEL_NAME "CUDA"
# define computeForceLJFullNeigh computeForceLJFullNeigh_cuda
# define initialIntegrate initialIntegrate_cuda
# define finalIntegrate finalIntegrate_cuda
# define buildNeighbor buildNeighbor_cuda
# define updatePbc updatePbc_cuda
# define updateAtomsPbc updateAtomsPbc_cuda
#else
# ifdef USE_SIMD_KERNEL
# define KERNEL_NAME "SIMD"
# define computeForceLJFullNeigh computeForceLJFullNeigh_simd
# else
# define KERNEL_NAME "plain-C"
# define computeForceLJFullNeigh computeForceLJFullNeigh_plain_c
# endif
# define initialIntegrate initialIntegrate_cpu
# define finalIntegrate finalIntegrate_cpu
# define buildNeighbor buildNeighbor_cpu
# define updatePbc updatePbc_cpu
# define updateAtomsPbc updateAtomsPbc_cpu
#endif
typedef struct {
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *fx, *fy, *fz;
int *border_map;
int *type;
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
} DeviceAtom;
typedef struct {
int Natoms, Nlocal, Nghost, Nmax;
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *fx, *fy, *fz;
int *border_map;
int *type;
int ntypes;
MD_FLOAT *epsilon;
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
// DEM
MD_FLOAT *radius;
MD_FLOAT *av;
MD_FLOAT *r;
// Device data
DeviceAtom d_atom;
} Atom;
extern void initAtom(Atom*);
extern void createAtom(Atom*, Parameter*);
extern int readAtom(Atom*, Parameter*);
extern int readAtom_pdb(Atom*, Parameter*);
extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern int readAtom_in(Atom*, Parameter*);
extern void growAtom(Atom*);
#ifdef AOS
# define POS_DATA_LAYOUT "AoS"
# define atom_x(i) atom->x[(i) * 3 + 0]
# define atom_y(i) atom->x[(i) * 3 + 1]
# define atom_z(i) atom->x[(i) * 3 + 2]
# define atom_vx(i) atom->vx[(i) * 3 + 0]
# define atom_vy(i) atom->vx[(i) * 3 + 1]
# define atom_vz(i) atom->vx[(i) * 3 + 2]
# define atom_fx(i) atom->fx[(i) * 3 + 0]
# define atom_fy(i) atom->fx[(i) * 3 + 1]
# define atom_fz(i) atom->fx[(i) * 3 + 2]
#else
# define POS_DATA_LAYOUT "SoA"
# define atom_x(i) atom->x[i]
# define atom_y(i) atom->y[i]
# define atom_z(i) atom->z[i]
# define atom_vx(i) atom->vx[i]
# define atom_vy(i) atom->vy[i]
# define atom_vz(i) atom->vz[i]
# define atom_fx(i) atom->fx[i]
# define atom_fy(i) atom->fy[i]
# define atom_fz(i) atom->fz[i]
#endif
#endif

View File

@@ -0,0 +1,34 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdbool.h>
//---
#include <parameter.h>
#include <atom.h>
void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
for(int i = 0; i < atom->Nlocal; i++) {
atom_vx(i) += param->dtforce * atom_fx(i);
atom_vy(i) += param->dtforce * atom_fy(i);
atom_vz(i) += param->dtforce * atom_fz(i);
atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
}
}
void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom) {
for(int i = 0; i < atom->Nlocal; i++) {
atom_vx(i) += param->dtforce * atom_fx(i);
atom_vy(i) += param->dtforce * atom_fy(i);
atom_vz(i) += param->dtforce * atom_fz(i);
}
}
#ifdef CUDA_TARGET
void initialIntegrate_cuda(bool, Parameter*, Atom*);
void finalIntegrate_cuda(bool, Parameter*, Atom*);
#endif

View File

@@ -0,0 +1,55 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __NEIGHBOR_H_
#define __NEIGHBOR_H_
typedef struct {
int *neighbors;
int *numneigh;
} DeviceNeighbor;
typedef struct {
int every;
int ncalls;
int maxneighs;
int half_neigh;
int *neighbors;
int *numneigh;
// Device data
DeviceNeighbor d_neighbor;
} Neighbor;
typedef struct {
MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
int mbinxlo; int mbinylo; int mbinzlo;
int nbinx; int nbiny; int nbinz;
int mbinx; int mbiny; int mbinz;
} Neighbor_params;
typedef struct {
int* bincount;
int* bins;
int mbins;
int atoms_per_bin;
} Binning;
extern void initNeighbor(Neighbor*, Parameter*);
extern void setupNeighbor(Parameter*);
extern void binatoms(Atom*);
extern void buildNeighbor_cpu(Atom*, Neighbor*);
extern void sortAtom(Atom*);
#ifdef CUDA_TARGET
extern void buildNeighbor_cuda(Atom*, Neighbor*);
#endif
#endif

24
lammps/includes/pbc.h Normal file
View File

@@ -0,0 +1,24 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdbool.h>
//---
#include <atom.h>
#include <parameter.h>
#ifndef __PBC_H_
#define __PBC_H_
extern void initPbc();
extern void updatePbc_cpu(Atom*, Parameter*, bool);
extern void updateAtomsPbc_cpu(Atom*, Parameter*);
extern void setupPbc(Atom*, Parameter*);
#ifdef CUDA_TARGET
extern void updatePbc_cuda(Atom*, Parameter*, bool);
extern void updateAtomsPbc_cuda(Atom*, Parameter*);
#endif
#endif

32
lammps/includes/stats.h Normal file
View File

@@ -0,0 +1,32 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#include <parameter.h>
#ifndef __STATS_H_
#define __STATS_H_
typedef struct {
long long int total_force_neighs;
long long int total_force_iters;
long long int atoms_within_cutoff;
long long int atoms_outside_cutoff;
} Stats;
void initStats(Stats *s);
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
#ifdef COMPUTE_STATS
# define addStat(stat, value) stat += value;
# define beginStatTimer() double Si = getTimeStamp();
# define endStatTimer(stat) stat += getTimeStamp() - Si;
#else
# define addStat(stat, value)
# define beginStatTimer()
# define endStatTimer(stat)
#endif
#endif

102
lammps/includes/tracing.h Normal file
View File

@@ -0,0 +1,102 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
#include <stdio.h>
#include <stdlib.h>
#endif
#ifndef VECTOR_WIDTH
# define VECTOR_WIDTH 8
#endif
#ifndef TRACER_CONDITION
# define TRACER_CONDITION (!(timestep % param->every))
#endif
#ifdef MEM_TRACER
# define MEM_TRACER_INIT FILE *mem_tracer_fp; \
if(TRACER_CONDITION) { \
char mem_tracer_fn[128]; \
snprintf(mem_tracer_fn, sizeof mem_tracer_fn, "mem_tracer_%d.out", timestep); \
mem_tracer_fp = fopen(mem_tracer_fn, "w");
}
# define MEM_TRACER_END if(TRACER_CONDITION) { fclose(mem_tracer_fp); }
# define MEM_TRACE(addr, op) if(TRACER_CONDITION) { fprintf(mem_tracer_fp, "%c: %p\n", op, (void *)(&(addr))); }
#else
# define MEM_TRACER_INIT
# define MEM_TRACER_END
# define MEM_TRACE(addr, op)
#endif
#ifdef INDEX_TRACER
# define INDEX_TRACER_INIT FILE *index_tracer_fp; \
if(TRACER_CONDITION) { \
char index_tracer_fn[128]; \
snprintf(index_tracer_fn, sizeof index_tracer_fn, "index_tracer_%d.out", timestep); \
index_tracer_fp = fopen(index_tracer_fn, "w"); \
}
# define INDEX_TRACER_END if(TRACER_CONDITION) { fclose(index_tracer_fp); }
# define INDEX_TRACE_NATOMS(nl, ng, mn) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "N: %d %d %d\n", nl, ng, mn); }
# define INDEX_TRACE_ATOM(a) if(TRACER_CONDITION) { fprintf(index_tracer_fp, "A: %d\n", a); }
# define INDEX_TRACE(l, e) if(TRACER_CONDITION) { \
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
fprintf(index_tracer_fp, "I: "); \
for(int __j = 0; __j < __e; ++__j) { \
fprintf(index_tracer_fp, "%d ", l[__i + __j]); \
} \
fprintf(index_tracer_fp, "\n"); \
} \
}
# define DIST_TRACE_SORT(l, e) if(TRACER_CONDITION) { \
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
if(__e > 1) { \
for(int __j = __i; __j < __i + __e - 1; ++__j) { \
for(int __k = __i; __k < __i + __e - (__j - __i) - 1; ++__k) { \
if(l[__k] > l[__k + 1]) { \
int __t = l[__k]; \
l[__k] = l[__k + 1]; \
l[__k + 1] = __t; \
} \
} \
} \
} \
} \
}
# define DIST_TRACE(l, e) if(TRACER_CONDITION) { \
for(int __i = 0; __i < (e); __i += VECTOR_WIDTH) { \
int __e = (((e) - __i) < VECTOR_WIDTH) ? ((e) - __i) : VECTOR_WIDTH; \
if(__e > 1) { \
fprintf(index_tracer_fp, "D: "); \
for(int __j = 0; __j < __e - 1; ++__j) { \
int __dist = abs(l[__i + __j + 1] - l[__i + __j]); \
fprintf(index_tracer_fp, "%d ", __dist); \
} \
fprintf(index_tracer_fp, "\n"); \
} \
} \
}
#else
# define INDEX_TRACER_INIT
# define INDEX_TRACER_END
# define INDEX_TRACE_NATOMS(nl, ng, mn)
# define INDEX_TRACE_ATOM(a)
# define INDEX_TRACE(l, e)
# define DIST_TRACE_SORT(l, e)
# define DIST_TRACE(l, e)
#endif
extern void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timestep);

12
lammps/includes/vtk.h Normal file
View File

@@ -0,0 +1,12 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <atom.h>
#ifndef __VTK_H_
#define __VTK_H_
extern int write_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
#endif

305
lammps/main-stub.c Normal file
View File

@@ -0,0 +1,305 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdio.h>
#include <string.h>
//---
#include <likwid-marker.h>
//---
#include <timing.h>
#include <allocate.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <thermo.h>
#include <eam.h>
#include <pbc.h>
#include <timers.h>
#include <util.h>
#define HLINE "----------------------------------------------------------------------------\n"
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
// Patterns
#define P_SEQ 0
#define P_FIX 1
#define P_RAND 2
void init(Parameter *param) {
param->input_file = NULL;
param->force_field = FF_LJ;
param->epsilon = 1.0;
param->sigma6 = 1.0;
param->rho = 0.8442;
param->ntypes = 4;
param->ntimes = 200;
param->nx = 1;
param->ny = 1;
param->nz = 1;
param->lattice = 1.0;
param->cutforce = 1000000.0;
param->cutneigh = param->cutforce;
param->mass = 1.0;
param->half_neigh = 0;
// Unused
param->dt = 0.005;
param->dtforce = 0.5 * param->dt;
param->nstat = 100;
param->temp = 1.44;
param->reneigh_every = 20;
param->proc_freq = 2.4;
param->eam_file = NULL;
}
// Show debug messages
#define DEBUG(msg) printf(msg)
// Do not show debug messages
//#define DEBUG(msg)
void createNeighbors(Atom *atom, Neighbor *neighbor, int pattern, int nneighs, int nreps) {
const int maxneighs = nneighs * nreps;
neighbor->numneigh = (int*) malloc(atom->Nmax * sizeof(int));
neighbor->neighbors = (int*) malloc(atom->Nmax * maxneighs * sizeof(int));
if(pattern == P_RAND && atom->Nlocal <= nneighs) {
fprintf(stderr, "Error: When using random pattern, number of atoms should be higher than number of neighbors per atom!\n");
exit(-1);
}
for(int i = 0; i < atom->Nlocal; i++) {
int *neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
int j = (pattern == P_SEQ) ? (i + 1) : 0;
int m = (pattern == P_SEQ) ? atom->Nlocal : nneighs;
for(int k = 0; k < nneighs; k++) {
if(pattern == P_RAND) {
int found = 0;
do {
j = rand() % atom->Nlocal;
neighptr[k] = j;
found = (int)(i == j);
for(int l = 0; l < k; l++) {
if(neighptr[l] == j) {
found = 1;
}
}
} while(found == 1);
} else {
neighptr[k] = j;
j = (j + 1) % m;
}
}
for(int r = 1; r < nreps; r++) {
for(int k = 0; k < nneighs; k++) {
neighptr[r * nneighs + k] = neighptr[k];
}
}
neighbor->numneigh[i] = nneighs * nreps;
}
}
int main(int argc, const char *argv[]) {
Eam eam;
Atom atom_data;
Atom *atom = (Atom *)(&atom_data);
Neighbor neighbor;
Stats stats;
Parameter param;
char *pattern_str = NULL;
int pattern = P_SEQ;
int natoms = 256;
int nneighs = 76;
int nreps = 1;
int csv = 0;
LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force");
DEBUG("Initializing parameters...\n");
init(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-f") == 0)) {
if((param.force_field = str2ff(argv[++i])) < 0) {
fprintf(stderr, "Invalid force field!\n");
exit(-1);
}
continue;
}
if((strcmp(argv[i], "-p") == 0)) {
pattern_str = strdup(argv[++i]);
if(strncmp(pattern_str, "seq", 3) == 0) { pattern = P_SEQ; }
else if(strncmp(pattern_str, "fix", 3) == 0) { pattern = P_FIX; }
else if(strncmp(pattern_str, "rand", 3) == 0) { pattern = P_RAND; }
else {
fprintf(stderr, "Invalid pattern!\n");
exit(-1);
}
continue;
}
if((strcmp(argv[i], "-e") == 0)) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-na") == 0)) {
natoms = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nn") == 0)) {
nneighs = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nr") == 0)) {
nreps = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "--freq") == 0)) {
param.proc_freq = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "--csv") == 0)) {
csv = 1;
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-f <string>: force field (lj or eam), default lj\n");
printf("-p <string>: pattern for data accesses (seq, fix or rand)\n");
printf("-n / --nsteps <int>: number of timesteps for simulation\n");
printf("-na <int>: number of atoms (default 256)\n");
printf("-nn <int>: number of neighbors per atom (default 76)\n");
printf("-nr <int>: number of times neighbor lists should be replicated (default 1)\n");
printf("--freq <real>: set CPU frequency (GHz) and display average cycles per atom and neighbors\n");
printf("--csv: set output as CSV style\n");
printf(HLINE);
exit(EXIT_SUCCESS);
}
}
if(pattern_str == NULL) {
pattern_str = strdup("seq\0");
}
if(param.force_field == FF_EAM) {
DEBUG("Initializing EAM parameters...\n");
initEam(&eam, &param);
}
DEBUG("Initializing atoms...\n");
initAtom(atom);
initStats(&stats);
atom->ntypes = param.ntypes;
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
atom->epsilon[i] = param.epsilon;
atom->sigma6[i] = param.sigma6;
atom->cutneighsq[i] = param.cutneigh * param.cutneigh;
atom->cutforcesq[i] = param.cutforce * param.cutforce;
}
DEBUG("Creating atoms...\n");
for(int i = 0; i < natoms; ++i) {
while(atom->Nlocal > atom->Nmax - natoms) {
growAtom(atom);
}
atom->type[atom->Nlocal] = rand() % atom->ntypes;
atom_x(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
atom_y(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
atom_z(atom->Nlocal) = (MD_FLOAT)(i) * 0.00001;
atom_vx(atom->Nlocal) = 0.0;
atom_vy(atom->Nlocal) = 0.0;
atom_vz(atom->Nlocal) = 0.0;
atom->Nlocal++;
}
const double estim_atom_volume = (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT));
const double estim_neighbors_volume = (double)(atom->Nlocal * (nneighs + 2) * sizeof(int));
const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + estim_neighbors_volume);
if(!csv) {
printf("Pattern: %s\n", pattern_str);
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of atoms: %d\n", natoms);
printf("Number of neighbors per atom: %d\n", nneighs);
printf("Number of times to replicate neighbor lists: %d\n", nreps);
printf("Estimated total data volume (kB): %.4f\n", estim_volume / 1000.0);
printf("Estimated atom data volume (kB): %.4f\n", estim_atom_volume / 1000.0);
printf("Estimated neighborlist data volume (kB): %.4f\n", estim_neighbors_volume / 1000.0);
}
DEBUG("Initializing neighbor lists...\n");
initNeighbor(&neighbor, &param);
DEBUG("Creating neighbor lists...\n");
createNeighbors(atom, &neighbor, pattern, nneighs, nreps);
DEBUG("Computing forces...\n");
double T_accum = 0.0;
for(int i = 0; i < param.ntimes; i++) {
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, atom, &neighbor, i + 1);
#endif
if(param.force_field == FF_EAM) {
computeForceEam(&eam, &param, atom, &neighbor, &stats);
} else {
if(param.half_neigh) {
T_accum += computeForceLJHalfNeigh(&param, atom, &neighbor, &stats);
} else {
T_accum += computeForceLJFullNeigh(&param, atom, &neighbor, &stats);
}
}
}
double freq_hz = param.proc_freq * 1.e9;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(nneighs);
if(!csv) {
printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
if(param.proc_freq > 0.0) {
printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
}
} else {
printf("steps,pattern,natoms,nneighs,nreps,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
if(param.proc_freq > 0.0) {
printf(",cy/atom,cy/neigh");
}
printf("\n");
printf("%d,%s,%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
param.ntimes, pattern_str, natoms, nneighs, nreps,
estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
if(param.proc_freq > 0.0) {
printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
}
printf("\n");
}
double timer[NUMTIMER];
timer[FORCE] = T_accum;
displayStatistics(atom, &param, &stats, timer);
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

285
lammps/main.c Normal file
View File

@@ -0,0 +1,285 @@
/*
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
* All rights reserved. This file is part of MD-Bench.
* Use of this source code is governed by a LGPL-3.0
* license that can be found in the LICENSE file.
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <math.h>
#include <float.h>
#include <likwid-marker.h>
#include <allocate.h>
#include <atom.h>
#include <device.h>
#include <eam.h>
#include <integrate.h>
#include <thermo.h>
#include <timing.h>
#include <neighbor.h>
#include <parameter.h>
#include <pbc.h>
#include <stats.h>
#include <timers.h>
#include <util.h>
#include <vtk.h>
#define HLINE "----------------------------------------------------------------------------\n"
extern double computeForceLJFullNeigh_plain_c(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJFullNeigh_simd(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
#ifdef CUDA_TARGET
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*);
#endif
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
param->xprd = param->nx * param->lattice;
param->yprd = param->ny * param->lattice;
param->zprd = param->nz * param->lattice;
S = getTimeStamp();
initAtom(atom);
initPbc(atom);
initStats(stats);
initNeighbor(neighbor, param);
if(param->input_file == NULL) {
createAtom(atom, param);
} else {
readAtom(atom, param);
}
setupNeighbor(param);
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
setupPbc(atom, param);
initDevice(atom, neighbor);
updatePbc(atom, param, true);
buildNeighbor(atom, neighbor);
E = getTimeStamp();
return E-S;
}
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
double S, E;
S = getTimeStamp();
LIKWID_MARKER_START("reneighbour");
updateAtomsPbc(atom, param);
setupPbc(atom, param);
updatePbc(atom, param, true);
//sortAtom(atom);
buildNeighbor(atom, neighbor);
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
return E-S;
}
void printAtomState(Atom *atom) {
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
// int nall = atom->Nlocal + atom->Nghost;
// for (int i=0; i<nall; i++) {
// printf("%d %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]);
// }
}
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) {
return computeForceEam(eam, param, atom, neighbor, stats);
} else if(param->force_field == FF_DEM) {
if(param->half_neigh) {
fprintf(stderr, "Error: DEM cannot use half neighbor-lists!\n");
return 0.0;
} else {
return computeForceDemFullNeigh(param, atom, neighbor, stats);
}
}
if(param->half_neigh) {
return computeForceLJHalfNeigh(param, atom, neighbor, stats);
}
#ifdef CUDA_TARGET
return computeForceLJFullNeigh(param, atom, neighbor);
#else
return computeForceLJFullNeigh(param, atom, neighbor, stats);
#endif
}
void writeInput(Parameter *param, Atom *atom) {
FILE *fpin = fopen("input.in", "w");
fprintf(fpin, "0,%f,0,%f,0,%f\n", param->xprd, param->yprd, param->zprd);
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fpin, "1,%f,%f,%f,%f,%f,%f\n", atom_x(i), atom_y(i), atom_z(i), atom_vx(i), atom_vy(i), atom_vz(i));
}
fclose(fpin);
}
int main(int argc, char** argv) {
double timer[NUMTIMER];
Eam eam;
Atom atom;
Neighbor neighbor;
Stats stats;
Parameter param;
LIKWID_MARKER_INIT;
#pragma omp parallel
{
LIKWID_MARKER_REGISTER("force");
//LIKWID_MARKER_REGISTER("reneighbour");
//LIKWID_MARKER_REGISTER("pbc");
}
initParameter(&param);
for(int i = 0; i < argc; i++) {
if((strcmp(argv[i], "-p") == 0)) {
readParameter(&param, argv[++i]);
continue;
}
if((strcmp(argv[i], "-f") == 0)) {
if((param.force_field = str2ff(argv[++i])) < 0) {
fprintf(stderr, "Invalid force field!\n");
exit(-1);
}
continue;
}
if((strcmp(argv[i], "-i") == 0)) {
param.input_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-e") == 0)) {
param.eam_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0)) {
param.ntimes = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nx") == 0)) {
param.nx = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-ny") == 0)) {
param.ny = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-nz") == 0)) {
param.nz = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-half") == 0)) {
param.half_neigh = atoi(argv[++i]);
continue;
}
if((strcmp(argv[i], "-r") == 0) || (strcmp(argv[i], "--radius") == 0)) {
param.cutforce = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "-s") == 0) || (strcmp(argv[i], "--skin") == 0)) {
param.skin = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "--freq") == 0)) {
param.proc_freq = atof(argv[++i]);
continue;
}
if((strcmp(argv[i], "--vtk") == 0)) {
param.vtk_file = strdup(argv[++i]);
continue;
}
if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) {
printf("MD Bench: A minimalistic re-implementation of miniMD\n");
printf(HLINE);
printf("-p <string>: file to read parameters from (can be specified more than once)\n");
printf("-f <string>: force field (lj, eam or dem), default lj\n");
printf("-i <string>: input file with atom positions (dump)\n");
printf("-e <string>: input file for EAM\n");
printf("-n / --nsteps <int>: set number of timesteps for simulation\n");
printf("-nx/-ny/-nz <int>: set linear dimension of systembox in x/y/z direction\n");
printf("-r / --radius <real>: set cutoff radius\n");
printf("-s / --skin <real>: set skin (verlet buffer)\n");
printf("--freq <real>: processor frequency (GHz)\n");
printf("--vtk <string>: VTK file for visualization\n");
printf(HLINE);
exit(EXIT_SUCCESS);
}
}
param.cutneigh = param.cutforce + param.skin;
setup(&param, &eam, &atom, &neighbor, &stats);
printParameter(&param);
printf(HLINE);
printf("step\ttemp\t\tpressure\n");
computeThermo(0, &param, &atom);
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
//writeInput(&param, &atom);
timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
timer[NEIGH] = 0.0;
timer[TOTAL] = getTimeStamp();
if(param.vtk_file != NULL) {
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
}
for(int n = 0; n < param.ntimes; n++) {
bool reneigh = (n + 1) % param.reneigh_every == 0;
initialIntegrate(reneigh, &param, &atom);
if((n + 1) % param.reneigh_every) {
updatePbc(&atom, &param, false);
} else {
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
}
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
traceAddresses(&param, &atom, &neighbor, n + 1);
#endif
timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
finalIntegrate(reneigh, &param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
#ifdef CUDA_TARGET
memcpyFromGPU(atom.x, atom.d_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3);
#endif
computeThermo(n + 1, &param, &atom);
}
if(param.vtk_file != NULL) {
write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
}
}
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
computeThermo(-1, &param, &atom);
printf(HLINE);
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
printf(HLINE);
printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
displayStatistics(&atom, &param, &stats, timer);
#endif
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS;
}

Some files were not shown because too many files have changed in this diff Show More