added static analysis and likwid files

This commit is contained in:
JanLJL 2023-02-09 17:33:22 +01:00
parent 3b076cdb49
commit cb5598bc91
11 changed files with 11709 additions and 0 deletions

View File

@ -0,0 +1,88 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Initializing parameters...
Initializing atoms...
Creating atoms...
Pattern: seq
Number of timesteps: 200
Number of atoms: 256
Number of neighbors per atom: 1024
Number of times to replicate neighbor lists: 1
Estimated total data volume (kB): 1062.9120
Estimated atom data volume (kB): 6.1440
Estimated neighborlist data volume (kB): 1050.6240
Initializing neighbor lists...
Creating neighbor lists...
Computing forces...
Total time: 0.2735, Mega atom updates/s: 0.1872
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
Statistics:
Vector width: 8, Processor frequency: 2.0000 GHz
Average neighbors per atom: 1018.9055
Average SIMD iterations per atom: 127.3632
Total number of computed pair interactions: 52428800
Total number of SIMD iterations: 6553600
Useful read data volume for force computation: 1.47GB
Cycles/SIMD iteration: 83.4598
--------------------------------------------------------------------------------
Region force, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 0.110776 |
| call count | 200 |
+-------------------+------------+
+------------------------------------------+---------+------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+------------+
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
| CAS_COUNT_RD | MBOX0C0 | 8643 |
| CAS_COUNT_WR | MBOX0C1 | 1367 |
| CAS_COUNT_RD | MBOX1C0 | 9124 |
| CAS_COUNT_WR | MBOX1C1 | 1354 |
| CAS_COUNT_RD | MBOX2C0 | 9138 |
| CAS_COUNT_WR | MBOX2C1 | 1356 |
| CAS_COUNT_RD | MBOX3C0 | 5586 |
| CAS_COUNT_WR | MBOX3C1 | 1297 |
| CAS_COUNT_RD | MBOX4C0 | 5328 |
| CAS_COUNT_WR | MBOX4C1 | 1269 |
| CAS_COUNT_RD | MBOX5C0 | 5280 |
| CAS_COUNT_WR | MBOX5C1 | 1295 |
+------------------------------------------+---------+------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 0.1108 |
| Runtime unhalted [s] | 0.0878 |
| Clock [MHz] | 1995.2564 |
| CPI | 0.8202 |
| Energy [J] | 10.9296 |
| Power [W] | 98.6643 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 14233.3287 |
| AVX DP [MFLOP/s] | 14231.8898 |
| Packed [MUOPS/s] | 1778.9862 |
| Scalar [MUOPS/s] | 1.4389 |
| Memory read bandwidth [MBytes/s] | 24.9001 |
| Memory read data volume [GBytes] | 0.0028 |
| Memory write bandwidth [MBytes/s] | 4.5861 |
| Memory write data volume [GBytes] | 0.0005 |
| Memory bandwidth [MBytes/s] | 29.4863 |
| Memory data volume [GBytes] | 0.0033 |
| Operational intensity | 482.7104 |
+-----------------------------------+------------+

View File

@ -0,0 +1,168 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Parameters:
Force field: lj
Kernel: plain-C
Data layout: AoS
Floating-point precision: double
Unit cells (nx, ny, nz): 32, 32, 32
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
Periodic (x, y, z): 1, 1, 1
Lattice size: 1.679596e+00
Epsilon: 1.000000e+00
Sigma: 1.000000e+00
Spring constant: 1.000000e+00
Damping constant: 1.000000e+00
Temperature: 1.440000e+00
RHO: 8.442000e-01
Mass: 1.000000e+00
Number of types: 4
Number of timesteps: 200
Report stats every (timesteps): 100
Reneighbor every (timesteps): 20
Prune every (timesteps): 1000
Output positions every (timesteps): 20
Output velocities every (timesteps): 5
Delta time (dt): 5.000000e-03
Cutoff radius: 2.500000e+00
Skin: 3.000000e-01
Half neighbor lists: 0
Processor frequency (GHz): 2.0000
----------------------------------------------------------------------------
step temp pressure
0 1.440000e+00 1.215639e+00
100 8.200895e-01 6.923143e-01
200 7.961495e-01 6.721043e-01
----------------------------------------------------------------------------
System: 131072 atoms 47265 ghost atoms, Steps: 200
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
----------------------------------------------------------------------------
Performance: 2.28 million atom updates per second
Statistics:
Vector width: 8, Processor frequency: 2.0000 GHz
Average neighbors per atom: 76.0352
Average SIMD iterations per atom: 9.9181
Total number of computed pair interactions: 2003182862
Total number of SIMD iterations: 261297661
Useful read data volume for force computation: 57.46GB
Cycles/SIMD iteration: 40.4432
--------------------------------------------------------------------------------
Region force, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.115807 |
| call count | 201 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.1158 |
| Runtime unhalted [s] | 4.0885 |
| Clock [MHz] | 1995.2508 |
| CPI | 0.8098 |
| Energy [J] | 307.9429 |
| Power [W] | 60.1944 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 12644.6041 |
| AVX DP [MFLOP/s] | 12629.1535 |
| Packed [MUOPS/s] | 1578.6442 |
| Scalar [MUOPS/s] | 15.4506 |
| Memory read bandwidth [MBytes/s] | 1713.4438 |
| Memory read data volume [GBytes] | 8.7656 |
| Memory write bandwidth [MBytes/s] | 86.5003 |
| Memory write data volume [GBytes] | 0.4425 |
| Memory bandwidth [MBytes/s] | 1799.9442 |
| Memory data volume [GBytes] | 9.2082 |
| Operational intensity | 7.0250 |
+-----------------------------------+------------+
Region reneighbour, Group 1: MEM_DP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.897385 |
| call count | 10 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.8974 |
| Runtime unhalted [s] | 4.7026 |
| Clock [MHz] | 1995.2473 |
| CPI | 0.6440 |
| Energy [J] | 338.9000 |
| Power [W] | 57.4661 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| DP [MFLOP/s] | 1059.4978 |
| AVX DP [MFLOP/s] | 1.3335 |
| Packed [MUOPS/s] | 0.1667 |
| Scalar [MUOPS/s] | 1058.1643 |
| Memory read bandwidth [MBytes/s] | 136.3006 |
| Memory read data volume [GBytes] | 0.8038 |
| Memory write bandwidth [MBytes/s] | 72.2612 |
| Memory write data volume [GBytes] | 0.4262 |
| Memory bandwidth [MBytes/s] | 208.5618 |
| Memory data volume [GBytes] | 1.2300 |
| Operational intensity | 5.0800 |
+-----------------------------------+------------+

View File

@ -0,0 +1,88 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Initializing parameters...
Initializing atoms...
Creating atoms...
Pattern: seq
Number of timesteps: 200
Number of atoms: 256
Number of neighbors per atom: 1024
Number of times to replicate neighbor lists: 1
Estimated total data volume (kB): 1056.7680
Estimated atom data volume (kB): 3.0720
Estimated neighborlist data volume (kB): 1050.6240
Initializing neighbor lists...
Creating neighbor lists...
Computing forces...
Total time: 0.2466, Mega atom updates/s: 0.2076
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
Statistics:
Vector width: 16, Processor frequency: 2.0000 GHz
Average neighbors per atom: 1018.9055
Average SIMD iterations per atom: 63.6816
Total number of computed pair interactions: 52428800
Total number of SIMD iterations: 3276800
Useful read data volume for force computation: 0.84GB
Cycles/SIMD iteration: 150.4999
--------------------------------------------------------------------------------
Region force, Group 1: MEM_SP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 0.085843 |
| call count | 200 |
+-------------------+------------+
+------------------------------------------+---------+------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+------------+
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
| CAS_COUNT_RD | MBOX0C0 | 8354 |
| CAS_COUNT_WR | MBOX0C1 | 1126 |
| CAS_COUNT_RD | MBOX1C0 | 7863 |
| CAS_COUNT_WR | MBOX1C1 | 1105 |
| CAS_COUNT_RD | MBOX2C0 | 7990 |
| CAS_COUNT_WR | MBOX2C1 | 1113 |
| CAS_COUNT_RD | MBOX3C0 | 4775 |
| CAS_COUNT_WR | MBOX3C1 | 1112 |
| CAS_COUNT_RD | MBOX4C0 | 4201 |
| CAS_COUNT_WR | MBOX4C1 | 1127 |
| CAS_COUNT_RD | MBOX5C0 | 4035 |
| CAS_COUNT_WR | MBOX5C1 | 1120 |
+------------------------------------------+---------+------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 0.0858 |
| Runtime unhalted [s] | 0.0691 |
| Clock [MHz] | 1995.2787 |
| CPI | 1.3277 |
| Energy [J] | 9.2849 |
| Power [W] | 108.1610 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| SP [MFLOP/s] | 16606.5397 |
| AVX SP [MFLOP/s] | 16604.7458 |
| Packed [MUOPS/s] | 1037.7966 |
| Scalar [MUOPS/s] | 1.7940 |
| Memory read bandwidth [MBytes/s] | 27.7476 |
| Memory read data volume [GBytes] | 0.0024 |
| Memory write bandwidth [MBytes/s] | 4.9974 |
| Memory write data volume [GBytes] | 0.0004 |
| Memory bandwidth [MBytes/s] | 32.7450 |
| Memory data volume [GBytes] | 0.0028 |
| Operational intensity | 507.1471 |
+-----------------------------------+------------+

View File

@ -0,0 +1,168 @@
--------------------------------------------------------------------------------
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
CPU type: Intel Cascadelake SP processor
CPU clock: 2.49 GHz
--------------------------------------------------------------------------------
Parameters:
Force field: lj
Kernel: plain-C
Data layout: AoS
Floating-point precision: single
Unit cells (nx, ny, nz): 32, 32, 32
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
Periodic (x, y, z): 1, 1, 1
Lattice size: 1.679596e+00
Epsilon: 1.000000e+00
Sigma: 1.000000e+00
Spring constant: 1.000000e+00
Damping constant: 1.000000e+00
Temperature: 1.440000e+00
RHO: 8.442000e-01
Mass: 1.000000e+00
Number of types: 4
Number of timesteps: 200
Report stats every (timesteps): 100
Reneighbor every (timesteps): 20
Prune every (timesteps): 1000
Output positions every (timesteps): 20
Output velocities every (timesteps): 5
Delta time (dt): 5.000000e-03
Cutoff radius: 2.500000e+00
Skin: 3.000000e-01
Half neighbor lists: 0
Processor frequency (GHz): 2.0000
----------------------------------------------------------------------------
step temp pressure
0 1.440000e+00 1.215639e+00
100 8.200897e-01 6.923144e-01
200 7.961481e-01 6.721031e-01
----------------------------------------------------------------------------
System: 131072 atoms 47265 ghost atoms, Steps: 200
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
----------------------------------------------------------------------------
Performance: 2.42 million atom updates per second
Statistics:
Vector width: 16, Processor frequency: 2.0000 GHz
Average neighbors per atom: 76.0351
Average SIMD iterations per atom: 5.0875
Total number of computed pair interactions: 2003181259
Total number of SIMD iterations: 134032075
Useful read data volume for force computation: 32.79GB
Cycles/SIMD iteration: 68.9511
--------------------------------------------------------------------------------
Region force, Group 1: MEM_SP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 4.452877 |
| call count | 201 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
| CAS_COUNT_WR | MBOX0C1 | 595747 |
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
| CAS_COUNT_WR | MBOX1C1 | 597090 |
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
| CAS_COUNT_WR | MBOX2C1 | 595219 |
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
| CAS_COUNT_WR | MBOX3C1 | 632443 |
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
| CAS_COUNT_WR | MBOX4C1 | 633169 |
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
| CAS_COUNT_WR | MBOX5C1 | 634112 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 4.4529 |
| Runtime unhalted [s] | 3.5585 |
| Clock [MHz] | 1995.2693 |
| CPI | 1.1947 |
| Energy [J] | 265.5057 |
| Power [W] | 59.6257 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| SP [MFLOP/s] | 14156.9661 |
| AVX SP [MFLOP/s] | 14139.2165 |
| Packed [MUOPS/s] | 883.7010 |
| Scalar [MUOPS/s] | 17.7496 |
| Memory read bandwidth [MBytes/s] | 1708.8254 |
| Memory read data volume [GBytes] | 7.6092 |
| Memory write bandwidth [MBytes/s] | 53.0035 |
| Memory write data volume [GBytes] | 0.2360 |
| Memory bandwidth [MBytes/s] | 1761.8288 |
| Memory data volume [GBytes] | 7.8452 |
| Operational intensity | 8.0354 |
+-----------------------------------+------------+
Region reneighbour, Group 1: MEM_SP
+-------------------+------------+
| Region Info | HWThread 0 |
+-------------------+------------+
| RDTSC Runtime [s] | 5.935627 |
| call count | 10 |
+-------------------+------------+
+------------------------------------------+---------+-------------+
| Event | Counter | HWThread 0 |
+------------------------------------------+---------+-------------+
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
| PWR_DRAM_ENERGY | PWR3 | 0 |
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
| CAS_COUNT_WR | MBOX0C1 | 975760 |
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
| CAS_COUNT_WR | MBOX1C1 | 977433 |
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
| CAS_COUNT_WR | MBOX2C1 | 979122 |
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
| CAS_COUNT_WR | MBOX3C1 | 967621 |
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
| CAS_COUNT_WR | MBOX4C1 | 967179 |
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
| CAS_COUNT_WR | MBOX5C1 | 969349 |
+------------------------------------------+---------+-------------+
+-----------------------------------+------------+
| Metric | HWThread 0 |
+-----------------------------------+------------+
| Runtime (RDTSC) [s] | 5.9356 |
| Runtime unhalted [s] | 4.7334 |
| Clock [MHz] | 1995.2675 |
| CPI | 0.6483 |
| Energy [J] | 340.7903 |
| Power [W] | 57.4144 |
| Energy DRAM [J] | 0 |
| Power DRAM [W] | 0 |
| SP [MFLOP/s] | 1052.6723 |
| AVX SP [MFLOP/s] | 1.3249 |
| Packed [MUOPS/s] | 0.0828 |
| Scalar [MUOPS/s] | 1051.3474 |
| Memory read bandwidth [MBytes/s] | 114.9736 |
| Memory read data volume [GBytes] | 0.6824 |
| Memory write bandwidth [MBytes/s] | 62.9308 |
| Memory write data volume [GBytes] | 0.3735 |
| Memory bandwidth [MBytes/s] | 177.9044 |
| Memory data volume [GBytes] | 1.0560 |
| Operational intensity | 5.9171 |
+-----------------------------------+------------+

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,676 @@
.text
.file "force_lj.c"
# Read-only constant pool for computeForceLJFullNeigh_plain_c.
# The .quad values are IEEE-754 double bit patterns; the decoded value is
# given in the trailing comment of each entry.
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
# Multiplied once in the prologue with the double loaded from 40(%rdi).
.LCPI0_0:
.quad 4631952216750555136 # 48
# Numerator of the reciprocal 1.0 / (squared distance) in both inner loops.
.LCPI0_3:
.quad 4607182418800017408 # 1
# Added to the sixth-power term, i.e. computes (term - 0.5), in both inner loops.
.LCPI0_4:
.quad -4620693217682128896 # -0.5
.section .rodata.cst4,"aM",@progbits,4
.p2align 2
# Broadcast and multiplied with four int32 neighbor indices j to form 3*j.
.LCPI0_1:
.long 3 # 0x3
# Broadcast and added to 3*j to form 3*j+2.
.LCPI0_2:
.long 2 # 0x2
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
# 16 bytes of 0xFF = four int32 values of -1. The kernel subtracts this vector
# from 3*j, which yields 3*j+1.
.LCPI0_5:
.zero 16,255
.text
#-----------------------------------------------------------------------
# computeForceLJFullNeigh_plain_c -- compiler output for force_lj.c
# Syntax: AT&T (GAS). The four inputs are read from %rdi, %rsi, %rdx, %rcx,
# which matches the SysV AMD64 integer-argument order.
#
# Inputs, as used by the code below:
#   %rdi  pointer; doubles are read at offsets 144, 40 and 56
#   %rsi  pointer; int32 count N at offset 4, pointer to a double array of
#         positions at offset 16 (indexed 3*i+0/1/2), pointer to a double
#         array of outputs at offset 64 (same indexing)
#   %rdx  pointer; int32 row stride at offset 8, pointer to int32 neighbor
#         indices at offset 16, pointer to int32 per-atom neighbor counts
#         at offset 24
#   %rcx  pointer to two 64-bit counters (16 bytes)
#   NOTE(review): presumably the parameter / atom / neighbor-list / stats
#   structs of force_lj.c -- confirm field names against the C source.
# Output:
#   %xmm0 = second getTimeStamp() result minus the first one
# Side effects:
#   - the output array is zeroed (N*24 bytes) and then accumulated into
#   - the two counters at (%rcx) are increased per atom by the neighbor
#     count n and by (n+3)/4 respectively
#   - likwid_markerStartRegion/StopRegion are called with $.L.str
#
# Stack frame (264 bytes below the six saved registers):
#     0(%rsp)  value from 144(%rdi); later the z coordinate of atom i
#    24(%rsp)  saved counter pointer (original %rcx)
#    32(%rsp)  start timestamp
#    40(%rsp)  neighbor row stride in bytes
#    48(%rsp)  pointer to the per-atom neighbor counts
#    56(%rsp)  N-1
#    64(%rsp)  3*i+2        72(%rsp)  3*i+1
#    80(%rsp)  value from 56(%rdi)
#    96(%rsp)  y coordinate of atom i    112(%rsp)  x coordinate of atom i
#   128(%rsp)  value from 40(%rdi); later 4 x (48 * that value)
#   176(%rsp)  48 * value from 40(%rdi) (scalar)
#   192(%rsp)  square of the value from 144(%rdi) (scalar)
#   208(%rsp)  the two counters while the vector loop runs
#   224(%rsp)  4 x (square of the value from 144(%rdi))
#-----------------------------------------------------------------------
.globl computeForceLJFullNeigh_plain_c
.p2align 4, 0x90
.type computeForceLJFullNeigh_plain_c,@function
computeForceLJFullNeigh_plain_c: #
.LcomputeForceLJFullNeigh_plain_c$local:
.cfi_startproc
# %bb.0: #
# Save all six callee-saved GPRs. Return address + 6 pushes + 264 bytes
# = 320 bytes, so %rsp is 16-byte aligned at every call below.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $264, %rsp # imm = 0x108
.cfi_def_cfa_offset 320
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# Move the pointer arguments into callee-saved registers so they survive the calls.
movq %rcx, %rbx
movq %rdx, %r15
movq %rsi, %r12
# r14d = N; the 32-bit load zero-extends into %r14.
movl 4(%rsi), %r14d
# Spill the three doubles read through %rdi; %rdi is not needed afterwards.
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
# Skip the clear when N <= 0 (signed test).
testl %r14d, %r14d
jle .LBB0_2
# %bb.1: #
# _intel_fast_memset(output array, 0, N*24): rdx = (N*8)*3 bytes.
movq 64(%r12), %rdi
leaq (,%r14,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB0_2: #
# Take the start timestamp and open the likwid marker region.
xorl %eax, %eax
callq getTimeStamp
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
# NOTE(review): $.L.str is an absolute 32-bit address (non-PIC code); the
# string itself is defined outside this block.
movl $.L.str, %edi
callq likwid_markerStartRegion
# Nothing to compute when N <= 0.
testl %r14d, %r14d
jle .LBB0_19
# %bb.3: #
# Loop-invariant setup.
# xmm13 = (value from 144(%rdi))^2, the squared-distance threshold.
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm13
# r11 = neighbor indices of the current atom, rsi = neighbor counts,
# rdi = row stride (sign-extended), r15 = positions, r8 = outputs.
movq 16(%r15), %r11
movq 24(%r15), %rsi
movslq 8(%r15), %rdi
movq 16(%r12), %r15
movq 64(%r12), %r8
# xmm15 = 48 * (value from 40(%rdi)).
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
# Keep the counter pointer on the stack; xmm14 carries both counters.
movq %rbx, 24(%rsp) # 8-byte Spill
vmovdqu (%rbx), %xmm14
# r14 = N-1, compared against the atom index at the bottom of the outer loop.
decq %r14
# ymm3 = 4 x base address of the position array (for the address arithmetic
# in the vector loop).
vmovq %r15, %xmm0
vpbroadcastq %xmm0, %ymm3
# Broadcast the scalar constants to all four lanes.
vbroadcastsd %xmm13, %ymm2
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
vbroadcastsd %xmm12, %ymm8
vbroadcastsd %xmm15, %ymm9
# Row stride in bytes (int32 elements).
shlq $2, %rdi
# r10 = atom index i.
xorl %r10d, %r10d
# Spill everything the vector loop clobbers so it can be restored per atom.
movq %r14, 56(%rsp) # 8-byte Spill
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
movq %rsi, 48(%rsp) # 8-byte Spill
movq %rdi, 40(%rsp) # 8-byte Spill
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
jmp .LBB0_6
.p2align 4, 0x90
.LBB0_17: #
# in Loop: Header=BB0_6 Depth=1
# All neighbors of atom i are done; rdx = neighbor count for the counter update.
movq %r13, %rdx
.LBB0_5: #
# in Loop: Header=BB0_6 Depth=1
# Add the per-atom sums (xmm10, xmm11, xmm5) to output[3i], [3i+1], [3i+2].
vaddsd (%r8,%r12,8), %xmm10, %xmm0
vmovsd %xmm0, (%r8,%r12,8)
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
vmovsd %xmm0, (%r8,%rbx,8)
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
vmovsd %xmm0, (%r8,%rbp,8)
# r13d = (n + 3) / 4 as a signed division by 4 (adds 3 more when n+3 < 0
# so the arithmetic shift truncates toward zero).
leal 3(%r13), %eax
addl $6, %r13d
testl %eax, %eax
cmovnsl %eax, %r13d
sarl $2, %r13d
movslq %r13d, %rax
# counters += { rdx (neighbor count), (n+3)/4 }.
vmovq %rax, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm14, %xmm14
# Advance to the next atom's neighbor row.
addq %rdi, %r11
# Compare i with N-1 before incrementing; leaq leaves the flags intact for je.
cmpq %r14, %r10
leaq 1(%r10), %r10
je .LBB0_18
.LBB0_6: #
# =>This Loop Header: Depth=1
# Child Loop BB0_9 Depth 2
# Child Loop BB0_13 Depth 2
# Outer loop over atoms: r13d = neighbor count n of atom i,
# r12d/ebx/ebp = 3i, 3i+1, 3i+2 (32-bit results, zero-extended).
movl (%rsi,%r10,4), %r13d
leal (%r10,%r10,2), %r12d
leal (%r10,%r10,2), %ebx
incl %ebx
leal (%r10,%r10,2), %ebp
addl $2, %ebp
# n <= 0: nothing to accumulate for this atom.
testl %r13d, %r13d
jle .LBB0_4
# %bb.7: #
# in Loop: Header=BB0_6 Depth=1
# xmm0/xmm1/xmm2 = position of atom i (x, y, z).
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
# rdx = n rounded down to a multiple of 4 = trip count of the vector loop.
movq %r13, %rdx
movl $4294967292, %eax # imm = 0xFFFFFFFC
andq %rax, %rdx
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
vmovapd %xmm2, (%rsp) # 16-byte Spill
# ZF still comes from the andq above (vmovapd does not touch flags):
# fewer than 4 neighbors -> scalar loop only.
je .LBB0_16
# %bb.8: #
# in Loop: Header=BB0_6 Depth=1
# The vector loop reuses rbp, rbx and xmm14, so save them first.
movq %rbp, 64(%rsp) # 8-byte Spill
movq %rbx, 72(%rsp) # 8-byte Spill
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
# ymm14/ymm5/ymm10 = x/y/z of atom i in all four lanes.
vbroadcastsd %xmm0, %ymm14
vbroadcastsd %xmm1, %ymm5
vbroadcastsd %xmm2, %ymm10
# Zero the three vector accumulators ymm0, ymm15, ymm13 (VEX-encoded xmm
# writes clear the upper lanes).
vxorpd %xmm0, %xmm0, %xmm0
vxorpd %xmm15, %xmm15, %xmm15
vxorpd %xmm13, %xmm13, %xmm13
# rbp = neighbor index k within the row.
xorl %ebp, %ebp
# Inside the loop: ymm9 = 4 x value from 56(%rdi), ymm8 = 4 x threshold.
vmovapd %ymm8, %ymm9
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
# NOTE(review): this alignment precedes the marker bytes (a 5-byte movl
# plus 3 .byte values), so .LBB0_9 itself presumably starts 8 bytes past the
# 16-byte boundary -- confirm if loop alignment matters for the analysis.
.p2align 4, 0x90
movl $111, %ebx # OSACA START MARKER
.byte 100 # OSACA START MARKER
.byte 103 # OSACA START MARKER
.byte 144 # OSACA START MARKER
# LLVM-MCA-BEGIN
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
# Vector kernel: handles 4 neighbors per iteration. The position loads are
# done as scalar vmovsd/vmovhpd pairs joined with vinsertf128 (no gather
# instruction is used); address computation and loads are interleaved.
.LBB0_9: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
# xmm11 = 3*j for the four neighbor indices j at (%r11,%rbp,4);
# ymm1 = four addresses base + 8*(3*j) (x components).
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
vpmovsxdq %xmm11, %ymm1
vpsllq $3, %ymm1, %ymm1
vpaddq %ymm1, %ymm3, %ymm1
vmovq %xmm1, %r14
vpextrq $1, %xmm1, %r9
vextracti128 $1, %ymm1, %xmm1
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
# xmm6 = 3*j+1 (subtracting -1); ymm6 = addresses of the y components.
# NOTE(review): .LCPI0_5 is referenced absolutely here, unlike the
# (%rip)-relative constant loads around it; that only matters for a PIE link.
vpsubd .LCPI0_5, %xmm11, %xmm6
vpmovsxdq %xmm6, %ymm6
vpsllq $3, %ymm6, %ymm6
vmovq %xmm1, %rdi
vpaddq %ymm6, %ymm3, %ymm6
vmovq %xmm6, %rcx
vpextrq $1, %xmm1, %rbx
vpextrq $1, %xmm6, %rax
vextracti128 $1, %ymm6, %xmm1
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
vmovq %xmm1, %rdi
vpextrq $1, %xmm1, %rsi
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
# xmm4 = 3*j+2; ymm4 = addresses of the z components.
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
vpaddd %xmm12, %xmm11, %xmm4
vpmovsxdq %xmm4, %ymm4
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
vpsllq $3, %ymm4, %ymm4
vpaddq %ymm4, %ymm3, %ymm4
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
vpextrq $1, %xmm4, %rax
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
vmovq %xmm4, %rcx
vextracti128 $1, %ymm4, %xmm4
vmovq %xmm4, %rsi
vinsertf128 $1, %xmm6, %ymm2, %ymm2
vpextrq $1, %xmm4, %rdi
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
# ymm2 = x_i - x_j
vsubpd %ymm2, %ymm14, %ymm2
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
vinsertf128 $1, %xmm1, %ymm7, %ymm1
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
vinsertf128 $1, %xmm4, %ymm6, %ymm4
# ymm1 = y_i - y_j, ymm4 = z_i - z_j
vsubpd %ymm1, %ymm5, %ymm1
vsubpd %ymm4, %ymm10, %ymm4
# ymm6 = squared distance = dx*dx + dy*dy + dz*dz
vmulpd %ymm2, %ymm2, %ymm6
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
# ymm7 = 1.0 / squared distance
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
vdivpd %ymm6, %ymm7, %ymm7
# ymm11 = ymm7^3 * ymm9 (ymm9 = broadcast of the value from 56(%rdi))
vmulpd %ymm7, %ymm7, %ymm11
vmulpd %ymm9, %ymm11, %ymm11
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
vmulpd %ymm7, %ymm11, %ymm11
# ymm12 = ymm11 - 0.5
vaddpd %ymm12, %ymm11, %ymm12
# ymm7 = (48 * value from 40(%rdi)) * ymm7 * ymm11 * ymm12 = force factor
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
vmulpd %ymm7, %ymm11, %ymm7
vmulpd %ymm7, %ymm12, %ymm7
# ymm6 = lane mask: squared distance < threshold (ymm8)
vcmpltpd %ymm8, %ymm6, %ymm6
# Accumulate factor * delta into ymm0/ymm15/ymm13, but keep the old
# accumulator value in lanes where the mask is false.
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
# k += 4; loop while k < (n & ~3) (unsigned compare).
addq $4, %rbp
cmpq %rdx, %rbp
jb .LBB0_9
# LLVM-MCA-END
movl $222, %ebx # OSACA END MARKER
.byte 100 # OSACA END MARKER
.byte 103 # OSACA END MARKER
.byte 144 # OSACA END MARKER
# %bb.10: #
# in Loop: Header=BB0_6 Depth=1
# Horizontal sums of the three accumulators:
# xmm10 = sum(ymm0), xmm11 = sum(ymm15), xmm5 = sum(ymm13).
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
vaddsd %xmm1, %xmm0, %xmm1
vextractf128 $1, %ymm0, %xmm0
vaddsd %xmm0, %xmm1, %xmm1
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
vaddsd %xmm0, %xmm1, %xmm10
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
vaddsd %xmm1, %xmm15, %xmm1
vextractf128 $1, %ymm15, %xmm2
vaddsd %xmm2, %xmm1, %xmm1
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
vaddsd %xmm2, %xmm1, %xmm11
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
vaddsd %xmm1, %xmm13, %xmm1
vextractf128 $1, %ymm13, %xmm2
vaddsd %xmm2, %xmm1, %xmm1
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
vaddsd %xmm2, %xmm1, %xmm5
# Restore every register the vector loop (and its markers) overwrote.
movq 56(%rsp), %r14 # 8-byte Reload
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
movq 48(%rsp), %rsi # 8-byte Reload
movq 40(%rsp), %rdi # 8-byte Reload
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
vmovapd %ymm9, %ymm8
movq 72(%rsp), %rbx # 8-byte Reload
movq 64(%rsp), %rbp # 8-byte Reload
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
# No remainder when n is a multiple of 4.
cmpq %r13, %rdx
jae .LBB0_17
jmp .LBB0_11
.p2align 4, 0x90
.LBB0_4: #
# in Loop: Header=BB0_6 Depth=1
# Atom with n <= 0: zero sums; rdx = sign-extended n for the counter update.
movslq %r13d, %rdx
vxorpd %xmm5, %xmm5, %xmm5
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm10, %xmm10, %xmm10
jmp .LBB0_5
.p2align 4, 0x90
.LBB0_16: #
# in Loop: Header=BB0_6 Depth=1
# Fewer than 4 neighbors: start the scalar loop with zero sums (rdx = 0 here).
vxorpd %xmm10, %xmm10, %xmm10
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm5, %xmm5, %xmm5
cmpq %r13, %rdx
jae .LBB0_17
.LBB0_11: #
# in Loop: Header=BB0_6 Depth=1
# xmm4 = y coordinate of atom i for the scalar loop.
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
jmp .LBB0_13
.p2align 4, 0x90
.LBB0_12: #
# in Loop: Header=BB0_13 Depth=2
# Next remainder neighbor; leave once rdx reaches n.
incq %rdx
cmpq %rdx, %r13
je .LBB0_17
.LBB0_13: #
# Parent Loop BB0_6 Depth=1
# => This Inner Loop Header: Depth=2
# Scalar remainder loop for neighbors (n & ~3) .. n-1; eax = neighbor index j.
movl (%r11,%rdx,4), %eax
# xmm6 = x_i - position[3j]
leal (%rax,%rax,2), %ecx
movslq %ecx, %rcx
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
# xmm2 = y_i - position[3j+1]
leal (%rax,%rax,2), %ecx
incl %ecx
movslq %ecx, %rcx
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
# xmm1 = z_i - position[3j+2]
leal 2(%rax,%rax,2), %eax
cltq
vmovapd (%rsp), %xmm1 # 16-byte Reload
vsubsd (%r15,%rax,8), %xmm1, %xmm1
# xmm7 = squared distance
vmulsd %xmm6, %xmm6, %xmm7
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
# Skip this neighbor when squared distance >= threshold (xmm13).
vucomisd %xmm13, %xmm7
jae .LBB0_12
# %bb.14: #
# in Loop: Header=BB0_13 Depth=2
# Same arithmetic as the vector kernel, one neighbor at a time:
# xmm7 = 1/rsq; xmm0 = xmm7^3 * xmm12; factor = xmm15 * xmm7 * xmm0 * (xmm0 - 0.5).
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
vdivsd %xmm7, %xmm0, %xmm7
vmulsd %xmm7, %xmm7, %xmm0
vmulsd %xmm0, %xmm12, %xmm0
vmulsd %xmm7, %xmm0, %xmm0
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
vmulsd %xmm7, %xmm15, %xmm7
vmulsd %xmm0, %xmm7, %xmm0
vmulsd %xmm4, %xmm0, %xmm0
# xmm4 was used as scratch above; reload the y coordinate of atom i.
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
# xmm0 was used as scratch above; reload the x coordinate of atom i.
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
jmp .LBB0_12
.LBB0_18: #
# Write the two updated counters back through the saved pointer.
movq 24(%rsp), %rax # 8-byte Reload
vmovdqu %xmm14, (%rax)
.LBB0_19: #
# Close the marker region, take the end timestamp and return end - start.
movl $.L.str, %edi
# Clear the upper ymm halves before calling out of this AVX code.
vzeroupper
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
# Release the frame and restore the callee-saved registers in reverse push order.
addq $264, %rsp # imm = 0x108
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end0:
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
.cfi_endproc
# -- End function
# ---------------------------------------------------------------------------
# computeForceLJHalfNeigh  (GAS / AT&T syntax, arguments in %rdi,%rsi,%rdx,%rcx)
#
# Pair-force kernel over per-entry neighbor rows. For every entry i below the
# count at 4(arg1) it walks a row of 32-bit neighbor indices j, forms the
# squared distance r2 from three doubles per entry, and for r2 below the
# squared threshold evaluates
#     F = 48 * c40 * s * (c56 * s^3) * (c56 * s^3 - 0.5),   s = 1 / r2
# (c40, c56 = doubles at 40(arg0) and 56(arg0)). F * delta is accumulated for
# entry i and, when j is below the count, subtracted from entry j.
# NOTE(review): the name suggests a Lennard-Jones kernel on a half neighbor
# list with c40 = epsilon and c56 = sigma^6 -- confirm against the C source.
#
# In:   %rdi = arg0: doubles read at offsets 40, 56, 144 (threshold) and 264
#       %rsi = arg1: int at 4 (entry count); pointer at 16 (3 doubles per
#              entry, read only); pointer at 64 (3 doubles per entry, zeroed
#              first and then used as the force accumulator)
#       %rdx = arg2: int at 8 (row stride in elements); pointer at 16 (rows of
#              32-bit neighbor indices); pointer at 24 (32-bit neighbor count
#              per entry)
#       %rcx = arg3: two 64-bit counters at offset 0, updated in place
# Out:  %xmm0 = difference between the two getTimeStamp() results
# Side effects: likwid marker start/stop on "forceLJ-halfneigh", one printf.
#
# Stack frame: 6 callee-saved pushes + 56 bytes of slots (CFA offset 112).
#    0(%rsp)  double 144(arg0); reused for the neighbor-count pointer
#    8(%rsp)  double 40(arg0);  reused for the spilled index 3*i+1
#   16(%rsp)  double 56(arg0);  reused for the elapsed time after the loops
#   24(%rsp)  arg3
#   32(%rsp)  start timestamp
#   40(%rsp)  arg0
#   48(%rsp)  neighbor row stride in bytes
# ---------------------------------------------------------------------------
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function computeForceLJHalfNeigh
# Double-precision constants, stored as raw IEEE-754 bit patterns.
.LCPI1_0:
.quad 4631952216750555136 # 48
.LCPI1_1:
.quad 4607182418800017408 # 1
.LCPI1_2:
.quad -4620693217682128896 # -0.5
.LCPI1_3:
.quad 4741671816366391296 # 1.0E+9
.text
.globl computeForceLJHalfNeigh
.p2align 4, 0x90
.type computeForceLJHalfNeigh,@function
computeForceLJHalfNeigh: #
.LcomputeForceLJHalfNeigh$local:
.cfi_startproc
# %bb.0: #
# Prologue: save all six callee-saved GPRs, then reserve the spill area.
# 8 (return address) + 48 (pushes) + 56 = 112, so %rsp is 16-byte aligned
# at every call below.
pushq %rbp
.cfi_def_cfa_offset 16
pushq %r15
.cfi_def_cfa_offset 24
pushq %r14
.cfi_def_cfa_offset 32
pushq %r13
.cfi_def_cfa_offset 40
pushq %r12
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $56, %rsp
.cfi_def_cfa_offset 112
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
# Move the arguments into callee-saved registers / stack slots so they
# survive the calls that follow: %r12 = arg2, %r13 = arg1, %r15d = count.
movq %rcx, 24(%rsp) # 8-byte Spill
movq %rdx, %r12
movq %rsi, %r13
movl 4(%rsi), %r15d
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, (%rsp) # 8-byte Spill
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
movq %rdi, 40(%rsp) # 8-byte Spill
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
vmovsd %xmm0, 16(%rsp) # 8-byte Spill
# Nothing to clear when the (signed) count is <= 0.
testl %r15d, %r15d
jle .LBB1_2
# %bb.1: #
# Zero the accumulator buffer at 64(arg1): count * 24 bytes
# (8*count, then *3). Argument layout is (dst, 0, size); presumably
# memset-compatible -- confirm for _intel_fast_memset.
movq 64(%r13), %rdi
leaq (,%r15,8), %rax
leaq (%rax,%rax,2), %rdx
xorl %esi, %esi
callq _intel_fast_memset
.LBB1_2: #
# %r14d is the running neighbor total printed as "Its"; it is zeroed here
# so the count <= 0 path to .LBB1_8 also sees 0.
xorl %r14d, %r14d
# %al = 0: no vector-register arguments for this call.
xorl %eax, %eax
callq getTimeStamp
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
# Absolute 32-bit address of the region name (non-PIC addressing).
movl $.L.str.1, %edi
callq likwid_markerStartRegion
testl %r15d, %r15d
jle .LBB1_8
# %bb.3: #
# Loop setup. Register roles from here to .LBB1_7:
#   %xmm12 = threshold^2          %xmm11 = 48 * c40
#   %xmm10 = the two 64-bit counters loaded from arg3
#   %rax   = current neighbor row %rsi = coordinate array (16(arg1))
#   %rdi   = accumulator array (64(arg1))
#   %r13   = outer index i        %r15 = entry count   %r14d = neighbor total
vmovsd (%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd %xmm0, %xmm0, %xmm12
movq 16(%r12), %rax
movq 24(%r12), %rcx
# Slot 0(%rsp) is free now (threshold already consumed); reuse it for the
# neighbor-count pointer.
movq %rcx, (%rsp) # 8-byte Spill
movslq 8(%r12), %rdx
movq 16(%r13), %rsi
movq 64(%r13), %rdi
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
movq 24(%rsp), %rcx # 8-byte Reload
vmovdqu (%rcx), %xmm10
# Row stride: elements * 4 bytes per 32-bit index.
shlq $2, %rdx
movq %rdx, 48(%rsp) # 8-byte Spill
xorl %r13d, %r13d
xorl %r14d, %r14d
jmp .LBB1_4
.p2align 4, 0x90
.LBB1_14: #
# in Loop: Header=BB1_4 Depth=1
# Inner loop finished: restore 3*i+1, which block %bb.12 may have
# overwritten in %rbp.
movq 8(%rsp), %rbp # 8-byte Reload
.LBB1_6: #
# in Loop: Header=BB1_4 Depth=1
# Outer-loop tail: add this entry's neighbor count to the total and fold
# the three accumulated components into the accumulator array at indices
# 3*i (%r12), 3*i+1 (%rbp) and 3*i+2 (%r11).
addl %r10d, %r14d
vaddsd (%rdi,%r12,8), %xmm14, %xmm0
vmovsd %xmm0, (%rdi,%r12,8)
vaddsd (%rdi,%rbp,8), %xmm15, %xmm0
vmovsd %xmm0, (%rdi,%rbp,8)
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
vmovsd %xmm0, (%rdi,%r11,8)
# %r10d = (count + 3) / 4 as a signed, truncating division: a negative
# dividend gets +3 before the arithmetic shift.
leal 3(%r10), %ecx
addl $6, %r10d
testl %ecx, %ecx
cmovnsl %ecx, %r10d
sarl $2, %r10d
movslq %r10d, %rcx
# Pack {low = neighbor count (%rdx), high = (count+3)/4} and add both to
# the 64-bit counter pair in one vector add.
vmovq %rcx, %xmm0
vmovq %rdx, %xmm1
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vpaddq %xmm0, %xmm10, %xmm10
incq %r13
# Advance %rax to the next neighbor row (stride in bytes).
addq 48(%rsp), %rax # 8-byte Folded Reload
cmpq %r15, %r13
je .LBB1_7
.LBB1_4: #
# =>This Loop Header: Depth=1
# Child Loop BB1_10 Depth 2
# Outer loop over entries: %r10 = neighbor count of entry i,
# %rcx = 3*i, and the three component indices are formed with 32-bit
# arithmetic (upper halves zeroed).
movq (%rsp), %rcx # 8-byte Reload
movslq (%rcx,%r13,4), %r10
leaq (,%r13,2), %rcx
addq %r13, %rcx
leal 1(%rcx), %ebp
leal 2(%rcx), %r11d
movl %ecx, %r12d
testq %r10, %r10
jle .LBB1_5
# %bb.9: #
# in Loop: Header=BB1_4 Depth=1
# Load the three coordinates of entry i (%xmm9, %xmm4, %xmm1) and clear
# the accumulators (%xmm14, %xmm15, %xmm13). %rbp is spilled because the
# inner loop reuses it as scratch.
vmovsd (%rsi,%r12,8), %xmm9 # xmm9 = mem[0],zero
movq %rbp, 8(%rsp) # 8-byte Spill
vmovsd (%rsi,%rbp,8), %xmm4 # xmm4 = mem[0],zero
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
# %rdx = neighbor count (positive here), used as the inner trip count.
movl %r10d, %edx
vxorpd %xmm14, %xmm14, %xmm14
xorl %ebx, %ebx
vxorpd %xmm15, %xmm15, %xmm15
vxorpd %xmm13, %xmm13, %xmm13
jmp .LBB1_10
.p2align 4, 0x90
.LBB1_13: #
# in Loop: Header=BB1_10 Depth=2
# Inner-loop latch: next neighbor slot k (%rbx).
incq %rbx
cmpq %rbx, %rdx
je .LBB1_14
.LBB1_10: #
# Parent Loop BB1_4 Depth=1
# => This Inner Loop Header: Depth=2
# %r9 = neighbor index j, %r8 = 3*j. The deltas (i minus j) land in
# %xmm2, %xmm5, %xmm0 and %xmm6 becomes their squared length r2.
movslq (%rax,%rbx,4), %r9
leaq (%r9,%r9,2), %r8
vsubsd (%rsi,%r8,8), %xmm9, %xmm2
# %rcx = 3*j re-derived from the low 32 bits, sign-extended.
movslq %r8d, %rcx
vsubsd 8(%rsi,%rcx,8), %xmm4, %xmm5
vsubsd 16(%rsi,%rcx,8), %xmm1, %xmm0
vmulsd %xmm2, %xmm2, %xmm6
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
# Skip the pair when r2 >= threshold^2.
vucomisd %xmm12, %xmm6
jae .LBB1_13
# %bb.11: #
# in Loop: Header=BB1_10 Depth=2
# s = 1/r2 (%xmm6); t = c56 * s^3 (%xmm7);
# F = (48*c40*s) * t * (t - 0.5) ends up in %xmm3.
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
vdivsd %xmm6, %xmm3, %xmm6
vmulsd 16(%rsp), %xmm6, %xmm7 # 8-byte Folded Reload
vmulsd %xmm6, %xmm6, %xmm8
vmulsd %xmm7, %xmm8, %xmm7
vaddsd .LCPI1_2(%rip), %xmm7, %xmm3
vmulsd %xmm6, %xmm11, %xmm6
vmulsd %xmm7, %xmm6, %xmm6
vmulsd %xmm3, %xmm6, %xmm3
# Per-component contributions F*delta stay in %xmm6, %xmm2, %xmm0 so the
# block below can subtract them from entry j.
vmulsd %xmm2, %xmm3, %xmm6
vaddsd %xmm6, %xmm14, %xmm14
vmulsd %xmm5, %xmm3, %xmm2
vaddsd %xmm2, %xmm15, %xmm15
vmulsd %xmm0, %xmm3, %xmm0
vaddsd %xmm0, %xmm13, %xmm13
# Only neighbors with j < count (signed 32-bit compare) receive the
# opposite contribution.
cmpl %r15d, %r9d
jge .LBB1_13
# %bb.12: #
# in Loop: Header=BB1_10 Depth=2
# accumulator[3*j .. 3*j+2] -= contribution. Clobbers %rbp (reloaded at
# .LBB1_14) and %rcx.
leaq 1(%rcx), %rbp
addq $2, %rcx
vmovsd (%rdi,%r8,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm6, %xmm3, %xmm3
vmovsd %xmm3, (%rdi,%r8,8)
vmovsd (%rdi,%rbp,8), %xmm3 # xmm3 = mem[0],zero
vsubsd %xmm2, %xmm3, %xmm2
vmovsd %xmm2, (%rdi,%rbp,8)
vmovsd (%rdi,%rcx,8), %xmm2 # xmm2 = mem[0],zero
vsubsd %xmm0, %xmm2, %xmm0
vmovsd %xmm0, (%rdi,%rcx,8)
jmp .LBB1_13
.p2align 4, 0x90
.LBB1_5: #
# in Loop: Header=BB1_4 Depth=1
# Entry with neighbor count <= 0: zero contributions, and %rdx takes the
# (sign-extended) count for the counter update in .LBB1_6. %rbp still
# holds 3*i+1 on this path, so no reload is needed.
vxorpd %xmm13, %xmm13, %xmm13
movq %r10, %rdx
vxorpd %xmm15, %xmm15, %xmm15
vxorpd %xmm14, %xmm14, %xmm14
jmp .LBB1_6
.LBB1_7: #
# Loops done: write the updated counter pair back to arg3.
movq 24(%rsp), %rax # 8-byte Reload
vmovdqu %xmm10, (%rax)
.LBB1_8: #
# Close the marker region, take the end timestamp and print the report.
movl $.L.str.1, %edi
callq likwid_markerStopRegion
xorl %eax, %eax
callq getTimeStamp
movq 40(%rsp), %rax # 8-byte Reload
vmovsd 264(%rax), %xmm3 # xmm3 = mem[0],zero
# %xmm2 = elapsed = end - start.
vsubsd 32(%rsp), %xmm0, %xmm2 # 8-byte Folded Reload
# %xmm0 = 264(arg0) * 1e9 * elapsed.
vmulsd .LCPI1_3(%rip), %xmm3, %xmm0
vmulsd %xmm2, %xmm0, %xmm0
vmovapd %xmm2, %xmm1
# Keep the elapsed time across printf; it is the return value.
vmovsd %xmm2, 16(%rsp) # 8-byte Spill
# Convert the neighbor total as an unsigned 32-bit value: zero-extend into
# %rax and use the 64-bit conversion. The vxorps breaks the false
# dependency on the conversion's merge source.
movl %r14d, %eax
vxorps %xmm12, %xmm12, %xmm12
vcvtsi2sd %rax, %xmm12, %xmm2
# %xmm2 = value printed after "Cy/it:"; when the total is 0 this is a
# floating-point division by zero (no trap, non-finite result).
vdivsd %xmm2, %xmm0, %xmm2
# printf(.L.str.2, total, 264(arg0), elapsed, %xmm2); %al = 3 vector args.
movl $.L.str.2, %edi
movl %r14d, %esi
vmovapd %xmm3, %xmm0
movb $3, %al
callq printf
# Return value: elapsed time.
vmovsd 16(%rsp), %xmm0 # 8-byte Reload
# xmm0 = mem[0],zero
# Epilogue: release the spill area and restore callee-saved registers in
# reverse push order.
addq $56, %rsp
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %r12
.cfi_def_cfa_offset 40
popq %r13
.cfi_def_cfa_offset 32
popq %r14
.cfi_def_cfa_offset 24
popq %r15
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
retq
.Lfunc_end1:
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
.cfi_endproc
# -- End function
# -- Begin function computeForceLJFullNeigh_simd
# ---------------------------------------------------------------------------
# computeForceLJFullNeigh_simd  (GAS / AT&T syntax)
#
# Stub variant of the SIMD kernel: it clears the accumulator buffer, takes a
# timestamp, opens the "force" marker region, writes the 65-byte
# "not implemented" message to stderr and terminates via exit(-1).
# It never returns to its caller.
#
# In:   %rsi = second argument: int at offset 4 (entry count, presumably the
#              number of local atoms -- confirm) and pointer at offset 64
#       %rdi, %rdx, %rcx are not read.
# Out:  none (no return)
# Stack: one 8-byte push, solely to keep %rsp 16-byte aligned at the calls.
# ---------------------------------------------------------------------------
    .type computeForceLJFullNeigh_simd,@function
    .globl computeForceLJFullNeigh_simd
    .p2align 4, 0x90
computeForceLJFullNeigh_simd:
.LcomputeForceLJFullNeigh_simd$local:
    .cfi_startproc
    # The pushed value is irrelevant; the push only realigns the stack.
    pushq %rax
    .cfi_def_cfa_offset 16
    # Zero-extending load of the count; skip the clear when it is <= 0.
    movl 4(%rsi), %eax
    testl %eax, %eax
    jle .Lsimd_buffer_cleared
    # Size in bytes = count * 3 * 8.
    leaq (%rax,%rax,2), %rdx
    shlq $3, %rdx
    # Fetch the destination before %esi is overwritten with the fill value.
    movq 64(%rsi), %rdi
    xorl %esi, %esi
    callq _intel_fast_memset
.Lsimd_buffer_cleared:
    # %al = 0: no vector-register arguments for this call.
    xorl %eax, %eax
    callq getTimeStamp
    movl $.L.str, %edi
    callq likwid_markerStartRegion
    # fwrite(.L.str.3, 65, 1, stderr)
    movl $.L.str.3, %edi
    movl $1, %edx
    movl $65, %esi
    movq stderr(%rip), %rcx
    callq fwrite
    # exit(-1): control does not come back, hence no epilogue or ret.
    movl $-1, %edi
    callq exit
.Lfunc_end2:
    .size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
    .cfi_endproc
# -- End function
# Read-only, NUL-terminated string literals ("aMS" with entity size 1:
# allocatable, mergeable strings).
# .L.str: marker region name passed to likwid_markerStartRegion by
# computeForceLJFullNeigh_simd.
.type .L.str,@object #
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "force"
.size .L.str, 6
# .L.str.1: marker region name used for both the start and the stop call in
# computeForceLJHalfNeigh.
.type .L.str.1,@object #
.L.str.1:
.asciz "forceLJ-halfneigh"
.size .L.str.1, 18
# .L.str.2: printf format of the computeForceLJHalfNeigh report
# (one unsigned integer followed by three doubles).
# NOTE(review): the declared size is 52 bytes, but the literal as it appears
# here is 50 bytes including the NUL -- whitespace inside the string may have
# been collapsed in this copy; confirm against the compiler output.
.type .L.str.2,@object #
.L.str.2:
.asciz "Its: %u Freq: %f Time: %f\nCy/it: %f (half-neigh)\n"
.size .L.str.2, 52
# .L.str.3: message written to stderr by computeForceLJFullNeigh_simd; the
# fwrite there passes 65 bytes, i.e. this text without its terminating NUL.
.type .L.str.3,@object #
.L.str.3:
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
.size .L.str.3, 66
# Producer identification recorded by the compiler.
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
# Empty .note.GNU-stack section: tells the linker this object does not need
# an executable stack.
.section ".note.GNU-stack","",@progbits