added static analysis and likwid files
This commit is contained in:
@@ -0,0 +1,88 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1062.9120
|
||||
Estimated atom data volume (kB): 6.1440
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2735, Mega atom updates/s: 0.1872
|
||||
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 127.3632
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 6553600
|
||||
Useful read data volume for force computation: 1.47GB
|
||||
Cycles/SIMD iteration: 83.4598
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.110776 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 267036300 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 10.9296 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8643 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1367 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 9124 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1354 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 9138 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1356 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 5586 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1297 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 5328 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1269 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 5280 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1295 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.1108 |
|
||||
| Runtime unhalted [s] | 0.0878 |
|
||||
| Clock [MHz] | 1995.2564 |
|
||||
| CPI | 0.8202 |
|
||||
| Energy [J] | 10.9296 |
|
||||
| Power [W] | 98.6643 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 14233.3287 |
|
||||
| AVX DP [MFLOP/s] | 14231.8898 |
|
||||
| Packed [MUOPS/s] | 1778.9862 |
|
||||
| Scalar [MUOPS/s] | 1.4389 |
|
||||
| Memory read bandwidth [MBytes/s] | 24.9001 |
|
||||
| Memory read data volume [GBytes] | 0.0028 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.5861 |
|
||||
| Memory write data volume [GBytes] | 0.0005 |
|
||||
| Memory bandwidth [MBytes/s] | 29.4863 |
|
||||
| Memory data volume [GBytes] | 0.0033 |
|
||||
| Operational intensity | 482.7104 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: double
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200895e-01 6.923143e-01
|
||||
200 7.961495e-01 6.721043e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.28 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 8, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0352
|
||||
Average SIMD iterations per atom: 9.9181
|
||||
Total number of computed pair interactions: 2003182862
|
||||
Total number of SIMD iterations: 261297661
|
||||
Useful read data volume for force computation: 57.46GB
|
||||
Cycles/SIMD iteration: 40.4432
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.115807 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 12592470000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 307.9429 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 22734550 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1147714 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 22755180 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1144415 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 22762780 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1129051 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 22905660 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1143324 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 22914860 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1169116 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 22890220 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1180739 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.1158 |
|
||||
| Runtime unhalted [s] | 4.0885 |
|
||||
| Clock [MHz] | 1995.2508 |
|
||||
| CPI | 0.8098 |
|
||||
| Energy [J] | 307.9429 |
|
||||
| Power [W] | 60.1944 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 12644.6041 |
|
||||
| AVX DP [MFLOP/s] | 12629.1535 |
|
||||
| Packed [MUOPS/s] | 1578.6442 |
|
||||
| Scalar [MUOPS/s] | 15.4506 |
|
||||
| Memory read bandwidth [MBytes/s] | 1713.4438 |
|
||||
| Memory read data volume [GBytes] | 8.7656 |
|
||||
| Memory write bandwidth [MBytes/s] | 86.5003 |
|
||||
| Memory write data volume [GBytes] | 0.4425 |
|
||||
| Memory bandwidth [MBytes/s] | 1799.9442 |
|
||||
| Memory data volume [GBytes] | 9.2082 |
|
||||
| Operational intensity | 7.0250 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_DP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.897385 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18212540000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 338.9000 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 2086787 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1115626 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 2089964 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1117021 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 2103832 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1117965 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 2086930 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1102471 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 2094688 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1103018 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 2097438 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1102525 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.8974 |
|
||||
| Runtime unhalted [s] | 4.7026 |
|
||||
| Clock [MHz] | 1995.2473 |
|
||||
| CPI | 0.6440 |
|
||||
| Energy [J] | 338.9000 |
|
||||
| Power [W] | 57.4661 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| DP [MFLOP/s] | 1059.4978 |
|
||||
| AVX DP [MFLOP/s] | 1.3335 |
|
||||
| Packed [MUOPS/s] | 0.1667 |
|
||||
| Scalar [MUOPS/s] | 1058.1643 |
|
||||
| Memory read bandwidth [MBytes/s] | 136.3006 |
|
||||
| Memory read data volume [GBytes] | 0.8038 |
|
||||
| Memory write bandwidth [MBytes/s] | 72.2612 |
|
||||
| Memory write data volume [GBytes] | 0.4262 |
|
||||
| Memory bandwidth [MBytes/s] | 208.5618 |
|
||||
| Memory data volume [GBytes] | 1.2300 |
|
||||
| Operational intensity | 5.0800 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Initializing parameters...
|
||||
Initializing atoms...
|
||||
Creating atoms...
|
||||
Pattern: seq
|
||||
Number of timesteps: 200
|
||||
Number of atoms: 256
|
||||
Number of neighbors per atom: 1024
|
||||
Number of times to replicate neighbor lists: 1
|
||||
Estimated total data volume (kB): 1056.7680
|
||||
Estimated atom data volume (kB): 3.0720
|
||||
Estimated neighborlist data volume (kB): 1050.6240
|
||||
Initializing neighbor lists...
|
||||
Creating neighbor lists...
|
||||
Computing forces...
|
||||
Total time: 0.2466, Mega atom updates/s: 0.2076
|
||||
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 1018.9055
|
||||
Average SIMD iterations per atom: 63.6816
|
||||
Total number of computed pair interactions: 52428800
|
||||
Total number of SIMD iterations: 3276800
|
||||
Useful read data volume for force computation: 0.84GB
|
||||
Cycles/SIMD iteration: 150.4999
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 0.085843 |
|
||||
| call count | 200 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 129769100 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 9.2849 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 8354 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 1126 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 7863 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 1105 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 7990 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 1113 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 4775 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 1112 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 4201 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 1127 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 4035 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 1120 |
|
||||
+------------------------------------------+---------+------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 0.0858 |
|
||||
| Runtime unhalted [s] | 0.0691 |
|
||||
| Clock [MHz] | 1995.2787 |
|
||||
| CPI | 1.3277 |
|
||||
| Energy [J] | 9.2849 |
|
||||
| Power [W] | 108.1610 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 16606.5397 |
|
||||
| AVX SP [MFLOP/s] | 16604.7458 |
|
||||
| Packed [MUOPS/s] | 1037.7966 |
|
||||
| Scalar [MUOPS/s] | 1.7940 |
|
||||
| Memory read bandwidth [MBytes/s] | 27.7476 |
|
||||
| Memory read data volume [GBytes] | 0.0024 |
|
||||
| Memory write bandwidth [MBytes/s] | 4.9974 |
|
||||
| Memory write data volume [GBytes] | 0.0004 |
|
||||
| Memory bandwidth [MBytes/s] | 32.7450 |
|
||||
| Memory data volume [GBytes] | 0.0028 |
|
||||
| Operational intensity | 507.1471 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
@@ -0,0 +1,168 @@
|
||||
--------------------------------------------------------------------------------
|
||||
CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
|
||||
CPU type: Intel Cascadelake SP processor
|
||||
CPU clock: 2.49 GHz
|
||||
--------------------------------------------------------------------------------
|
||||
Parameters:
|
||||
Force field: lj
|
||||
Kernel: plain-C
|
||||
Data layout: AoS
|
||||
Floating-point precision: single
|
||||
Unit cells (nx, ny, nz): 32, 32, 32
|
||||
Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
|
||||
Periodic (x, y, z): 1, 1, 1
|
||||
Lattice size: 1.679596e+00
|
||||
Epsilon: 1.000000e+00
|
||||
Sigma: 1.000000e+00
|
||||
Spring constant: 1.000000e+00
|
||||
Damping constant: 1.000000e+00
|
||||
Temperature: 1.440000e+00
|
||||
RHO: 8.442000e-01
|
||||
Mass: 1.000000e+00
|
||||
Number of types: 4
|
||||
Number of timesteps: 200
|
||||
Report stats every (timesteps): 100
|
||||
Reneighbor every (timesteps): 20
|
||||
Prune every (timesteps): 1000
|
||||
Output positions every (timesteps): 20
|
||||
Output velocities every (timesteps): 5
|
||||
Delta time (dt): 5.000000e-03
|
||||
Cutoff radius: 2.500000e+00
|
||||
Skin: 3.000000e-01
|
||||
Half neighbor lists: 0
|
||||
Processor frequency (GHz): 2.0000
|
||||
----------------------------------------------------------------------------
|
||||
step temp pressure
|
||||
0 1.440000e+00 1.215639e+00
|
||||
100 8.200897e-01 6.923144e-01
|
||||
200 7.961481e-01 6.721031e-01
|
||||
----------------------------------------------------------------------------
|
||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
|
||||
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
|
||||
----------------------------------------------------------------------------
|
||||
Performance: 2.42 million atom updates per second
|
||||
Statistics:
|
||||
Vector width: 16, Processor frequency: 2.0000 GHz
|
||||
Average neighbors per atom: 76.0351
|
||||
Average SIMD iterations per atom: 5.0875
|
||||
Total number of computed pair interactions: 2003181259
|
||||
Total number of SIMD iterations: 134032075
|
||||
Useful read data volume for force computation: 32.79GB
|
||||
Cycles/SIMD iteration: 68.9511
|
||||
--------------------------------------------------------------------------------
|
||||
Region force, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 4.452877 |
|
||||
| call count | 201 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 7428719000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 265.5057 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 19716700 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 595747 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 19734880 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 597090 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 19732800 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 595219 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 19886430 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 632443 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 19887210 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 633169 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 19935560 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 634112 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 4.4529 |
|
||||
| Runtime unhalted [s] | 3.5585 |
|
||||
| Clock [MHz] | 1995.2693 |
|
||||
| CPI | 1.1947 |
|
||||
| Energy [J] | 265.5057 |
|
||||
| Power [W] | 59.6257 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 14156.9661 |
|
||||
| AVX SP [MFLOP/s] | 14139.2165 |
|
||||
| Packed [MUOPS/s] | 883.7010 |
|
||||
| Scalar [MUOPS/s] | 17.7496 |
|
||||
| Memory read bandwidth [MBytes/s] | 1708.8254 |
|
||||
| Memory read data volume [GBytes] | 7.6092 |
|
||||
| Memory write bandwidth [MBytes/s] | 53.0035 |
|
||||
| Memory write data volume [GBytes] | 0.2360 |
|
||||
| Memory bandwidth [MBytes/s] | 1761.8288 |
|
||||
| Memory data volume [GBytes] | 7.8452 |
|
||||
| Operational intensity | 8.0354 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
Region reneighbour, Group 1: MEM_SP
|
||||
+-------------------+------------+
|
||||
| Region Info | HWThread 0 |
|
||||
+-------------------+------------+
|
||||
| RDTSC Runtime [s] | 5.935627 |
|
||||
| call count | 10 |
|
||||
+-------------------+------------+
|
||||
|
||||
+------------------------------------------+---------+-------------+
|
||||
| Event | Counter | HWThread 0 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
| INSTR_RETIRED_ANY | FIXC0 | 18208530000 |
|
||||
| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 |
|
||||
| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 |
|
||||
| PWR_PKG_ENERGY | PWR0 | 340.7903 |
|
||||
| PWR_DRAM_ENERGY | PWR3 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 |
|
||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 |
|
||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 |
|
||||
| CAS_COUNT_RD | MBOX0C0 | 1772377 |
|
||||
| CAS_COUNT_WR | MBOX0C1 | 975760 |
|
||||
| CAS_COUNT_RD | MBOX1C0 | 1770611 |
|
||||
| CAS_COUNT_WR | MBOX1C1 | 977433 |
|
||||
| CAS_COUNT_RD | MBOX2C0 | 1771722 |
|
||||
| CAS_COUNT_WR | MBOX2C1 | 979122 |
|
||||
| CAS_COUNT_RD | MBOX3C0 | 1782901 |
|
||||
| CAS_COUNT_WR | MBOX3C1 | 967621 |
|
||||
| CAS_COUNT_RD | MBOX4C0 | 1780789 |
|
||||
| CAS_COUNT_WR | MBOX4C1 | 967179 |
|
||||
| CAS_COUNT_RD | MBOX5C0 | 1784733 |
|
||||
| CAS_COUNT_WR | MBOX5C1 | 969349 |
|
||||
+------------------------------------------+---------+-------------+
|
||||
|
||||
+-----------------------------------+------------+
|
||||
| Metric | HWThread 0 |
|
||||
+-----------------------------------+------------+
|
||||
| Runtime (RDTSC) [s] | 5.9356 |
|
||||
| Runtime unhalted [s] | 4.7334 |
|
||||
| Clock [MHz] | 1995.2675 |
|
||||
| CPI | 0.6483 |
|
||||
| Energy [J] | 340.7903 |
|
||||
| Power [W] | 57.4144 |
|
||||
| Energy DRAM [J] | 0 |
|
||||
| Power DRAM [W] | 0 |
|
||||
| SP [MFLOP/s] | 1052.6723 |
|
||||
| AVX SP [MFLOP/s] | 1.3249 |
|
||||
| Packed [MUOPS/s] | 0.0828 |
|
||||
| Scalar [MUOPS/s] | 1051.3474 |
|
||||
| Memory read bandwidth [MBytes/s] | 114.9736 |
|
||||
| Memory read data volume [GBytes] | 0.6824 |
|
||||
| Memory write bandwidth [MBytes/s] | 62.9308 |
|
||||
| Memory write data volume [GBytes] | 0.3735 |
|
||||
| Memory bandwidth [MBytes/s] | 177.9044 |
|
||||
| Memory data volume [GBytes] | 1.0560 |
|
||||
| Operational intensity | 5.9171 |
|
||||
+-----------------------------------+------------+
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,676 @@
|
||||
.text
|
||||
.file "force_lj.c"
|
||||
# Read-only constant pool for computeForceLJFullNeigh_plain_c (8-byte double literals).
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c
|
||||
# 48.0 (IEEE-754 double bit pattern). The function prologue multiplies the
# scalar loaded from 40(%rdi) by this value and broadcasts the product for the
# vector loop.
.LCPI0_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
# 1.0 (double). Broadcast in the .LBB0_9 loop as the numerator of vdivpd, i.e.
# it produces the reciprocal of the squared distance accumulated in ymm6.
.LCPI0_3:
|
||||
.quad 4607182418800017408 # 1
|
||||
# -0.5 (double). Broadcast in the .LBB0_9 loop and added (vaddpd) to the
# product held in ymm11 before the final multiplies.
.LCPI0_4:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
# 4-byte integer literals used for index arithmetic in the .LBB0_9 loop.
.section .rodata.cst4,"aM",@progbits,4
|
||||
.p2align 2
|
||||
# Integer 3. Broadcast and multiplied (vpmulld) with the four 32-bit indices
# loaded from (%r11,%rbp,4); the scaled index is then shifted left by 3 and
# added to the base pointer, so each index addresses a group of three 8-byte
# elements.
.LCPI0_1:
|
||||
.long 3 # 0x3
|
||||
# Integer 2. Broadcast and added (vpaddd) to the scaled index to address the
# third element of the group.
.LCPI0_2:
|
||||
.long 2 # 0x2
|
||||
# 16-byte vector literal.
.section .rodata.cst16,"aM",@progbits,16
|
||||
.p2align 4
|
||||
# 16 bytes of 0xFF, i.e. four 32-bit lanes of -1. The loop subtracts it
# (vpsubd) from the scaled index, which adds 1 and addresses the second
# element of the group.
# NOTE(review): the loop references this label with absolute addressing
# (no %rip), unlike the other constants -- confirm the object is not meant
# to be linked as PIE.
.LCPI0_5:
|
||||
.zero 16,255
|
||||
.text
|
||||
.globl computeForceLJFullNeigh_plain_c
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJFullNeigh_plain_c,@function
|
||||
computeForceLJFullNeigh_plain_c: #
|
||||
.LcomputeForceLJFullNeigh_plain_c$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 320
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
movq %rcx, %rbx
|
||||
movq %rdx, %r15
|
||||
movq %rsi, %r12
|
||||
movl 4(%rsi), %r14d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 128(%rsp) # 8-byte Spill
|
||||
vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovdqa %xmm0, 80(%rsp) # 16-byte Spill
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_2
|
||||
# %bb.1: #
|
||||
movq 64(%r12), %rdi
|
||||
leaq (,%r14,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB0_2: #
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill
|
||||
movl $.L.str, %edi
|
||||
callq likwid_markerStartRegion
|
||||
testl %r14d, %r14d
|
||||
jle .LBB0_19
|
||||
# %bb.3: #
|
||||
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm13
|
||||
movq 16(%r15), %r11
|
||||
movq 24(%r15), %rsi
|
||||
movslq 8(%r15), %rdi
|
||||
movq 16(%r12), %r15
|
||||
movq 64(%r12), %r8
|
||||
vmovsd 128(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI0_0(%rip), %xmm0, %xmm15
|
||||
movq %rbx, 24(%rsp) # 8-byte Spill
|
||||
vmovdqu (%rbx), %xmm14
|
||||
decq %r14
|
||||
vmovq %r15, %xmm0
|
||||
vpbroadcastq %xmm0, %ymm3
|
||||
vbroadcastsd %xmm13, %ymm2
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vbroadcastsd %xmm12, %ymm8
|
||||
vbroadcastsd %xmm15, %ymm9
|
||||
shlq $2, %rdi
|
||||
xorl %r10d, %r10d
|
||||
movq %r14, 56(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm13, 192(%rsp) # 16-byte Spill
|
||||
movq %rsi, 48(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovapd %xmm15, 176(%rsp) # 16-byte Spill
|
||||
vmovupd %ymm2, 224(%rsp) # 32-byte Spill
|
||||
vmovupd %ymm9, 128(%rsp) # 32-byte Spill
|
||||
jmp .LBB0_6
|
||||
.p2align 4, 0x90
|
||||
.LBB0_17: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %r13, %rdx
|
||||
.LBB0_5: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vaddsd (%r8,%r12,8), %xmm10, %xmm0
|
||||
vmovsd %xmm0, (%r8,%r12,8)
|
||||
vaddsd (%r8,%rbx,8), %xmm11, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbx,8)
|
||||
vaddsd (%r8,%rbp,8), %xmm5, %xmm0
|
||||
vmovsd %xmm0, (%r8,%rbp,8)
|
||||
leal 3(%r13), %eax
|
||||
addl $6, %r13d
|
||||
testl %eax, %eax
|
||||
cmovnsl %eax, %r13d
|
||||
sarl $2, %r13d
|
||||
movslq %r13d, %rax
|
||||
vmovq %rax, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm14, %xmm14
|
||||
addq %rdi, %r11
|
||||
cmpq %r14, %r10
|
||||
leaq 1(%r10), %r10
|
||||
je .LBB0_18
|
||||
.LBB0_6: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB0_9 Depth 2
|
||||
# Child Loop BB0_13 Depth 2
|
||||
movl (%rsi,%r10,4), %r13d
|
||||
leal (%r10,%r10,2), %r12d
|
||||
leal (%r10,%r10,2), %ebx
|
||||
incl %ebx
|
||||
leal (%r10,%r10,2), %ebp
|
||||
addl $2, %ebp
|
||||
testl %r13d, %r13d
|
||||
jle .LBB0_4
|
||||
# %bb.7: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero
|
||||
movq %r13, %rdx
|
||||
movl $4294967292, %eax # imm = 0xFFFFFFFC
|
||||
andq %rax, %rdx
|
||||
vmovapd %xmm0, 112(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm1, 96(%rsp) # 16-byte Spill
|
||||
vmovapd %xmm2, (%rsp) # 16-byte Spill
|
||||
je .LBB0_16
|
||||
# %bb.8: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movq %rbp, 64(%rsp) # 8-byte Spill
|
||||
movq %rbx, 72(%rsp) # 8-byte Spill
|
||||
vmovdqa %xmm14, 208(%rsp) # 16-byte Spill
|
||||
vbroadcastsd %xmm0, %ymm14
|
||||
vbroadcastsd %xmm1, %ymm5
|
||||
vbroadcastsd %xmm2, %ymm10
|
||||
vxorpd %xmm0, %xmm0, %xmm0
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
xorl %ebp, %ebp
|
||||
vmovapd %ymm8, %ymm9
|
||||
vmovupd 224(%rsp), %ymm8 # 32-byte Reload
|
||||
.p2align 4, 0x90
|
||||
movl $111, %ebx # OSACA START MARKER
|
||||
.byte 100 # OSACA START MARKER
|
||||
.byte 103 # OSACA START MARKER
|
||||
.byte 144 # OSACA START MARKER
|
||||
# LLVM-MCA-BEGIN
|
||||
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
|
||||
.LBB0_9: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
|
||||
vpmulld (%r11,%rbp,4), %xmm1, %xmm11
|
||||
vpmovsxdq %xmm11, %ymm1
|
||||
vpsllq $3, %ymm1, %ymm1
|
||||
vpaddq %ymm1, %ymm3, %ymm1
|
||||
vmovq %xmm1, %r14
|
||||
vpextrq $1, %xmm1, %r9
|
||||
vextracti128 $1, %ymm1, %xmm1
|
||||
vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero
|
||||
vpsubd .LCPI0_5, %xmm11, %xmm6
|
||||
vpmovsxdq %xmm6, %ymm6
|
||||
vpsllq $3, %ymm6, %ymm6
|
||||
vmovq %xmm1, %rdi
|
||||
vpaddq %ymm6, %ymm3, %ymm6
|
||||
vmovq %xmm6, %rcx
|
||||
vpextrq $1, %xmm1, %rbx
|
||||
vpextrq $1, %xmm6, %rax
|
||||
vextracti128 $1, %ymm6, %xmm1
|
||||
vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero
|
||||
vmovq %xmm1, %rdi
|
||||
vpextrq $1, %xmm1, %rsi
|
||||
vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero
|
||||
vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero
|
||||
vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
|
||||
vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0]
|
||||
vpaddd %xmm12, %xmm11, %xmm4
|
||||
vpmovsxdq %xmm4, %ymm4
|
||||
vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0]
|
||||
vpsllq $3, %ymm4, %ymm4
|
||||
vpaddq %ymm4, %ymm3, %ymm4
|
||||
vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vpextrq $1, %xmm4, %rax
|
||||
vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0]
|
||||
vmovq %xmm4, %rcx
|
||||
vextracti128 $1, %ymm4, %xmm4
|
||||
vmovq %xmm4, %rsi
|
||||
vinsertf128 $1, %xmm6, %ymm2, %ymm2
|
||||
vpextrq $1, %xmm4, %rdi
|
||||
vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero
|
||||
vsubpd %ymm2, %ymm14, %ymm2
|
||||
vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0]
|
||||
vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero
|
||||
vinsertf128 $1, %xmm1, %ymm7, %ymm1
|
||||
vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0]
|
||||
vinsertf128 $1, %xmm4, %ymm6, %ymm4
|
||||
vsubpd %ymm1, %ymm5, %ymm1
|
||||
vsubpd %ymm4, %ymm10, %ymm4
|
||||
vmulpd %ymm2, %ymm2, %ymm6
|
||||
vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
|
||||
vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
|
||||
vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
|
||||
vdivpd %ymm6, %ymm7, %ymm7
|
||||
vmulpd %ymm7, %ymm7, %ymm11
|
||||
vmulpd %ymm9, %ymm11, %ymm11
|
||||
vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
|
||||
vmulpd %ymm7, %ymm11, %ymm11
|
||||
vaddpd %ymm12, %ymm11, %ymm12
|
||||
vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
|
||||
vmulpd %ymm7, %ymm11, %ymm7
|
||||
vmulpd %ymm7, %ymm12, %ymm7
|
||||
vcmpltpd %ymm8, %ymm6, %ymm6
|
||||
vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
|
||||
vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
|
||||
vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
|
||||
vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
|
||||
vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
|
||||
vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
|
||||
addq $4, %rbp
|
||||
cmpq %rdx, %rbp
|
||||
jb .LBB0_9
|
||||
# LLVM-MCA-END
|
||||
movl $222, %ebx # OSACA END MARKER
|
||||
.byte 100 # OSACA END MARKER
|
||||
.byte 103 # OSACA END MARKER
|
||||
.byte 144 # OSACA END MARKER
|
||||
# %bb.10: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
|
||||
vaddsd %xmm1, %xmm0, %xmm1
|
||||
vextractf128 $1, %ymm0, %xmm0
|
||||
vaddsd %xmm0, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
|
||||
vaddsd %xmm0, %xmm1, %xmm10
|
||||
vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
|
||||
vaddsd %xmm1, %xmm15, %xmm1
|
||||
vextractf128 $1, %ymm15, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm11
|
||||
vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
|
||||
vaddsd %xmm1, %xmm13, %xmm1
|
||||
vextractf128 $1, %ymm13, %xmm2
|
||||
vaddsd %xmm2, %xmm1, %xmm1
|
||||
vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
|
||||
vaddsd %xmm2, %xmm1, %xmm5
|
||||
movq 56(%rsp), %r14 # 8-byte Reload
|
||||
vmovapd 80(%rsp), %xmm12 # 16-byte Reload
|
||||
vmovapd 192(%rsp), %xmm13 # 16-byte Reload
|
||||
movq 48(%rsp), %rsi # 8-byte Reload
|
||||
movq 40(%rsp), %rdi # 8-byte Reload
|
||||
vmovdqa 208(%rsp), %xmm14 # 16-byte Reload
|
||||
vmovapd 176(%rsp), %xmm15 # 16-byte Reload
|
||||
vmovapd %ymm9, %ymm8
|
||||
movq 72(%rsp), %rbx # 8-byte Reload
|
||||
movq 64(%rsp), %rbp # 8-byte Reload
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
jmp .LBB0_11
|
||||
.p2align 4, 0x90
|
||||
.LBB0_4: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
movslq %r13d, %rdx
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
jmp .LBB0_5
|
||||
.p2align 4, 0x90
|
||||
.LBB0_16: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vxorpd %xmm10, %xmm10, %xmm10
|
||||
vxorpd %xmm11, %xmm11, %xmm11
|
||||
vxorpd %xmm5, %xmm5, %xmm5
|
||||
cmpq %r13, %rdx
|
||||
jae .LBB0_17
|
||||
.LBB0_11: #
|
||||
# in Loop: Header=BB0_6 Depth=1
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
jmp .LBB0_13
|
||||
.p2align 4, 0x90
|
||||
.LBB0_12: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
incq %rdx
|
||||
cmpq %rdx, %r13
|
||||
je .LBB0_17
|
||||
.LBB0_13: #
|
||||
# Parent Loop BB0_6 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
movl (%r11,%rdx,4), %eax
|
||||
leal (%rax,%rax,2), %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm0, %xmm6
|
||||
leal (%rax,%rax,2), %ecx
|
||||
incl %ecx
|
||||
movslq %ecx, %rcx
|
||||
vsubsd (%r15,%rcx,8), %xmm4, %xmm2
|
||||
leal 2(%rax,%rax,2), %eax
|
||||
cltq
|
||||
vmovapd (%rsp), %xmm1 # 16-byte Reload
|
||||
vsubsd (%r15,%rax,8), %xmm1, %xmm1
|
||||
vmulsd %xmm6, %xmm6, %xmm7
|
||||
vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
|
||||
vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
|
||||
vucomisd %xmm13, %xmm7
|
||||
jae .LBB0_12
|
||||
# %bb.14: #
|
||||
# in Loop: Header=BB0_13 Depth=2
|
||||
vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero
|
||||
vdivsd %xmm7, %xmm0, %xmm7
|
||||
vmulsd %xmm7, %xmm7, %xmm0
|
||||
vmulsd %xmm0, %xmm12, %xmm0
|
||||
vmulsd %xmm7, %xmm0, %xmm0
|
||||
vaddsd .LCPI0_4(%rip), %xmm0, %xmm4
|
||||
vmulsd %xmm7, %xmm15, %xmm7
|
||||
vmulsd %xmm0, %xmm7, %xmm0
|
||||
vmulsd %xmm4, %xmm0, %xmm0
|
||||
vmovapd 96(%rsp), %xmm4 # 16-byte Reload
|
||||
vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
|
||||
vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
|
||||
vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
|
||||
vmovapd 112(%rsp), %xmm0 # 16-byte Reload
|
||||
jmp .LBB0_12
|
||||
.LBB0_18: #
|
||||
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm14, (%rax)
|
||||
.LBB0_19: #
|
||||
movl $.L.str, %edi
|
||||
vzeroupper
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload
|
||||
addq $264, %rsp # imm = 0x108
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end0:
|
||||
.size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
# Read-only 8-byte constants used by computeForceLJHalfNeigh.
# Each .quad is the raw IEEE-754 binary64 bit pattern of the double named in
# the trailing compiler comment; all are accessed RIP-relative.
.section .rodata.cst8,"aM",@progbits,8
|
||||
.p2align 3 # -- Begin function computeForceLJHalfNeigh
|
||||
# 48.0: multiplied once, before the atom loop, with the double read from
# offset 40 of the first argument.
.LCPI1_0:
|
||||
.quad 4631952216750555136 # 48
|
||||
# 1.0: numerator of the reciprocal squared distance (1.0 / rsq).
.LCPI1_1:
|
||||
.quad 4607182418800017408 # 1
|
||||
# -0.5: added to the sixth-power term inside the pair-force expression.
.LCPI1_2:
|
||||
.quad -4620693217682128896 # -0.5
|
||||
# 1.0E+9: scales the double read from offset 264 of the first argument in the
# post-loop statistics (presumably GHz -> Hz; confirm against the C source).
.LCPI1_3:
|
||||
.quad 4741671816366391296 # 1.0E+9
|
||||
.text
|
||||
.globl computeForceLJHalfNeigh
|
||||
.p2align 4, 0x90
|
||||
.type computeForceLJHalfNeigh,@function
|
||||
#-----------------------------------------------------------------------
# computeForceLJHalfNeigh -- scalar pair-force kernel over per-atom neighbor
# lists; each computed pair force is added to atom i and, when j is below the
# atom count, subtracted from atom j.
#
# ABI: System V AMD64 (compiler-generated, AT&T syntax).
# In:  %rdi = arg0: struct read at +40, +56, +144, +264 (doubles)
#      %rsi = arg1: struct read at +4 (int count), +16 and +64 (pointers to
#             arrays of 3 doubles per atom; +16 is only read, +64 is zeroed
#             and accumulated into -- presumably positions and forces)
#      %rdx = arg2: struct read at +8 (int row stride), +16 (int index array),
#             +24 (int per-atom neighbor counts)
#      %rcx = arg3: pointer to two adjacent 64-bit counters (updated)
#      NOTE(review): presumably (Parameter*, Atom*, Neighbor*, Stats*) --
#      confirm against the C source.
# Out: %xmm0 = elapsed time between the two getTimeStamp calls.
#
# Stack slots (56 bytes of locals):
#    (%rsp) : double from arg0+144 until it is squared; then reused for the
#             pointer loaded from arg2+24
#   8(%rsp) : double from arg0+40 until it is scaled by 48; then reused as
#             the spill slot for %rbp (index 3*i+1)
#  16(%rsp) : double from arg0+56 (read in the inner loop); reused after the
#             loops for the elapsed time
#  24(%rsp) : arg3 pointer
#  32(%rsp) : start timestamp
#  40(%rsp) : arg0 pointer
#  48(%rsp) : neighbor-row stride in bytes (4 * int at arg2+8)
#-----------------------------------------------------------------------
computeForceLJHalfNeigh: #
|
||||
.LcomputeForceLJHalfNeigh$local:
|
||||
.cfi_startproc
|
||||
# %bb.0: #
|
||||
# Prologue: save all six callee-saved GPRs. 6 pushes + return address = 56
# bytes, plus the 56-byte frame below = 112, so %rsp is 16-byte aligned at
# every call site.
pushq %rbp
|
||||
.cfi_def_cfa_offset 16
|
||||
pushq %r15
|
||||
.cfi_def_cfa_offset 24
|
||||
pushq %r14
|
||||
.cfi_def_cfa_offset 32
|
||||
pushq %r13
|
||||
.cfi_def_cfa_offset 40
|
||||
pushq %r12
|
||||
.cfi_def_cfa_offset 48
|
||||
pushq %rbx
|
||||
.cfi_def_cfa_offset 56
|
||||
subq $56, %rsp
|
||||
.cfi_def_cfa_offset 112
|
||||
.cfi_offset %rbx, -56
|
||||
.cfi_offset %r12, -48
|
||||
.cfi_offset %r13, -40
|
||||
.cfi_offset %r14, -32
|
||||
.cfi_offset %r15, -24
|
||||
.cfi_offset %rbp, -16
|
||||
# Stash the arguments and the scalar parameters that must survive the calls.
movq %rcx, 24(%rsp) # 8-byte Spill
|
||||
movq %rdx, %r12
|
||||
movq %rsi, %r13
|
||||
# r15 = atom count (32-bit load, zero-extended into the full register).
movl 4(%rsi), %r15d
|
||||
vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, (%rsp) # 8-byte Spill
|
||||
vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 8(%rsp) # 8-byte Spill
|
||||
movq %rdi, 40(%rsp) # 8-byte Spill
|
||||
vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero
|
||||
vmovsd %xmm0, 16(%rsp) # 8-byte Spill
|
||||
# Skip the clearing step when the atom count is <= 0.
testl %r15d, %r15d
|
||||
jle .LBB1_2
|
||||
# %bb.1: #
|
||||
# _intel_fast_memset(arg1->[64], 0, count * 24): zero 3 doubles per atom.
movq 64(%r13), %rdi
|
||||
leaq (,%r15,8), %rax
|
||||
leaq (%rax,%rax,2), %rdx
|
||||
xorl %esi, %esi
|
||||
callq _intel_fast_memset
|
||||
.LBB1_2: #
|
||||
# r14d = running neighbor total (stays 0 if the loops are skipped).
xorl %r14d, %r14d
|
||||
# Record the start timestamp, then open the marker region named by .L.str.1.
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
vmovsd %xmm0, 32(%rsp) # 8-byte Spill
|
||||
movl $.L.str.1, %edi
|
||||
callq likwid_markerStartRegion
|
||||
# No atoms: go straight to the stop-marker / statistics epilogue.
testl %r15d, %r15d
|
||||
jle .LBB1_8
|
||||
# %bb.3: #
|
||||
# Loop-invariant setup.
# xmm12 = (double at arg0+144)^2, the squared-distance cutoff.
vmovsd (%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd %xmm0, %xmm0, %xmm12
|
||||
# rax = current neighbor row (int indices); advanced by one row per atom.
movq 16(%r12), %rax
|
||||
# (%rsp) now holds the per-atom neighbor-count array pointer.
movq 24(%r12), %rcx
|
||||
movq %rcx, (%rsp) # 8-byte Spill
|
||||
movslq 8(%r12), %rdx
|
||||
# rsi = read-only coordinate array, rdi = accumulation array (3 doubles/atom).
movq 16(%r13), %rsi
|
||||
movq 64(%r13), %rdi
|
||||
# xmm11 = 48.0 * (double at arg0+40).
vmovsd 8(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
vmulsd .LCPI1_0(%rip), %xmm0, %xmm11
|
||||
# xmm10 = the two 64-bit counters at arg3, kept in a register during the loop.
movq 24(%rsp), %rcx # 8-byte Reload
|
||||
vmovdqu (%rcx), %xmm10
|
||||
# 48(%rsp) = row stride in bytes (4-byte entries).
shlq $2, %rdx
|
||||
movq %rdx, 48(%rsp) # 8-byte Spill
|
||||
# r13 = atom index i, r14d = neighbor total.
xorl %r13d, %r13d
|
||||
xorl %r14d, %r14d
|
||||
jmp .LBB1_4
|
||||
.p2align 4, 0x90
|
||||
.LBB1_14: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Inner loop finished: restore rbp = 3*i+1 (the inner loop reuses rbp).
movq 8(%rsp), %rbp # 8-byte Reload
|
||||
.LBB1_6: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Outer-loop tail: fold the per-atom sums (xmm14/xmm15/xmm13) into the
# accumulation array at indices 3*i, 3*i+1, 3*i+2.
addl %r10d, %r14d
|
||||
vaddsd (%rdi,%r12,8), %xmm14, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r12,8)
|
||||
vaddsd (%rdi,%rbp,8), %xmm15, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rbp,8)
|
||||
vaddsd (%rdi,%r11,8), %xmm13, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%r11,8)
|
||||
# r10d = (count + 3) / 4 as a signed division truncating toward zero:
# a negative numerator gets a further +3 (count + 6) before the
# arithmetic shift.
leal 3(%r10), %ecx
|
||||
addl $6, %r10d
|
||||
testl %ecx, %ecx
|
||||
cmovnsl %ecx, %r10d
|
||||
sarl $2, %r10d
|
||||
movslq %r10d, %rcx
|
||||
# Counter update: low qword += rdx (this atom's neighbor count),
# high qword += (count + 3) / 4.
vmovq %rcx, %xmm0
|
||||
vmovq %rdx, %xmm1
|
||||
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
|
||||
vpaddq %xmm0, %xmm10, %xmm10
|
||||
# Next atom: i++, step rax to the next neighbor row.
incq %r13
|
||||
addq 48(%rsp), %rax # 8-byte Folded Reload
|
||||
cmpq %r15, %r13
|
||||
je .LBB1_7
|
||||
.LBB1_4: #
|
||||
# =>This Loop Header: Depth=1
|
||||
# Child Loop BB1_10 Depth 2
|
||||
# r10 = neighbor count of atom i (sign-extended int).
movq (%rsp), %rcx # 8-byte Reload
|
||||
movslq (%rcx,%r13,4), %r10
|
||||
# r12 = 3*i, rbp = 3*i+1, r11 = 3*i+2 (32-bit results, zero-extended).
leaq (,%r13,2), %rcx
|
||||
addq %r13, %rcx
|
||||
leal 1(%rcx), %ebp
|
||||
leal 2(%rcx), %r11d
|
||||
movl %ecx, %r12d
|
||||
# No neighbors: take the zero-accumulator path.
testq %r10, %r10
|
||||
jle .LBB1_5
|
||||
# %bb.9: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Load atom i's three coordinates into xmm9 / xmm4 / xmm1.
vmovsd (%rsi,%r12,8), %xmm9 # xmm9 = mem[0],zero
|
||||
# Save rbp (3*i+1): block %bb.12 in the inner loop overwrites it.
movq %rbp, 8(%rsp) # 8-byte Spill
|
||||
vmovsd (%rsi,%rbp,8), %xmm4 # xmm4 = mem[0],zero
|
||||
vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero
|
||||
# rdx = inner-loop trip count; rbx = k = 0; per-atom sums cleared.
movl %r10d, %edx
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
xorl %ebx, %ebx
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm13, %xmm13, %xmm13
|
||||
jmp .LBB1_10
|
||||
.p2align 4, 0x90
|
||||
.LBB1_13: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# Inner-loop latch: k++, exit once all neighbors of atom i are processed.
incq %rbx
|
||||
cmpq %rbx, %rdx
|
||||
je .LBB1_14
|
||||
.LBB1_10: #
|
||||
# Parent Loop BB1_4 Depth=1
|
||||
# => This Inner Loop Header: Depth=2
|
||||
# r9 = neighbor index j, r8 = 3*j.
movslq (%rax,%rbx,4), %r9
|
||||
leaq (%r9,%r9,2), %r8
|
||||
# Component differences (atom i minus atom j): xmm2, xmm5, xmm0.
vsubsd (%rsi,%r8,8), %xmm9, %xmm2
|
||||
movslq %r8d, %rcx
|
||||
vsubsd 8(%rsi,%rcx,8), %xmm4, %xmm5
|
||||
vsubsd 16(%rsi,%rcx,8), %xmm1, %xmm0
|
||||
# xmm6 = squared distance.
vmulsd %xmm2, %xmm2, %xmm6
|
||||
vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
|
||||
vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
|
||||
# Skip the pair when squared distance >= squared cutoff.
vucomisd %xmm12, %xmm6
|
||||
jae .LBB1_13
|
||||
# %bb.11: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# With sr2 = 1/rsq and sr6 = sr2^3 * 16(%rsp):
#   xmm3 = xmm11 * sr2 * sr6 * (sr6 - 0.5)    (xmm11 = 48 * arg0+40 value)
vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero
|
||||
vdivsd %xmm6, %xmm3, %xmm6
|
||||
vmulsd 16(%rsp), %xmm6, %xmm7 # 8-byte Folded Reload
|
||||
vmulsd %xmm6, %xmm6, %xmm8
|
||||
vmulsd %xmm7, %xmm8, %xmm7
|
||||
vaddsd .LCPI1_2(%rip), %xmm7, %xmm3
|
||||
vmulsd %xmm6, %xmm11, %xmm6
|
||||
vmulsd %xmm7, %xmm6, %xmm6
|
||||
vmulsd %xmm3, %xmm6, %xmm3
|
||||
# Scale each difference by xmm3 and add to atom i's running sums; the three
# products stay live in xmm6 / xmm2 / xmm0 for the update of atom j below.
vmulsd %xmm2, %xmm3, %xmm6
|
||||
vaddsd %xmm6, %xmm14, %xmm14
|
||||
vmulsd %xmm5, %xmm3, %xmm2
|
||||
vaddsd %xmm2, %xmm15, %xmm15
|
||||
vmulsd %xmm0, %xmm3, %xmm0
|
||||
vaddsd %xmm0, %xmm13, %xmm13
|
||||
# Only update atom j's entry when j < atom count (signed compare).
cmpl %r15d, %r9d
|
||||
jge .LBB1_13
|
||||
# %bb.12: #
|
||||
# in Loop: Header=BB1_10 Depth=2
|
||||
# Subtract the same three products from entries 3*j, 3*j+1, 3*j+2.
# Clobbers rbp, which .LBB1_14 restores from 8(%rsp).
leaq 1(%rcx), %rbp
|
||||
addq $2, %rcx
|
||||
vmovsd (%rdi,%r8,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm6, %xmm3, %xmm3
|
||||
vmovsd %xmm3, (%rdi,%r8,8)
|
||||
vmovsd (%rdi,%rbp,8), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd %xmm2, %xmm3, %xmm2
|
||||
vmovsd %xmm2, (%rdi,%rbp,8)
|
||||
vmovsd (%rdi,%rcx,8), %xmm2 # xmm2 = mem[0],zero
|
||||
vsubsd %xmm0, %xmm2, %xmm0
|
||||
vmovsd %xmm0, (%rdi,%rcx,8)
|
||||
jmp .LBB1_13
|
||||
.p2align 4, 0x90
|
||||
.LBB1_5: #
|
||||
# in Loop: Header=BB1_4 Depth=1
|
||||
# Atom with count <= 0: zero sums and set rdx = count so the shared tail at
# .LBB1_6 can run unchanged (rbp was not clobbered on this path).
vxorpd %xmm13, %xmm13, %xmm13
|
||||
movq %r10, %rdx
|
||||
vxorpd %xmm15, %xmm15, %xmm15
|
||||
vxorpd %xmm14, %xmm14, %xmm14
|
||||
jmp .LBB1_6
|
||||
.LBB1_7: #
|
||||
# Write the two updated 64-bit counters back through the arg3 pointer.
movq 24(%rsp), %rax # 8-byte Reload
|
||||
vmovdqu %xmm10, (%rax)
|
||||
.LBB1_8: #
|
||||
# Close the marker region and take the end timestamp.
movl $.L.str.1, %edi
|
||||
callq likwid_markerStopRegion
|
||||
xorl %eax, %eax
|
||||
callq getTimeStamp
|
||||
# xmm3 = double at arg0+264; xmm2 = elapsed = end - start.
movq 40(%rsp), %rax # 8-byte Reload
|
||||
vmovsd 264(%rax), %xmm3 # xmm3 = mem[0],zero
|
||||
vsubsd 32(%rsp), %xmm0, %xmm2 # 8-byte Folded Reload
|
||||
# xmm0 = xmm3 * 1e9 * elapsed.
vmulsd .LCPI1_3(%rip), %xmm3, %xmm0
|
||||
vmulsd %xmm2, %xmm0, %xmm0
|
||||
vmovapd %xmm2, %xmm1
|
||||
# Keep elapsed across printf; it is the function's return value.
vmovsd %xmm2, 16(%rsp) # 8-byte Spill
|
||||
# Convert the neighbor total as unsigned: the 32-bit move zero-extends into
# rax, then a 64-bit signed convert is used. xmm12 is zeroed first to break
# the false dependency on the convert's merge source.
movl %r14d, %eax
|
||||
vxorps %xmm12, %xmm12, %xmm12
|
||||
vcvtsi2sd %rax, %xmm12, %xmm2
|
||||
# xmm2 = (xmm3 * 1e9 * elapsed) / total (no guard against total == 0).
vdivsd %xmm2, %xmm0, %xmm2
|
||||
# printf(.L.str.2, total, xmm3, elapsed, xmm2); al = 3 vector registers used,
# as the ABI requires for variadic calls.
movl $.L.str.2, %edi
|
||||
movl %r14d, %esi
|
||||
vmovapd %xmm3, %xmm0
|
||||
movb $3, %al
|
||||
callq printf
|
||||
# Return the elapsed time in xmm0.
vmovsd 16(%rsp), %xmm0 # 8-byte Reload
|
||||
# xmm0 = mem[0],zero
|
||||
# Epilogue: release the frame and restore callee-saved registers.
addq $56, %rsp
|
||||
.cfi_def_cfa_offset 56
|
||||
popq %rbx
|
||||
.cfi_def_cfa_offset 48
|
||||
popq %r12
|
||||
.cfi_def_cfa_offset 40
|
||||
popq %r13
|
||||
.cfi_def_cfa_offset 32
|
||||
popq %r14
|
||||
.cfi_def_cfa_offset 24
|
||||
popq %r15
|
||||
.cfi_def_cfa_offset 16
|
||||
popq %rbp
|
||||
.cfi_def_cfa_offset 8
|
||||
retq
|
||||
.Lfunc_end1:
|
||||
.size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
|
||||
.cfi_endproc
|
||||
# -- End function
|
||||
#-----------------------------------------------------------------------
# computeForceLJFullNeigh_simd -- placeholder for a SIMD kernel.
#
# ABI:   System V AMD64 (AT&T syntax).
# In:    %rsi = second argument: struct with an int count at +4 and a
#               pointer at +64 to an array of 3 doubles per entry.
#        %rdi (first argument) is overwritten without being read.
# Does:  zeroes count * 24 bytes at the +64 pointer (when count > 0), calls
#        getTimeStamp (result discarded), starts the marker region named by
#        .L.str, writes the 65-byte message .L.str.3 to stderr, and calls
#        exit(-1). There is no return path.
#-----------------------------------------------------------------------
        .globl  computeForceLJFullNeigh_simd    # -- Begin function computeForceLJFullNeigh_simd
        .p2align 4, 0x90
        .type   computeForceLJFullNeigh_simd,@function
computeForceLJFullNeigh_simd:
.LcomputeForceLJFullNeigh_simd$local:
        .cfi_startproc
        # Dummy push: moves %rsp by 8 so it is 16-byte aligned at each call.
        pushq   %rax
        .cfi_def_cfa_offset 16

        # eax = count; the 32-bit load zero-extends into rax.
        movl    4(%rsi), %eax
        testl   %eax, %eax
        jle     .Lsimd_array_cleared            # nothing to clear

        # _intel_fast_memset(ptr at +64, 0, count * 8 * 3)
        shlq    $3, %rax                        # rax = count * 8
        movq    64(%rsi), %rdi                  # dst (read before rsi is zeroed)
        xorl    %esi, %esi                      # fill value 0
        leaq    (%rax,%rax,2), %rdx             # rdx = count * 24 bytes
        callq   _intel_fast_memset

.Lsimd_array_cleared:
        xorl    %eax, %eax
        callq   getTimeStamp

        movl    $.L.str, %edi
        callq   likwid_markerStartRegion

        # fwrite(.L.str.3, 65, 1, stderr)
        movl    $.L.str.3, %edi
        movl    $65, %esi
        movl    $1, %edx
        movq    stderr(%rip), %rcx
        callq   fwrite

        # exit(-1)
        movl    $-1, %edi
        callq   exit
.Lfunc_end2:
        .size   computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
        .cfi_endproc
        # -- End function
|
||||
# NUL-terminated string literals in a mergeable read-only string section
# (flags "aMS", entity size 1). Each .size is the literal's length including
# the terminating NUL.
.type .L.str,@object #
|
||||
.section .rodata.str1.1,"aMS",@progbits,1
|
||||
# Marker region name passed to likwid_markerStartRegion by
# computeForceLJFullNeigh_simd.
.L.str:
|
||||
.asciz "force"
|
||||
.size .L.str, 6
|
||||
.type .L.str.1,@object #
|
||||
# Marker region name used by computeForceLJHalfNeigh (start and stop).
.L.str.1:
|
||||
.asciz "forceLJ-halfneigh"
|
||||
.size .L.str.1, 18
|
||||
.type .L.str.2,@object #
|
||||
# printf format used by computeForceLJHalfNeigh: one unsigned int followed by
# three doubles.
# NOTE(review): the declared size (52) is 2 bytes larger than the literal as
# shown here (49 characters + NUL = 50). Whitespace runs inside the string
# were probably collapsed in this rendering -- verify against the original
# file before relying on the exact text.
.L.str.2:
|
||||
.asciz "Its: %u Freq: %f Time: %f\nCy/it: %f (half-neigh)\n"
|
||||
.size .L.str.2, 52
|
||||
.type .L.str.3,@object #
|
||||
# Error message written to stderr by computeForceLJFullNeigh_simd; the fwrite
# call passes 65 as the size, i.e. the text without the NUL.
.L.str.3:
|
||||
.asciz "Error: SIMD kernel not implemented for specified instruction set!"
|
||||
.size .L.str.3, 66
|
||||
# Compiler identification string recorded in the object file.
.ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
|
||||
# Empty .note.GNU-stack section (no "x" flag): marks the object as not
# requiring an executable stack.
.section ".note.GNU-stack","",@progbits
|
||||
Reference in New Issue
Block a user