From cb5598bc9132c8127fd368643d1dc77fd1162dd2 Mon Sep 17 00:00:00 2001 From: JanLJL Date: Thu, 9 Feb 2023 17:33:22 +0100 Subject: [PATCH] added static analysis and likwid files --- likwid-outputs/csx-lammps-dp-mem_dp-stub.out | 88 + likwid-outputs/csx-lammps-dp-mem_dp.out | 168 ++ likwid-outputs/csx-lammps-sp-mem_sp-stub.out | 88 + likwid-outputs/csx-lammps-sp-mem_sp.out | 168 ++ static_analysis/jan/icx-icc-lammps-avx2.s | 1474 ++++++++++ static_analysis/jan/icx-icc-lammps-avx512.s | 1659 +++++++++++ static_analysis/jan/icx-icc-lammps-novec.s | 1310 +++++++++ static_analysis/jan/icx-icc-lammps-sse.s | 1522 ++++++++++ .../jan/icx-icx-gromacs-avx512-sp.s | 2103 ++++++++++++++ static_analysis/jan/icx-icx-gromacs-avx512.s | 2453 +++++++++++++++++ static_analysis/jan/zen-icx-lammps-avx2.s | 676 +++++ 11 files changed, 11709 insertions(+) create mode 100644 likwid-outputs/csx-lammps-dp-mem_dp-stub.out create mode 100644 likwid-outputs/csx-lammps-dp-mem_dp.out create mode 100644 likwid-outputs/csx-lammps-sp-mem_sp-stub.out create mode 100644 likwid-outputs/csx-lammps-sp-mem_sp.out create mode 100644 static_analysis/jan/icx-icc-lammps-avx2.s create mode 100644 static_analysis/jan/icx-icc-lammps-avx512.s create mode 100644 static_analysis/jan/icx-icc-lammps-novec.s create mode 100644 static_analysis/jan/icx-icc-lammps-sse.s create mode 100644 static_analysis/jan/icx-icx-gromacs-avx512-sp.s create mode 100644 static_analysis/jan/icx-icx-gromacs-avx512.s create mode 100644 static_analysis/jan/zen-icx-lammps-avx2.s diff --git a/likwid-outputs/csx-lammps-dp-mem_dp-stub.out b/likwid-outputs/csx-lammps-dp-mem_dp-stub.out new file mode 100644 index 0000000..9cedc76 --- /dev/null +++ b/likwid-outputs/csx-lammps-dp-mem_dp-stub.out @@ -0,0 +1,88 @@ +-------------------------------------------------------------------------------- +CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz +CPU type: Intel Cascadelake SP processor +CPU clock: 2.49 GHz +-------------------------------------------------------------------------------- +Initializing parameters... +Initializing atoms... +Creating atoms... +Pattern: seq +Number of timesteps: 200 +Number of atoms: 256 +Number of neighbors per atom: 1024 +Number of times to replicate neighbor lists: 1 +Estimated total data volume (kB): 1062.9120 +Estimated atom data volume (kB): 6.1440 +Estimated neighborlist data volume (kB): 1050.6240 +Initializing neighbor lists... +Creating neighbor lists... +Computing forces... +Total time: 0.2735, Mega atom updates/s: 0.1872 +Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325 +Statistics: + Vector width: 8, Processor frequency: 2.0000 GHz + Average neighbors per atom: 1018.9055 + Average SIMD iterations per atom: 127.3632 + Total number of computed pair interactions: 52428800 + Total number of SIMD iterations: 6553600 + Useful read data volume for force computation: 1.47GB + Cycles/SIMD iteration: 83.4598 +-------------------------------------------------------------------------------- +Region force, Group 1: MEM_DP ++-------------------+------------+ +| Region Info | HWThread 0 | ++-------------------+------------+ +| RDTSC Runtime [s] | 0.110776 | +| call count | 200 | ++-------------------+------------+ + ++------------------------------------------+---------+------------+ +| Event | Counter | HWThread 0 | ++------------------------------------------+---------+------------+ +| INSTR_RETIRED_ANY | FIXC0 | 267036300 | +| CPU_CLK_UNHALTED_CORE | FIXC1 | 219034500 | +| CPU_CLK_UNHALTED_REF | FIXC2 | 273793400 | +| PWR_PKG_ENERGY | PWR0 | 10.9296 | +| PWR_DRAM_ENERGY | PWR3 | 0 | +| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 | +| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 159400 | +| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 | +| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 197068800 | +| CAS_COUNT_RD | MBOX0C0 | 8643 | +| CAS_COUNT_WR | MBOX0C1 | 1367 | +| CAS_COUNT_RD | MBOX1C0 | 9124 | +| CAS_COUNT_WR | MBOX1C1 | 1354 | +| CAS_COUNT_RD | MBOX2C0 | 9138 | +| CAS_COUNT_WR | MBOX2C1 | 1356 | +| CAS_COUNT_RD | MBOX3C0 | 5586 | +| CAS_COUNT_WR | MBOX3C1 | 1297 | +| CAS_COUNT_RD | MBOX4C0 | 5328 | +| CAS_COUNT_WR | MBOX4C1 | 1269 | +| CAS_COUNT_RD | MBOX5C0 | 5280 | +| CAS_COUNT_WR | MBOX5C1 | 1295 | ++------------------------------------------+---------+------------+ + ++-----------------------------------+------------+ +| Metric | HWThread 0 | ++-----------------------------------+------------+ +| Runtime (RDTSC) [s] | 0.1108 | +| Runtime unhalted [s] | 0.0878 | +| Clock [MHz] | 1995.2564 | +| CPI | 0.8202 | +| Energy [J] | 10.9296 | +| Power [W] | 98.6643 | +| Energy DRAM [J] | 0 | +| Power DRAM [W] | 0 | +| DP [MFLOP/s] | 14233.3287 | +| AVX DP [MFLOP/s] | 14231.8898 | +| Packed [MUOPS/s] | 1778.9862 | +| Scalar [MUOPS/s] | 1.4389 | +| Memory read bandwidth [MBytes/s] | 24.9001 | +| Memory read data volume [GBytes] | 0.0028 | +| Memory write bandwidth [MBytes/s] | 4.5861 | +| Memory write data volume [GBytes] | 0.0005 | +| Memory bandwidth [MBytes/s] | 29.4863 | +| Memory data volume [GBytes] | 0.0033 | +| Operational intensity | 482.7104 | ++-----------------------------------+------------+ + diff --git a/likwid-outputs/csx-lammps-dp-mem_dp.out b/likwid-outputs/csx-lammps-dp-mem_dp.out new file mode 100644 index 0000000..41e8192 --- /dev/null +++ b/likwid-outputs/csx-lammps-dp-mem_dp.out @@ -0,0 +1,168 @@ +-------------------------------------------------------------------------------- +CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz +CPU type: Intel Cascadelake SP processor +CPU clock: 2.49 GHz +-------------------------------------------------------------------------------- +Parameters: + Force field: lj + Kernel: plain-C + Data layout: AoS + Floating-point precision: double + Unit cells (nx, ny, nz): 32, 32, 32 + Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01 + Periodic (x, y, z): 1, 1, 1 + Lattice size: 1.679596e+00 + Epsilon: 1.000000e+00 + Sigma: 1.000000e+00 + Spring constant: 1.000000e+00 + Damping constant: 1.000000e+00 + Temperature: 1.440000e+00 + RHO: 8.442000e-01 + Mass: 1.000000e+00 + Number of types: 4 + Number of timesteps: 200 + Report stats every (timesteps): 100 + Reneighbor every (timesteps): 20 + Prune every (timesteps): 1000 + Output positions every (timesteps): 20 + Output velocities every (timesteps): 5 + Delta time (dt): 5.000000e-03 + Cutoff radius: 2.500000e+00 + Skin: 3.000000e-01 + Half neighbor lists: 0 + Processor frequency (GHz): 2.0000 +---------------------------------------------------------------------------- +step temp pressure +0 1.440000e+00 1.215639e+00 +100 8.200895e-01 6.923143e-01 +200 7.961495e-01 6.721043e-01 +---------------------------------------------------------------------------- +System: 131072 atoms 47265 ghost atoms, Steps: 200 +TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s +---------------------------------------------------------------------------- +Performance: 2.28 million atom updates per second +Statistics: + Vector width: 8, Processor frequency: 2.0000 GHz + Average neighbors per atom: 76.0352 + Average SIMD iterations per atom: 9.9181 + Total number of computed pair interactions: 2003182862 + Total number of SIMD iterations: 261297661 + Useful read data volume for force computation: 57.46GB + Cycles/SIMD iteration: 40.4432 +-------------------------------------------------------------------------------- +Region force, Group 1: MEM_DP ++-------------------+------------+ +| Region Info | HWThread 0 | ++-------------------+------------+ +| RDTSC Runtime [s] | 5.115807 | +| call count | 201 | ++-------------------+------------+ + ++------------------------------------------+---------+-------------+ +| Event | Counter | HWThread 0 | ++------------------------------------------+---------+-------------+ +| INSTR_RETIRED_ANY | FIXC0 | 12592470000 | +| CPU_CLK_UNHALTED_CORE | FIXC1 | 10196910000 | +| CPU_CLK_UNHALTED_REF | FIXC2 | 12746120000 | +| PWR_PKG_ENERGY | PWR0 | 307.9429 | +| PWR_DRAM_ENERGY | PWR3 | 0 | +| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 | +| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 79042240 | +| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 | +| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 8076039000 | +| CAS_COUNT_RD | MBOX0C0 | 22734550 | +| CAS_COUNT_WR | MBOX0C1 | 1147714 | +| CAS_COUNT_RD | MBOX1C0 | 22755180 | +| CAS_COUNT_WR | MBOX1C1 | 1144415 | +| CAS_COUNT_RD | MBOX2C0 | 22762780 | +| CAS_COUNT_WR | MBOX2C1 | 1129051 | +| CAS_COUNT_RD | MBOX3C0 | 22905660 | +| CAS_COUNT_WR | MBOX3C1 | 1143324 | +| CAS_COUNT_RD | MBOX4C0 | 22914860 | +| CAS_COUNT_WR | MBOX4C1 | 1169116 | +| CAS_COUNT_RD | MBOX5C0 | 22890220 | +| CAS_COUNT_WR | MBOX5C1 | 1180739 | ++------------------------------------------+---------+-------------+ + ++-----------------------------------+------------+ +| Metric | HWThread 0 | ++-----------------------------------+------------+ +| Runtime (RDTSC) [s] | 5.1158 | +| Runtime unhalted [s] | 4.0885 | +| Clock [MHz] | 1995.2508 | +| CPI | 0.8098 | +| Energy [J] | 307.9429 | +| Power [W] | 60.1944 | +| Energy DRAM [J] | 0 | +| Power DRAM [W] | 0 | +| DP [MFLOP/s] | 12644.6041 | +| AVX DP [MFLOP/s] | 12629.1535 | +| Packed [MUOPS/s] | 1578.6442 | +| Scalar [MUOPS/s] | 15.4506 | +| Memory read bandwidth [MBytes/s] | 1713.4438 | +| Memory read data volume [GBytes] | 8.7656 | +| Memory write bandwidth [MBytes/s] | 86.5003 | +| Memory write data volume [GBytes] | 0.4425 | +| Memory bandwidth [MBytes/s] | 1799.9442 | +| Memory data volume [GBytes] | 9.2082 | +| Operational intensity | 7.0250 | ++-----------------------------------+------------+ + +Region reneighbour, Group 1: MEM_DP ++-------------------+------------+ +| Region Info | HWThread 0 | ++-------------------+------------+ +| RDTSC Runtime [s] | 5.897385 | +| call count | 10 | ++-------------------+------------+ + ++------------------------------------------+---------+-------------+ +| Event | Counter | HWThread 0 | ++------------------------------------------+---------+-------------+ +| INSTR_RETIRED_ANY | FIXC0 | 18212540000 | +| CPU_CLK_UNHALTED_CORE | FIXC1 | 11728500000 | +| CPU_CLK_UNHALTED_REF | FIXC2 | 14660630000 | +| PWR_PKG_ENERGY | PWR0 | 338.9000 | +| PWR_DRAM_ENERGY | PWR3 | 0 | +| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE | PMC0 | 0 | +| FP_ARITH_INST_RETIRED_SCALAR_DOUBLE | PMC1 | 6240402000 | +| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE | PMC2 | 0 | +| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE | PMC3 | 983040 | +| CAS_COUNT_RD | MBOX0C0 | 2086787 | +| CAS_COUNT_WR | MBOX0C1 | 1115626 | +| CAS_COUNT_RD | MBOX1C0 | 2089964 | +| CAS_COUNT_WR | MBOX1C1 | 1117021 | +| CAS_COUNT_RD | MBOX2C0 | 2103832 | +| CAS_COUNT_WR | MBOX2C1 | 1117965 | +| CAS_COUNT_RD | MBOX3C0 | 2086930 | +| CAS_COUNT_WR | MBOX3C1 | 1102471 | +| CAS_COUNT_RD | MBOX4C0 | 2094688 | +| CAS_COUNT_WR | MBOX4C1 | 1103018 | +| CAS_COUNT_RD | MBOX5C0 | 2097438 | +| CAS_COUNT_WR | MBOX5C1 | 1102525 | ++------------------------------------------+---------+-------------+ + ++-----------------------------------+------------+ +| Metric | HWThread 0 | ++-----------------------------------+------------+ +| Runtime (RDTSC) [s] | 5.8974 | +| Runtime unhalted [s] | 4.7026 | +| Clock [MHz] | 1995.2473 | +| CPI | 0.6440 | +| Energy [J] | 338.9000 | +| Power [W] | 57.4661 | +| Energy DRAM [J] | 0 | +| Power DRAM [W] | 0 | +| DP [MFLOP/s] | 1059.4978 | +| AVX DP [MFLOP/s] | 1.3335 | +| Packed [MUOPS/s] | 0.1667 | +| Scalar [MUOPS/s] | 1058.1643 | +| Memory read bandwidth [MBytes/s] | 136.3006 | +| Memory read data volume [GBytes] | 0.8038 | +| Memory write bandwidth [MBytes/s] | 72.2612 | +| Memory write data volume [GBytes] | 0.4262 | +| Memory bandwidth [MBytes/s] | 208.5618 | +| Memory data volume [GBytes] | 1.2300 | +| Operational intensity | 5.0800 | ++-----------------------------------+------------+ + diff --git a/likwid-outputs/csx-lammps-sp-mem_sp-stub.out b/likwid-outputs/csx-lammps-sp-mem_sp-stub.out new file mode 100644 index 0000000..d76ea47 --- /dev/null +++ b/likwid-outputs/csx-lammps-sp-mem_sp-stub.out @@ -0,0 +1,88 @@ +-------------------------------------------------------------------------------- +CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz +CPU type: Intel Cascadelake SP processor +CPU clock: 2.49 GHz +-------------------------------------------------------------------------------- +Initializing parameters... +Initializing atoms... +Creating atoms... +Pattern: seq +Number of timesteps: 200 +Number of atoms: 256 +Number of neighbors per atom: 1024 +Number of times to replicate neighbor lists: 1 +Estimated total data volume (kB): 1056.7680 +Estimated atom data volume (kB): 3.0720 +Estimated neighborlist data volume (kB): 1050.6240 +Initializing neighbor lists... +Creating neighbor lists... +Computing forces... +Total time: 0.2466, Mega atom updates/s: 0.2076 +Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062 +Statistics: + Vector width: 16, Processor frequency: 2.0000 GHz + Average neighbors per atom: 1018.9055 + Average SIMD iterations per atom: 63.6816 + Total number of computed pair interactions: 52428800 + Total number of SIMD iterations: 3276800 + Useful read data volume for force computation: 0.84GB + Cycles/SIMD iteration: 150.4999 +-------------------------------------------------------------------------------- +Region force, Group 1: MEM_SP ++-------------------+------------+ +| Region Info | HWThread 0 | ++-------------------+------------+ +| RDTSC Runtime [s] | 0.085843 | +| call count | 200 | ++-------------------+------------+ + ++------------------------------------------+---------+------------+ +| Event | Counter | HWThread 0 | ++------------------------------------------+---------+------------+ +| INSTR_RETIRED_ANY | FIXC0 | 129769100 | +| CPU_CLK_UNHALTED_CORE | FIXC1 | 172300100 | +| CPU_CLK_UNHALTED_REF | FIXC2 | 215371300 | +| PWR_PKG_ENERGY | PWR0 | 9.2849 | +| PWR_DRAM_ENERGY | PWR3 | 0 | +| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 | +| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 154000 | +| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 | +| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 89088000 | +| CAS_COUNT_RD | MBOX0C0 | 8354 | +| CAS_COUNT_WR | MBOX0C1 | 1126 | +| CAS_COUNT_RD | MBOX1C0 | 7863 | +| CAS_COUNT_WR | MBOX1C1 | 1105 | +| CAS_COUNT_RD | MBOX2C0 | 7990 | +| CAS_COUNT_WR | MBOX2C1 | 1113 | +| CAS_COUNT_RD | MBOX3C0 | 4775 | +| CAS_COUNT_WR | MBOX3C1 | 1112 | +| CAS_COUNT_RD | MBOX4C0 | 4201 | +| CAS_COUNT_WR | MBOX4C1 | 1127 | +| CAS_COUNT_RD | MBOX5C0 | 4035 | +| CAS_COUNT_WR | MBOX5C1 | 1120 | ++------------------------------------------+---------+------------+ + ++-----------------------------------+------------+ +| Metric | HWThread 0 | ++-----------------------------------+------------+ +| Runtime (RDTSC) [s] | 0.0858 | +| Runtime unhalted [s] | 0.0691 | +| Clock [MHz] | 1995.2787 | +| CPI | 1.3277 | +| Energy [J] | 9.2849 | +| Power [W] | 108.1610 | +| Energy DRAM [J] | 0 | +| Power DRAM [W] | 0 | +| SP [MFLOP/s] | 16606.5397 | +| AVX SP [MFLOP/s] | 16604.7458 | +| Packed [MUOPS/s] | 1037.7966 | +| Scalar [MUOPS/s] | 1.7940 | +| Memory read bandwidth [MBytes/s] | 27.7476 | +| Memory read data volume [GBytes] | 0.0024 | +| Memory write bandwidth [MBytes/s] | 4.9974 | +| Memory write data volume [GBytes] | 0.0004 | +| Memory bandwidth [MBytes/s] | 32.7450 | +| Memory data volume [GBytes] | 0.0028 | +| Operational intensity | 507.1471 | ++-----------------------------------+------------+ + diff --git a/likwid-outputs/csx-lammps-sp-mem_sp.out b/likwid-outputs/csx-lammps-sp-mem_sp.out new file mode 100644 index 0000000..1e5a76b --- /dev/null +++ b/likwid-outputs/csx-lammps-sp-mem_sp.out @@ -0,0 +1,168 @@ +-------------------------------------------------------------------------------- +CPU name: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz +CPU type: Intel Cascadelake SP processor +CPU clock: 2.49 GHz +-------------------------------------------------------------------------------- +Parameters: + Force field: lj + Kernel: plain-C + Data layout: AoS + Floating-point precision: single + Unit cells (nx, ny, nz): 32, 32, 32 + Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01 + Periodic (x, y, z): 1, 1, 1 + Lattice size: 1.679596e+00 + Epsilon: 1.000000e+00 + Sigma: 1.000000e+00 + Spring constant: 1.000000e+00 + Damping constant: 1.000000e+00 + Temperature: 1.440000e+00 + RHO: 8.442000e-01 + Mass: 1.000000e+00 + Number of types: 4 + Number of timesteps: 200 + Report stats every (timesteps): 100 + Reneighbor every (timesteps): 20 + Prune every (timesteps): 1000 + Output positions every (timesteps): 20 + Output velocities every (timesteps): 5 + Delta time (dt): 5.000000e-03 + Cutoff radius: 2.500000e+00 + Skin: 3.000000e-01 + Half neighbor lists: 0 + Processor frequency (GHz): 2.0000 +---------------------------------------------------------------------------- +step temp pressure +0 1.440000e+00 1.215639e+00 +100 8.200897e-01 6.923144e-01 +200 7.961481e-01 6.721031e-01 +---------------------------------------------------------------------------- +System: 131072 atoms 47265 ghost atoms, Steps: 200 +TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s +---------------------------------------------------------------------------- +Performance: 2.42 million atom updates per second +Statistics: + Vector width: 16, Processor frequency: 2.0000 GHz + Average neighbors per atom: 76.0351 + Average SIMD iterations per atom: 5.0875 + Total number of computed pair interactions: 2003181259 + Total number of SIMD iterations: 134032075 + Useful read data volume for force computation: 32.79GB + Cycles/SIMD iteration: 68.9511 +-------------------------------------------------------------------------------- +Region force, Group 1: MEM_SP ++-------------------+------------+ +| Region Info | HWThread 0 | ++-------------------+------------+ +| RDTSC Runtime [s] | 4.452877 | +| call count | 201 | ++-------------------+------------+ + ++------------------------------------------+---------+-------------+ +| Event | Counter | HWThread 0 | ++------------------------------------------+---------+-------------+ +| INSTR_RETIRED_ANY | FIXC0 | 7428719000 | +| CPU_CLK_UNHALTED_CORE | FIXC1 | 8875251000 | +| CPU_CLK_UNHALTED_REF | FIXC2 | 11094050000 | +| PWR_PKG_ENERGY | PWR0 | 265.5057 | +| PWR_DRAM_ENERGY | PWR3 | 0 | +| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 | +| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 79036820 | +| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 | +| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 3935012000 | +| CAS_COUNT_RD | MBOX0C0 | 19716700 | +| CAS_COUNT_WR | MBOX0C1 | 595747 | +| CAS_COUNT_RD | MBOX1C0 | 19734880 | +| CAS_COUNT_WR | MBOX1C1 | 597090 | +| CAS_COUNT_RD | MBOX2C0 | 19732800 | +| CAS_COUNT_WR | MBOX2C1 | 595219 | +| CAS_COUNT_RD | MBOX3C0 | 19886430 | +| CAS_COUNT_WR | MBOX3C1 | 632443 | +| CAS_COUNT_RD | MBOX4C0 | 19887210 | +| CAS_COUNT_WR | MBOX4C1 | 633169 | +| CAS_COUNT_RD | MBOX5C0 | 19935560 | +| CAS_COUNT_WR | MBOX5C1 | 634112 | ++------------------------------------------+---------+-------------+ + ++-----------------------------------+------------+ +| Metric | HWThread 0 | ++-----------------------------------+------------+ +| Runtime (RDTSC) [s] | 4.4529 | +| Runtime unhalted [s] | 3.5585 | +| Clock [MHz] | 1995.2693 | +| CPI | 1.1947 | +| Energy [J] | 265.5057 | +| Power [W] | 59.6257 | +| Energy DRAM [J] | 0 | +| Power DRAM [W] | 0 | +| SP [MFLOP/s] | 14156.9661 | +| AVX SP [MFLOP/s] | 14139.2165 | +| Packed [MUOPS/s] | 883.7010 | +| Scalar [MUOPS/s] | 17.7496 | +| Memory read bandwidth [MBytes/s] | 1708.8254 | +| Memory read data volume [GBytes] | 7.6092 | +| Memory write bandwidth [MBytes/s] | 53.0035 | +| Memory write data volume [GBytes] | 0.2360 | +| Memory bandwidth [MBytes/s] | 1761.8288 | +| Memory data volume [GBytes] | 7.8452 | +| Operational intensity | 8.0354 | ++-----------------------------------+------------+ + +Region reneighbour, Group 1: MEM_SP ++-------------------+------------+ +| Region Info | HWThread 0 | ++-------------------+------------+ +| RDTSC Runtime [s] | 5.935627 | +| call count | 10 | ++-------------------+------------+ + ++------------------------------------------+---------+-------------+ +| Event | Counter | HWThread 0 | ++------------------------------------------+---------+-------------+ +| INSTR_RETIRED_ANY | FIXC0 | 18208530000 | +| CPU_CLK_UNHALTED_CORE | FIXC1 | 11805500000 | +| CPU_CLK_UNHALTED_REF | FIXC2 | 14756870000 | +| PWR_PKG_ENERGY | PWR0 | 340.7903 | +| PWR_DRAM_ENERGY | PWR3 | 0 | +| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE | PMC0 | 0 | +| FP_ARITH_INST_RETIRED_SCALAR_SINGLE | PMC1 | 6240406000 | +| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE | PMC2 | 0 | +| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE | PMC3 | 491520 | +| CAS_COUNT_RD | MBOX0C0 | 1772377 | +| CAS_COUNT_WR | MBOX0C1 | 975760 | +| CAS_COUNT_RD | MBOX1C0 | 1770611 | +| CAS_COUNT_WR | MBOX1C1 | 977433 | +| CAS_COUNT_RD | MBOX2C0 | 1771722 | +| CAS_COUNT_WR | MBOX2C1 | 979122 | +| CAS_COUNT_RD | MBOX3C0 | 1782901 | +| CAS_COUNT_WR | MBOX3C1 | 967621 | +| CAS_COUNT_RD | MBOX4C0 | 1780789 | +| CAS_COUNT_WR | MBOX4C1 | 967179 | +| CAS_COUNT_RD | MBOX5C0 | 1784733 | +| CAS_COUNT_WR | MBOX5C1 | 969349 | ++------------------------------------------+---------+-------------+ + ++-----------------------------------+------------+ +| Metric | HWThread 0 | ++-----------------------------------+------------+ +| Runtime (RDTSC) [s] | 5.9356 | +| Runtime unhalted [s] | 4.7334 | +| Clock [MHz] | 1995.2675 | +| CPI | 0.6483 | +| Energy [J] | 340.7903 | +| Power [W] | 57.4144 | +| Energy DRAM [J] | 0 | +| Power DRAM [W] | 0 | +| SP [MFLOP/s] | 1052.6723 | +| AVX SP [MFLOP/s] | 1.3249 | +| Packed [MUOPS/s] | 0.0828 | +| Scalar [MUOPS/s] | 1051.3474 | +| Memory read bandwidth [MBytes/s] | 114.9736 | +| Memory read data volume [GBytes] | 0.6824 | +| Memory write bandwidth [MBytes/s] | 62.9308 | +| Memory write data volume [GBytes] | 0.3735 | +| Memory bandwidth [MBytes/s] | 177.9044 | +| Memory data volume [GBytes] | 1.0560 | +| Operational intensity | 5.9171 | ++-----------------------------------+------------+ + diff --git a/static_analysis/jan/icx-icc-lammps-avx2.s b/static_analysis/jan/icx-icc-lammps-avx2.s new file mode 100644 index 0000000..8f54169 --- /dev/null +++ b/static_analysis/jan/icx-icc-lammps-avx2.s @@ -0,0 +1,1474 @@ +# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; +# mark_description "0226_000000"; +# mark_description "-I/apps/likwid/5.2.2/include -I././lammps/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GNU"; +# mark_description "_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=4 -D__ISA_AVX2__ -DENABLE_OMP_SI"; +# mark_description "MD -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX2 -o build-lammps-ICC-AVX2-DP/force_lj.s"; + .file "force_lj.c" + .text +..TXTST0: +.L_2__routine_start_computeForceLJFullNeigh_plain_c_0: +# -- Begin computeForceLJFullNeigh_plain_c + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_plain_c +# --- computeForceLJFullNeigh_plain_c(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_plain_c: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_plain_c.1: +..L2: + #23.104 + pushq %rbp #23.104 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #23.104 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-32, %rsp #23.104 + pushq %r13 #23.104 + pushq %r14 #23.104 + pushq %r15 #23.104 + pushq %rbx #23.104 + subq $224, %rsp #23.104 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + movq %rsi, %r15 #23.104 + vmovsd 144(%rdi), %xmm0 #27.27 + movq %rcx, %r13 #23.104 + vmulsd %xmm0, %xmm0, %xmm1 #27.45 + movq %rdx, %r14 #23.104 + vmovsd 56(%rdi), %xmm2 #28.23 + vmovsd 40(%rdi), %xmm3 #29.24 + movl 4(%r15), %eax #24.18 + vmovsd %xmm1, 128(%rsp) #27.45[spill] + vmovsd %xmm2, 136(%rsp) #28.23[spill] + vmovsd %xmm3, 24(%rsp) #29.24[spill] + testl %eax, %eax #32.24 + jle ..B1.34 # Prob 50% #32.24 + # LOE r12 r13 r14 r15 eax +..B1.2: # Preds ..B1.1 + # Execution count [5.00e-03] + movslq %eax, %rbx #24.18 + lea (%rax,%rax,2), %eax #24.18 + movq 64(%r15), %rdi #33.9 + cmpl $12, %eax #32.5 + jle ..B1.43 # Prob 0% #32.5 + # LOE rbx rdi r12 r13 r14 r15 +..B1.3: # Preds ..B1.2 + # Execution count [1.00e+00] + xorl %esi, %esi #32.5 + lea (%rbx,%rbx,2), %rdx #32.5 + shlq $3, %rdx #32.5 + call __intel_avx_rep_memset #32.5 + # LOE rbx r12 r13 r14 r15 +..B1.5: # Preds ..B1.49 ..B1.3 ..B1.47 + # Execution count [1.00e+00] + xorl %eax, %eax #38.16 + vzeroupper #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.13: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.14: + # LOE rbx r12 r13 r14 r15 xmm0 +..B1.54: # Preds ..B1.5 + # Execution count [1.00e+00] + vmovsd %xmm0, 16(%rsp) #38.16[spill] + # LOE rbx r12 r13 r14 r15 +..B1.6: # Preds ..B1.54 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.16: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.17: + # LOE rbx r12 r13 r14 r15 +..B1.7: # Preds ..B1.6 + # Execution count [9.00e-01] + vmovsd 24(%rsp), %xmm0 #77.41[spill] + xorl %eax, %eax #41.15 + vmulsd .L_2il0floatpacket.0(%rip), %xmm0, %xmm4 #77.41 + xorl %ecx, %ecx #41.5 + vbroadcastsd 128(%rsp), %ymm6 #27.25[spill] + vbroadcastsd %xmm4, %ymm7 #77.41 + vbroadcastsd 136(%rsp), %ymm2 #28.21[spill] + vmovsd .L_2il0floatpacket.4(%rip), %xmm5 #75.32 + vmovsd .L_2il0floatpacket.1(%rip), %xmm0 #77.54 + vmovupd %ymm6, 32(%rsp) #41.5[spill] + vmovupd %ymm7, 64(%rsp) #41.5[spill] + vmovsd 136(%rsp), %xmm6 #41.5[spill] + vmovsd 128(%rsp), %xmm7 #41.5[spill] + vmovupd %ymm2, 96(%rsp) #41.5[spill] + movslq 8(%r14), %rsi #42.43 + xorl %edi, %edi #41.5 + movq 16(%r14), %rdx #42.19 + shlq $2, %rsi #25.5 + movq 24(%r14), %r14 #43.25 + movq 16(%r15), %r11 #44.25 + movq 64(%r15), %r8 #89.9 + movq (%r13), %r9 #93.9 + movq 8(%r13), %r10 #94.9 + movq %rsi, 144(%rsp) #41.5[spill] + movq %rdx, 152(%rsp) #41.5[spill] + movq %rbx, 208(%rsp) #41.5[spill] + movq %r13, (%rsp) #41.5[spill] + movq %r12, 8(%rsp) #41.5[spill] + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22 + # LOE rax rcx rdi r8 r9 r10 r11 r14 xmm0 xmm4 xmm5 xmm6 xmm7 +..B1.8: # Preds ..B1.32 ..B1.7 + # Execution count [5.00e+00] + movl (%r14,%rcx,4), %r13d #43.25 + testl %r13d, %r13d #56.28 + vxorpd %xmm8, %xmm8, %xmm8 #47.22 + vmovapd %xmm8, %xmm9 #48.22 + vmovsd (%rdi,%r11), %xmm3 #44.25 + vmovapd %xmm9, %xmm10 #49.22 + vmovsd 8(%rdi,%r11), %xmm2 #45.25 + vmovsd 16(%rdi,%r11), %xmm1 #46.25 + movslq %r13d, %r12 #56.9 + jle ..B1.32 # Prob 50% #56.28 + # LOE rax rcx rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.9: # Preds ..B1.8 + # Execution count [4.50e+00] + cmpq $4, %r12 #56.9 + jl ..B1.39 # Prob 10% #56.9 + # LOE rax rcx rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.10: # Preds ..B1.9 + # Execution count [4.50e+00] + movq 144(%rsp), %rbx #42.43[spill] + imulq %rax, %rbx #42.43 + addq 152(%rsp), %rbx #25.5[spill] + cmpq $600, %r12 #56.9 + jl ..B1.41 # Prob 10% #56.9 + # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.11: # Preds ..B1.10 + # Execution count [4.50e+00] + movq %rbx, %r15 #56.9 + andq $31, %r15 #56.9 + testl %r15d, %r15d #56.9 + je ..B1.14 # Prob 50% #56.9 + # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.12: # Preds ..B1.11 + # Execution count [4.50e+00] + testl $3, %r15d #56.9 + jne ..B1.39 # Prob 10% #56.9 + # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.13: # Preds ..B1.12 + # Execution count [2.25e+00] + negl %r15d #56.9 + addl $32, %r15d #56.9 + shrl $2, %r15d #56.9 + # LOE rax rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.14: # Preds ..B1.13 ..B1.11 + # Execution count [4.50e+00] + movl %r15d, %edx #56.9 + lea 4(%rdx), %rsi #56.9 + cmpq %rsi, %r12 #56.9 + jl ..B1.39 # Prob 10% #56.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12 r14 r13d r15d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.15: # Preds ..B1.14 + # Execution count [5.00e+00] + movl %r13d, %esi #56.9 + subl %r15d, %esi #56.9 + andl $3, %esi #56.9 + negl %esi #56.9 + addl %r13d, %esi #56.9 + movslq %esi, %rsi #56.9 + testl %r15d, %r15d #56.9 + movl $0, %r15d #56.9 + jbe ..B1.21 # Prob 10% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.16: # Preds ..B1.15 + # Execution count [4.50e+00] + movq %rcx, 24(%rsp) #[spill] + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.17: # Preds ..B1.19 ..B1.16 + # Execution count [2.50e+01] + movl (%rbx,%r15,4), %ecx #57.21 + lea (%rcx,%rcx,2), %ecx #58.36 + movslq %ecx, %rcx #58.36 + vsubsd 8(%r11,%rcx,8), %xmm2, %xmm13 #59.36 + vsubsd (%r11,%rcx,8), %xmm3, %xmm12 #58.36 + vsubsd 16(%r11,%rcx,8), %xmm1, %xmm11 #60.36 + vmulsd %xmm13, %xmm13, %xmm14 #61.49 + vfmadd231sd %xmm12, %xmm12, %xmm14 #61.63 + vfmadd231sd %xmm11, %xmm11, %xmm14 #61.63 + vcomisd %xmm14, %xmm7 #71.22 + jbe ..B1.19 # Prob 50% #71.22 + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B1.18: # Preds ..B1.17 + # Execution count [1.25e+01] + vdivsd %xmm14, %xmm5, %xmm15 #75.38 + vmulsd %xmm15, %xmm6, %xmm14 #76.38 + vmulsd %xmm15, %xmm14, %xmm14 #76.44 + vmulsd %xmm15, %xmm14, %xmm14 #76.50 + vmulsd %xmm4, %xmm15, %xmm15 #77.54 + vmulsd %xmm14, %xmm15, %xmm15 #77.61 + vsubsd %xmm0, %xmm14, %xmm14 #77.54 + vmulsd %xmm14, %xmm15, %xmm15 #77.67 + vfmadd231sd %xmm12, %xmm15, %xmm8 #78.17 + vfmadd231sd %xmm15, %xmm13, %xmm9 #79.17 + vfmadd231sd %xmm15, %xmm11, %xmm10 #80.17 + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.19: # Preds ..B1.18 ..B1.17 + # Execution count [2.50e+01] + incq %r15 #56.9 + cmpq %rdx, %r15 #56.9 + jb ..B1.17 # Prob 82% #56.9 + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.20: # Preds ..B1.19 + # Execution count [4.50e+00] + movq 24(%rsp), %rcx #[spill] + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.21: # Preds ..B1.20 ..B1.15 ..B1.41 + # Execution count [4.50e+00] + vmovsd %xmm3, 192(%rsp) #71.22[spill] + vxorpd %xmm11, %xmm11, %xmm11 #47.22 + vmovsd %xmm8, %xmm11, %xmm13 #47.22 + vmovsd %xmm9, %xmm11, %xmm12 #48.22 + vmovsd %xmm10, %xmm11, %xmm11 #49.22 + vmovsd %xmm4, 200(%rsp) #71.22[spill] + vbroadcastsd %xmm3, %ymm10 #44.23 + vmovsd %xmm1, 176(%rsp) #71.22[spill] + vmovsd %xmm2, 184(%rsp) #71.22[spill] + vmovupd .L_2il0floatpacket.3(%rip), %ymm3 #71.22 + vmovupd .L_2il0floatpacket.2(%rip), %ymm4 #71.22 + vmovupd 32(%rsp), %ymm5 #71.22[spill] + vbroadcastsd %xmm2, %ymm9 #45.23 + vbroadcastsd %xmm1, %ymm8 #46.23 + movq %r8, 160(%rsp) #71.22[spill] + movq %r14, 168(%rsp) #71.22[spill] + movq %rcx, 24(%rsp) #71.22[spill] + vmovaps %xmm13, %xmm13 #47.22 + vmovaps %xmm12, %xmm12 #48.22 + vmovaps %xmm11, %xmm11 #49.22 + # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 +# LLVM-MCA-BEGIN +# OSACA-BEGIN +..B1.22: # Preds ..B1.24 ..B1.21 + # Execution count [2.50e+01] + vmovdqu (%rbx,%rdx,4), %xmm0 #57.21 + vmovq %xmm0, %rcx #57.21 + vpunpckhqdq %xmm0, %xmm0, %xmm2 #57.21 + vmovq %xmm2, %r15 #57.21 + movl %ecx, %r8d #57.21 + shrq $32, %rcx #57.21 + lea (%rcx,%rcx,2), %r14d #58.36 + lea (%r8,%r8,2), %r8d #58.36 + movslq %r8d, %rcx #58.36 + movslq %r14d, %r8 #58.36 + movl %r15d, %r14d #57.21 + shrq $32, %r15 #57.21 + vmovups (%r11,%rcx,8), %xmm7 #58.36 + vmovups (%r11,%r8,8), %xmm6 #58.36 + vmovq 16(%r11,%rcx,8), %xmm14 #58.36 + lea (%r14,%r14,2), %r14d #58.36 + movslq %r14d, %r14 #58.36 + lea (%r15,%r15,2), %r15d #58.36 + movslq %r15d, %r15 #58.36 + vmovhpd 16(%r11,%r8,8), %xmm14, %xmm15 #58.36 + vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1 #58.36 + vmovq 16(%r11,%r14,8), %xmm0 #58.36 + vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6 #58.36 + vmovhpd 16(%r11,%r15,8), %xmm0, %xmm2 #58.36 + vunpcklpd %ymm6, %ymm1, %ymm14 #58.36 + vunpckhpd %ymm6, %ymm1, %ymm1 #58.36 + vsubpd %ymm14, %ymm10, %ymm6 #58.36 + vinsertf128 $1, %xmm2, %ymm15, %ymm7 #58.36 + vsubpd %ymm1, %ymm9, %ymm2 #59.36 + vsubpd %ymm7, %ymm8, %ymm0 #60.36 + vmulpd %ymm2, %ymm2, %ymm14 #61.49 + vfmadd231pd %ymm6, %ymm6, %ymm14 #61.49 + vfmadd231pd %ymm0, %ymm0, %ymm14 #61.63 + vcmpltpd %ymm5, %ymm14, %ymm1 #71.22 + vpcmpeqd %ymm7, %ymm7, %ymm7 #71.22 + vptest %ymm7, %ymm1 #71.22 + je ..B1.24 # Prob 50% #71.22 + # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14 +..B1.23: # Preds ..B1.22 + # Execution count [1.25e+01] + vdivpd %ymm14, %ymm4, %ymm7 #75.38 + vmulpd 96(%rsp), %ymm7, %ymm14 #76.38[spill] + vmulpd %ymm14, %ymm7, %ymm14 #76.44 + vmulpd %ymm14, %ymm7, %ymm15 #76.50 + vfmsub213pd %ymm3, %ymm7, %ymm14 #77.54 + vmulpd 64(%rsp), %ymm7, %ymm7 #77.54[spill] + vmulpd %ymm7, %ymm15, %ymm15 #77.61 + vmulpd %ymm14, %ymm15, %ymm7 #77.67 + vmulpd %ymm7, %ymm6, %ymm6 #78.31 + vmulpd %ymm7, %ymm2, %ymm2 #79.31 + vandpd %ymm6, %ymm1, %ymm6 #78.31 + vaddpd %ymm6, %ymm13, %ymm13 #78.17 + vmulpd %ymm7, %ymm0, %ymm6 #80.31 + vandpd %ymm2, %ymm1, %ymm0 #79.31 + vandpd %ymm6, %ymm1, %ymm1 #80.31 + vaddpd %ymm0, %ymm12, %ymm12 #79.17 + vaddpd %ymm1, %ymm11, %ymm11 #80.17 + # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 +..B1.24: # Preds ..B1.23 ..B1.22 + # Execution count [2.50e+01] + addq $4, %rdx #56.9 + cmpq %rsi, %rdx #56.9 + jb ..B1.22 # Prob 82% #56.9 +# OSACA-END +# LLVM-MCA-END + # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 +..B1.25: # Preds ..B1.24 + # Execution count [4.50e+00] + vextractf128 $1, %ymm11, %xmm10 #49.22 + vmovsd 176(%rsp), %xmm1 #[spill] + vmovsd 184(%rsp), %xmm2 #[spill] + vaddpd %xmm10, %xmm11, %xmm9 #49.22 + vunpckhpd %xmm9, %xmm9, %xmm8 #49.22 + vmovsd 192(%rsp), %xmm3 #[spill] + vaddsd %xmm8, %xmm9, %xmm10 #49.22 + vmovsd 200(%rsp), %xmm4 #[spill] + vmovsd 136(%rsp), %xmm6 #[spill] + vmovsd 128(%rsp), %xmm7 #[spill] + movq 160(%rsp), %r8 #[spill] + movq 168(%rsp), %r14 #[spill] + movq 24(%rsp), %rcx #[spill] + vmovsd .L_2il0floatpacket.1(%rip), %xmm0 # + vmovsd .L_2il0floatpacket.4(%rip), %xmm5 # + vextractf128 $1, %ymm12, %xmm14 #48.22 + vextractf128 $1, %ymm13, %xmm8 #47.22 + vaddpd %xmm14, %xmm12, %xmm15 #48.22 + vaddpd %xmm8, %xmm13, %xmm11 #47.22 + vunpckhpd %xmm15, %xmm15, %xmm9 #48.22 + vunpckhpd %xmm11, %xmm11, %xmm12 #47.22 + vaddsd %xmm9, %xmm15, %xmm9 #48.22 + vaddsd %xmm12, %xmm11, %xmm8 #47.22 + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.26: # Preds ..B1.25 ..B1.39 + # Execution count [5.00e+00] + cmpq %r12, %rsi #56.9 + jae ..B1.32 # Prob 10% #56.9 + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.27: # Preds ..B1.26 + # Execution count [4.50e+00] + imulq 144(%rsp), %rax #42.43[spill] + addq 152(%rsp), %rax #25.5[spill] + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.28: # Preds ..B1.30 ..B1.27 + # Execution count [2.50e+01] + movl (%rax,%rsi,4), %edx #57.21 + lea (%rdx,%rdx,2), %ebx #58.36 + movslq %ebx, %rbx #58.36 + vsubsd 8(%r11,%rbx,8), %xmm2, %xmm13 #59.36 + vsubsd (%r11,%rbx,8), %xmm3, %xmm12 #58.36 + vsubsd 16(%r11,%rbx,8), %xmm1, %xmm11 #60.36 + vmulsd %xmm13, %xmm13, %xmm14 #61.49 + vfmadd231sd %xmm12, %xmm12, %xmm14 #61.63 + vfmadd231sd %xmm11, %xmm11, %xmm14 #61.63 + vcomisd %xmm14, %xmm7 #71.22 + jbe ..B1.30 # Prob 50% #71.22 + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B1.29: # Preds ..B1.28 + # Execution count [1.25e+01] + vdivsd %xmm14, %xmm5, %xmm15 #75.38 + vmulsd %xmm15, %xmm6, %xmm14 #76.38 + vmulsd %xmm15, %xmm14, %xmm14 #76.44 + vmulsd %xmm15, %xmm14, %xmm14 #76.50 + vmulsd %xmm4, %xmm15, %xmm15 #77.54 + vmulsd %xmm14, %xmm15, %xmm15 #77.61 + vsubsd %xmm0, %xmm14, %xmm14 #77.54 + vmulsd %xmm14, %xmm15, %xmm15 #77.67 + vfmadd231sd %xmm12, %xmm15, %xmm8 #78.17 + vfmadd231sd %xmm15, %xmm13, %xmm9 #79.17 + vfmadd231sd %xmm15, %xmm11, %xmm10 #80.17 + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.30: # Preds ..B1.29 ..B1.28 + # Execution count [2.50e+01] + incq %rsi #56.9 + cmpq %r12, %rsi #56.9 + jb ..B1.28 # Prob 82% #56.9 + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.32: # Preds ..B1.30 ..B1.8 ..B1.26 + # Execution count [5.00e+00] + addq %r12, %r9 #93.9 + lea 3(%r13), %eax #94.9 + sarl $1, %eax #94.9 + vaddsd (%rdi,%r8), %xmm8, %xmm1 #89.9 + vaddsd 8(%rdi,%r8), %xmm9, %xmm2 #90.9 + vaddsd 16(%rdi,%r8), %xmm10, %xmm3 #91.9 + shrl $30, %eax #94.9 + vmovsd %xmm1, (%rdi,%r8) #89.9 + vmovsd %xmm2, 8(%rdi,%r8) #90.9 + vmovsd %xmm3, 16(%rdi,%r8) #91.9 + addq $24, %rdi #41.5 + lea 3(%rax,%r13), %edx #94.9 + movslq %ecx, %rax #41.32 + sarl $2, %edx #94.9 + incq %rcx #41.5 + movslq %edx, %rdx #94.9 + incq %rax #41.32 + addq %rdx, %r10 #94.9 + cmpq 208(%rsp), %rcx #41.5[spill] + jb ..B1.8 # Prob 82% #41.5 + # LOE rax rcx rdi r8 r9 r10 r11 r14 xmm0 xmm4 xmm5 xmm6 xmm7 +..B1.33: # Preds ..B1.32 + # Execution count [9.00e-01] + movq (%rsp), %r13 #[spill] + movq 8(%rsp), %r12 #[spill] + .cfi_restore 12 + movq %r9, (%r13) #93.9 + movq %r10, 8(%r13) #94.9 + jmp ..B1.36 # Prob 100% #94.9 + # LOE r12 +..B1.34: # Preds ..B1.1 + # Execution count [5.00e-01] + xorl %eax, %eax #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.61: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.62: + # LOE r12 xmm0 +..B1.55: # Preds ..B1.34 + # Execution count [5.00e-01] + vmovsd %xmm0, 16(%rsp) #38.16[spill] + # LOE r12 +..B1.35: # Preds ..B1.55 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.64: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.65: + # LOE r12 +..B1.36: # Preds ..B1.33 ..B1.35 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #97.5 + vzeroupper #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.66: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.67: + # LOE r12 +..B1.37: # Preds ..B1.36 + # Execution count [1.00e+00] + xorl %eax, %eax #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.68: +# getTimeStamp() + call getTimeStamp #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.69: + # LOE r12 xmm0 +..B1.38: # Preds ..B1.37 + # Execution count [1.00e+00] + vsubsd 16(%rsp), %xmm0, %xmm0 #102.14[spill] + addq $224, %rsp #102.14 + .cfi_restore 3 + popq %rbx #102.14 + .cfi_restore 15 + popq %r15 #102.14 + .cfi_restore 14 + popq %r14 #102.14 + .cfi_restore 13 + popq %r13 #102.14 + movq %rbp, %rsp #102.14 + popq %rbp #102.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #102.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B1.39: # Preds ..B1.9 ..B1.12 ..B1.14 + # Execution count [4.50e-01]: Infreq + xorl %esi, %esi #56.9 + jmp ..B1.26 # Prob 100% #56.9 + # LOE rax rcx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.41: # Preds ..B1.10 + # Execution count [4.50e-01]: Infreq + movl %r13d, %esi #56.9 + xorl %edx, %edx #56.9 + andl $-4, %esi #56.9 + movslq %esi, %rsi #56.9 + jmp ..B1.21 # Prob 100% #56.9 + .cfi_restore 12 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 +..B1.43: # Preds ..B1.2 + # Execution count [1.00e+00]: Infreq + lea (%rbx,%rbx,2), %rcx #24.18 + cmpq $8, %rcx #32.5 + jl ..B1.51 # Prob 10% #32.5 + # LOE rcx rbx rdi r12 r13 r14 r15 +..B1.44: # Preds ..B1.43 + # Execution count [1.00e+00]: Infreq + movl %ecx, %eax #32.5 + xorl %edx, %edx #32.5 + andl $-8, %eax #32.5 + movslq %eax, %rax #32.5 + vxorpd %ymm0, %ymm0, %ymm0 #33.22 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 ymm0 +..B1.45: # Preds ..B1.45 ..B1.44 + # Execution count [5.56e+00]: Infreq + vmovupd %ymm0, (%rdi,%rdx,8) #33.9 + vmovupd %ymm0, 32(%rdi,%rdx,8) #33.9 + addq $8, %rdx #32.5 + cmpq %rax, %rdx #32.5 + jb ..B1.45 # Prob 82% #32.5 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 ymm0 +..B1.47: # Preds ..B1.45 ..B1.51 + # Execution count [1.11e+00]: Infreq + cmpq %rcx, %rax #32.5 + jae ..B1.5 # Prob 10% #32.5 + # LOE rax rcx rbx rdi r12 r13 r14 r15 +..B1.48: # Preds ..B1.47 + # Execution count [1.00e+00]: Infreq + xorl %edx, %edx # + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 +..B1.49: # Preds ..B1.48 ..B1.49 + # Execution count [5.56e+00]: Infreq + movq %rdx, (%rdi,%rax,8) #33.9 + incq %rax #32.5 + cmpq %rcx, %rax #32.5 + jb ..B1.49 # Prob 82% #32.5 + jmp ..B1.5 # Prob 100% #32.5 + # LOE rax rdx rcx rbx rdi r12 r13 r14 r15 +..B1.51: # Preds ..B1.43 + # Execution count [1.00e-01]: Infreq + xorl %eax, %eax #32.5 + jmp ..B1.47 # Prob 100% #32.5 + .align 16,0x90 + # LOE rax rcx rbx rdi r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_plain_c,@function + .size computeForceLJFullNeigh_plain_c,.-computeForceLJFullNeigh_plain_c +..LNcomputeForceLJFullNeigh_plain_c.0: + .data +# -- End computeForceLJFullNeigh_plain_c + .text +.L_2__routine_start_computeForceLJHalfNeigh_1: +# -- Begin computeForceLJHalfNeigh + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJHalfNeigh +# --- computeForceLJHalfNeigh(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJHalfNeigh: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B2.1: # Preds ..B2.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJHalfNeigh.86: +..L87: + #105.96 + pushq %rbp #105.96 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #105.96 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-32, %rsp #105.96 + pushq %r12 #105.96 + pushq %r13 #105.96 + pushq %r14 #105.96 + pushq %r15 #105.96 + pushq %rbx #105.96 + subq $248, %rsp #105.96 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %r12 #105.96 + movq %rsi, %r14 #105.96 + movq %rcx, %r15 #105.96 + movq %rdx, 32(%rsp) #105.96[spill] + vmovsd 144(%r12), %xmm0 #109.27 + vmulsd %xmm0, %xmm0, %xmm1 #109.45 + vmovsd 56(%r12), %xmm2 #110.23 + vmovsd 40(%r12), %xmm3 #111.24 + movl 4(%r14), %r13d #106.18 + vmovsd %xmm1, 56(%rsp) #109.45[spill] + vmovsd %xmm2, 48(%rsp) #110.23[spill] + vmovsd %xmm3, 24(%rsp) #111.24[spill] + testl %r13d, %r13d #114.24 + jle ..B2.51 # Prob 50% #114.24 + # LOE r12 r14 r15 r13d +..B2.2: # Preds ..B2.1 + # Execution count [5.00e-03] + movslq %r13d, %r13 #106.18 + movq 64(%r14), %rdi #115.9 + lea (%r13,%r13,2), %eax #106.18 + movq %r13, 40(%rsp) #106.18[spill] + cmpl $12, %eax #114.5 + jle ..B2.59 # Prob 0% #114.5 + # LOE rdi r12 r13 r14 r15 r13d +..B2.3: # Preds ..B2.2 + # Execution count [1.00e+00] + movq %r13, %rax #114.5 + xorl %esi, %esi #114.5 + lea (%rax,%rax,2), %rdx #114.5 + shlq $3, %rdx #114.5 + call __intel_avx_rep_memset #114.5 + # LOE r12 r14 r15 r13d +..B2.5: # Preds ..B2.65 ..B2.3 ..B2.63 + # Execution count [1.00e+00] + xorl %ebx, %ebx #120.22 + xorl %eax, %eax #121.16 + vzeroupper #121.16 +..___tag_value_computeForceLJHalfNeigh.101: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.102: + # LOE r12 r14 r15 ebx r13d xmm0 +..B2.70: # Preds ..B2.5 + # Execution count [1.00e+00] + vmovsd %xmm0, 16(%rsp) #121.16[spill] + # LOE r12 r14 r15 ebx r13d +..B2.6: # Preds ..B2.70 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.104: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.105: + # LOE r12 r14 r15 ebx r13d +..B2.7: # Preds ..B2.6 + # Execution count [9.00e-01] + vmovsd 24(%rsp), %xmm5 #161.41[spill] + vmovd %r13d, %xmm0 #106.18 + vmulsd .L_2il0floatpacket.0(%rip), %xmm5, %xmm5 #161.41 + xorl %r9d, %r9d #124.15 + movq 32(%rsp), %rdx #125.19[spill] + xorl %r8d, %r8d #124.5 + vmovddup 56(%rsp), %xmm8 #109.25[spill] + xorl %esi, %esi #124.5 + vmovddup 48(%rsp), %xmm4 #110.21[spill] + movslq 8(%rdx), %rax #125.43 + shlq $2, %rax #107.5 + movq 16(%rdx), %rdi #125.19 + vmovddup %xmm5, %xmm3 #161.41 + vpbroadcastd %xmm0, %xmm1 #106.18 + movq 24(%rdx), %rcx #126.25 + movq 16(%r14), %rdx #127.25 + movq %rax, 64(%rsp) #124.5[spill] + vmovsd .L_2il0floatpacket.4(%rip), %xmm7 #159.32 + vmovdqu .L_2il0floatpacket.6(%rip), %xmm9 #147.36 + vmovsd .L_2il0floatpacket.1(%rip), %xmm0 #161.54 + movq 64(%r14), %r14 #168.21 + movq (%r15), %r11 #179.9 + movq 8(%r15), %r10 #180.9 + vmovdqu %xmm1, 192(%rsp) #124.5[spill] + vmovupd %xmm3, 176(%rsp) #124.5[spill] + vmovupd %xmm4, 160(%rsp) #124.5[spill] + vmovupd %xmm8, 208(%rsp) #124.5[spill] + movq %rdi, 72(%rsp) #124.5[spill] + movl %r13d, 80(%rsp) #124.5[spill] + movq %r12, (%rsp) #124.5[spill] + movq %r15, 8(%rsp) #124.5[spill] + vmovsd 48(%rsp), %xmm6 #124.5[spill] + vmovsd 56(%rsp), %xmm2 #124.5[spill] + movq 40(%rsp), %rax #124.5[spill] + # LOE rax rdx rcx rsi r8 r9 r10 r11 r14 ebx xmm0 xmm2 xmm5 xmm6 xmm7 +..B2.8: # Preds ..B2.49 ..B2.7 + # Execution count [5.00e+00] + movl (%rcx,%r8,4), %edi #126.25 + addl %edi, %ebx #138.9 + vxorpd %xmm10, %xmm10, %xmm10 #130.22 + testl %edi, %edi #143.9 + vmovapd %xmm10, %xmm11 #131.22 + vmovsd (%rsi,%rdx), %xmm4 #127.25 + vmovapd %xmm11, %xmm12 #132.22 + vmovsd 8(%rsi,%rdx), %xmm3 #128.25 + vmovsd 16(%rsi,%rdx), %xmm1 #129.25 + jle ..B2.48 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.9: # Preds ..B2.8 + # Execution count [2.50e+00] + jbe ..B2.48 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.10: # Preds ..B2.9 + # Execution count [2.25e+00] + cmpl $2, %edi #143.9 + jb ..B2.58 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.11: # Preds ..B2.10 + # Execution count [2.25e+00] + movq 64(%rsp), %r13 #125.43[spill] + movl %edi, %r12d #143.9 + imulq %r9, %r13 #125.43 + vxorpd %xmm14, %xmm14, %xmm14 #130.22 + andl $-2, %r12d #143.9 + vmovapd %xmm14, %xmm13 #131.22 + addq 72(%rsp), %r13 #107.5[spill] + xorl %r15d, %r15d #143.9 + vmovddup %xmm4, %xmm10 #127.23 + vmovapd %xmm13, %xmm11 #132.22 + vmovddup %xmm3, %xmm9 #128.23 + vmovddup %xmm1, %xmm8 #129.23 + movslq %r12d, %r12 #143.9 + vmovsd %xmm1, 128(%rsp) #143.9[spill] + vmovsd %xmm3, 136(%rsp) #143.9[spill] + vmovsd %xmm4, 144(%rsp) #143.9[spill] + vmovsd %xmm5, 152(%rsp) #143.9[spill] + movq %r9, 24(%rsp) #143.9[spill] + movl %edi, 32(%rsp) #143.9[spill] + movq %rsi, 88(%rsp) #143.9[spill] + movq %r10, 96(%rsp) #143.9[spill] + movq %r11, 104(%rsp) #143.9[spill] + movq %rcx, 112(%rsp) #143.9[spill] + movq %r8, 120(%rsp) #143.9[spill] + vmovdqu .L_2il0floatpacket.6(%rip), %xmm6 #143.9 + vmovdqu .L_2il0floatpacket.5(%rip), %xmm7 #143.9 + # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.12: # Preds ..B2.38 ..B2.11 + # Execution count [1.25e+01] + vmovq (%r13,%r15,4), %xmm4 #144.21 + vpaddd %xmm4, %xmm4, %xmm0 #145.36 + vpaddd %xmm0, %xmm4, %xmm1 #145.36 + vmovd %xmm1, %r9d #145.36 + vpaddd %xmm7, %xmm1, %xmm12 #146.36 + vpshufd $57, %xmm1, %xmm2 #145.36 + vpshufd $57, %xmm12, %xmm15 #146.36 + vmovd %xmm2, %r8d #145.36 + vmovd %xmm12, %edi #146.36 + vmovd %xmm15, %ecx #146.36 + movslq %r9d, %r9 #145.36 + movslq %r8d, %r8 #145.36 + movslq %edi, %rdi #146.36 + movslq %ecx, %rcx #146.36 + vmovsd (%rdx,%r9,8), %xmm3 #145.36 + vmovhpd (%rdx,%r8,8), %xmm3, %xmm5 #145.36 + vsubpd %xmm5, %xmm10, %xmm0 #145.36 + vpaddd %xmm6, %xmm1, %xmm5 #147.36 + vmovd %xmm5, %eax #147.36 + vpshufd $57, %xmm5, %xmm1 #147.36 + vmovsd (%rdx,%rdi,8), %xmm2 #146.36 + vmovd %xmm1, %r10d #147.36 + vmovhpd (%rdx,%rcx,8), %xmm2, %xmm3 #146.36 + vpcmpeqd %xmm1, %xmm1, %xmm1 #158.22 + vsubpd %xmm3, %xmm9, %xmm2 #146.36 + movslq %eax, %rax #147.36 + movslq %r10d, %r10 #147.36 + vmovsd (%rdx,%rax,8), %xmm12 #147.36 + vmovhpd (%rdx,%r10,8), %xmm12, %xmm15 #147.36 + vsubpd %xmm15, %xmm8, %xmm3 #147.36 + vmulpd %xmm2, %xmm2, %xmm15 #148.49 + vfmadd231pd %xmm0, %xmm0, %xmm15 #148.49 + vfmadd231pd %xmm3, %xmm3, %xmm15 #148.63 + vcmpltpd 208(%rsp), %xmm15, %xmm5 #158.22[spill] + vptest %xmm1, %xmm5 #158.22 + je ..B2.38 # Prob 50% #158.22 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 xmm15 +..B2.13: # Preds ..B2.12 + # Execution count [6.25e+00] + vmovupd .L_2il0floatpacket.7(%rip), %xmm12 #159.38 + vdivpd %xmm15, %xmm12, %xmm1 #159.38 + vmovdqu 192(%rsp), %xmm12 #167.24[spill] + vpcmpeqd %xmm15, %xmm15, %xmm15 #167.24 + vpcmpgtd %xmm4, %xmm12, %xmm4 #167.24 + vmulpd 160(%rsp), %xmm1, %xmm12 #160.38[spill] + vmulpd %xmm12, %xmm1, %xmm12 #160.44 + vpmovsxdq %xmm4, %xmm4 #167.24 + vandpd %xmm4, %xmm5, %xmm4 #167.24 + vptest %xmm15, %xmm4 #167.24 + vmulpd %xmm12, %xmm1, %xmm15 #160.50 + vfmsub213pd .L_2il0floatpacket.8(%rip), %xmm1, %xmm12 #161.54 + vmulpd 176(%rsp), %xmm1, %xmm1 #161.54[spill] + vmulpd %xmm1, %xmm15, %xmm1 #161.61 + vmulpd %xmm12, %xmm1, %xmm15 #161.67 + vmulpd %xmm15, %xmm0, %xmm12 #162.31 + vmulpd %xmm15, %xmm2, %xmm1 #163.31 + vmulpd %xmm15, %xmm3, %xmm0 #164.31 + vandpd %xmm12, %xmm5, %xmm2 #162.31 + vandpd %xmm1, %xmm5, %xmm3 #163.31 + vandpd %xmm0, %xmm5, %xmm5 #164.31 + vaddpd %xmm2, %xmm14, %xmm14 #162.17 + vaddpd %xmm3, %xmm13, %xmm13 #163.17 + vaddpd %xmm5, %xmm11, %xmm11 #164.17 + je ..B2.38 # Prob 50% #167.24 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 xmm0 xmm1 xmm4 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.14: # Preds ..B2.13 + # Execution count [3.12e+00] + vmovmskpd %xmm4, %esi #168.21 + movl %esi, %r11d #168.21 + andl $2, %r11d #168.21 + andl $1, %esi #168.21 + je ..B2.17 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.15: # Preds ..B2.14 + # Execution count [3.12e+00] + vmovsd (%r14,%r9,8), %xmm2 #168.21 + testl %r11d, %r11d #168.21 + jne ..B2.18 # Prob 60% #168.21 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.16: # Preds ..B2.15 + # Execution count [1.25e+00] + vxorpd %xmm3, %xmm3, %xmm3 #168.21 + vunpcklpd %xmm3, %xmm2, %xmm4 #168.21 + vsubpd %xmm12, %xmm4, %xmm2 #168.21 + jmp ..B2.31 # Prob 100% #168.21 + # LOE rax rdx rbx rdi r9 r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.17: # Preds ..B2.14 + # Execution count [3.12e+00] + testl %r11d, %r11d #168.21 + vxorpd %xmm2, %xmm2, %xmm2 #168.21 + je ..B2.30 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.18: # Preds ..B2.15 ..B2.17 + # Execution count [3.12e+00] + vmovhpd (%r14,%r8,8), %xmm2, %xmm3 #168.21 + testl %esi, %esi #168.21 + vsubpd %xmm12, %xmm3, %xmm2 #168.21 + je ..B2.20 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.19: # Preds ..B2.18 + # Execution count [1.88e+00] + vpshufd $14, %xmm2, %xmm3 #168.21 + vmovsd %xmm2, (%r14,%r9,8) #168.21 + vmovsd %xmm3, (%r14,%r8,8) #168.21 + vmovsd (%r14,%rdi,8), %xmm2 #169.21 + jmp ..B2.21 # Prob 100% #169.21 + # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.20: # Preds ..B2.18 + # Execution count [1.25e+00] + vpshufd $14, %xmm2, %xmm2 #168.21 + vmovsd %xmm2, (%r14,%r8,8) #168.21 + vxorpd %xmm2, %xmm2, %xmm2 #169.21 + # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.21: # Preds ..B2.19 ..B2.20 + # Execution count [1.88e+00] + testl %r11d, %r11d #169.21 + je ..B2.74 # Prob 40% #169.21 + # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.22: # Preds ..B2.21 + # Execution count [3.12e+00] + vmovhpd (%r14,%rcx,8), %xmm2, %xmm3 #169.21 + testl %esi, %esi #169.21 + vsubpd %xmm1, %xmm3, %xmm1 #169.21 + je ..B2.24 # Prob 40% #169.21 + # LOE rax rdx rcx rbx rdi r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.23: # Preds ..B2.22 + # Execution count [1.88e+00] + vpshufd $14, %xmm1, %xmm2 #169.21 + vmovsd %xmm1, (%r14,%rdi,8) #169.21 + vmovsd %xmm2, (%r14,%rcx,8) #169.21 + vmovsd (%r14,%rax,8), %xmm1 #170.21 + jmp ..B2.25 # Prob 100% #170.21 + # LOE rax rdx rbx r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.24: # Preds ..B2.22 + # Execution count [1.25e+00] + vpshufd $14, %xmm1, %xmm1 #169.21 + vmovsd %xmm1, (%r14,%rcx,8) #169.21 + vxorpd %xmm1, %xmm1, %xmm1 #170.21 + # LOE rax rdx rbx r10 r12 r13 r14 r15 esi r11d xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.25: # Preds ..B2.23 ..B2.24 + # Execution count [1.88e+00] + testl %r11d, %r11d #170.21 + je ..B2.73 # Prob 40% #170.21 + # LOE rax rdx rbx r10 r12 r13 r14 r15 esi xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.26: # Preds ..B2.25 + # Execution count [3.12e+00] + vmovhpd (%r14,%r10,8), %xmm1, %xmm2 #170.21 + testl %esi, %esi #170.21 + vsubpd %xmm0, %xmm2, %xmm0 #170.21 + je ..B2.28 # Prob 40% #170.21 + # LOE rax rdx rbx r10 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.27: # Preds ..B2.26 + # Execution count [1.88e+00] + vmovsd %xmm0, (%r14,%rax,8) #170.21 + vpshufd $14, %xmm0, %xmm0 #170.21 + jmp ..B2.29 # Prob 100% #170.21 + # LOE rdx rbx r10 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.28: # Preds ..B2.26 + # Execution count [1.25e+00] + vpshufd $14, %xmm0, %xmm0 #170.21 + # LOE rdx rbx r10 r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.29: # Preds ..B2.27 ..B2.28 + # Execution count [3.12e+00] + vmovsd %xmm0, (%r14,%r10,8) #170.21 + jmp ..B2.38 # Prob 100% #170.21 + # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.30: # Preds ..B2.17 + # Execution count [1.88e+00] + testl %esi, %esi #168.21 + vxorpd %xmm2, %xmm2, %xmm2 #168.21 + vsubpd %xmm12, %xmm2, %xmm2 #168.21 + je ..B2.32 # Prob 40% #168.21 + # LOE rax rdx rbx rdi r9 r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.31: # Preds ..B2.16 ..B2.30 + # Execution count [1.25e+00] + vmovsd %xmm2, (%r14,%r9,8) #168.21 + vmovsd (%r14,%rdi,8), %xmm3 #169.21 + vxorpd %xmm4, %xmm4, %xmm4 #169.21 + vunpcklpd %xmm4, %xmm3, %xmm5 #169.21 + vsubpd %xmm1, %xmm5, %xmm1 #169.21 + jmp ..B2.34 # Prob 100% #169.21 + # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.32: # Preds ..B2.30 + # Execution count [0.00e+00] + vxorpd %xmm2, %xmm2, %xmm2 #169.21 + jmp ..B2.33 # Prob 100% #169.21 + # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.74: # Preds ..B2.21 + # Execution count [7.50e-01] + testl %esi, %esi #168.21 + # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm2 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.33: # Preds ..B2.32 ..B2.74 + # Execution count [2.67e+00] + vxorpd %xmm3, %xmm3, %xmm3 #169.21 + vunpcklpd %xmm3, %xmm2, %xmm4 #169.21 + vsubpd %xmm1, %xmm4, %xmm1 #169.21 + je ..B2.35 # Prob 40% #169.21 + # LOE rax rdx rbx rdi r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.34: # Preds ..B2.31 ..B2.33 + # Execution count [1.25e+00] + vmovsd %xmm1, (%r14,%rdi,8) #169.21 + vmovsd (%r14,%rax,8), %xmm2 #170.21 + vxorpd %xmm3, %xmm3, %xmm3 #170.21 + vunpcklpd %xmm3, %xmm2, %xmm4 #170.21 + vsubpd %xmm0, %xmm4, %xmm0 #170.21 + jmp ..B2.37 # Prob 100% #170.21 + # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.35: # Preds ..B2.33 + # Execution count [0.00e+00] + vxorpd %xmm1, %xmm1, %xmm1 #170.21 + jmp ..B2.36 # Prob 100% #170.21 + # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.73: # Preds ..B2.25 + # Execution count [7.50e-01] + testl %esi, %esi #168.21 + # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.36: # Preds ..B2.35 ..B2.73 + # Execution count [2.67e+00] + vxorpd %xmm2, %xmm2, %xmm2 #170.21 + vunpcklpd %xmm2, %xmm1, %xmm3 #170.21 + vsubpd %xmm0, %xmm3, %xmm0 #170.21 + je ..B2.38 # Prob 40% #170.21 + # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.37: # Preds ..B2.34 ..B2.36 + # Execution count [1.25e+00] + vmovsd %xmm0, (%r14,%rax,8) #170.21 + # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.38: # Preds ..B2.36 ..B2.29 ..B2.37 ..B2.13 ..B2.12 + # + # Execution count [1.25e+01] + addq $2, %r15 #143.9 + cmpq %r12, %r15 #143.9 + jb ..B2.12 # Prob 82% #143.9 + # LOE rdx rbx r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 +..B2.39: # Preds ..B2.38 + # Execution count [2.25e+00] + vunpckhpd %xmm11, %xmm11, %xmm12 #132.22 + vunpckhpd %xmm14, %xmm14, %xmm8 #130.22 + vaddsd %xmm12, %xmm11, %xmm12 #132.22 + vaddsd %xmm8, %xmm14, %xmm10 #130.22 + vunpckhpd %xmm13, %xmm13, %xmm11 #131.22 + vmovsd 128(%rsp), %xmm1 #[spill] + vaddsd %xmm11, %xmm13, %xmm11 #131.22 + vmovsd 136(%rsp), %xmm3 #[spill] + vmovsd 144(%rsp), %xmm4 #[spill] + vmovsd 152(%rsp), %xmm5 #[spill] + vmovsd 48(%rsp), %xmm6 #[spill] + vmovsd 56(%rsp), %xmm2 #[spill] + movq 24(%rsp), %r9 #[spill] + movl 32(%rsp), %edi #[spill] + movq 88(%rsp), %rsi #[spill] + movq 96(%rsp), %r10 #[spill] + movq 104(%rsp), %r11 #[spill] + movq 112(%rsp), %rcx #[spill] + movq 120(%rsp), %r8 #[spill] + movq 40(%rsp), %rax #[spill] + vmovsd .L_2il0floatpacket.1(%rip), %xmm0 # + vmovsd .L_2il0floatpacket.4(%rip), %xmm7 # + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.40: # Preds ..B2.39 ..B2.58 + # Execution count [2.50e+00] + movslq %edi, %r13 #143.9 + cmpq %r13, %r12 #143.9 + jae ..B2.49 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.41: # Preds ..B2.40 + # Execution count [2.25e+00] + imulq 64(%rsp), %r9 #125.43[spill] + addq 72(%rsp), %r9 #107.5[spill] + movl 80(%rsp), %eax #107.5[spill] + movq %r8, 120(%rsp) #107.5[spill] + # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.42: # Preds ..B2.45 ..B2.41 + # Execution count [1.25e+01] + movl (%r9,%r12,4), %r8d #144.21 + lea (%r8,%r8,2), %r15d #145.36 + movslq %r15d, %r15 #145.36 + vsubsd 8(%rdx,%r15,8), %xmm3, %xmm9 #146.36 + vsubsd (%rdx,%r15,8), %xmm4, %xmm14 #145.36 + vsubsd 16(%rdx,%r15,8), %xmm1, %xmm8 #147.36 + vmulsd %xmm9, %xmm9, %xmm13 #148.49 + vfmadd231sd %xmm14, %xmm14, %xmm13 #148.63 + vfmadd231sd %xmm8, %xmm8, %xmm13 #148.63 + vcomisd %xmm13, %xmm2 #158.22 + jbe ..B2.45 # Prob 50% #158.22 + # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 r15 eax edi r8d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.43: # Preds ..B2.42 + # Execution count [6.25e+00] + vdivsd %xmm13, %xmm7, %xmm15 #159.38 + vmulsd %xmm15, %xmm6, %xmm13 #160.38 + vmulsd %xmm15, %xmm13, %xmm13 #160.44 + vmulsd %xmm15, %xmm13, %xmm13 #160.50 + vmulsd %xmm5, %xmm15, %xmm15 #161.54 + vmulsd %xmm13, %xmm15, %xmm15 #161.61 + vsubsd %xmm0, %xmm13, %xmm13 #161.54 + vmulsd %xmm13, %xmm15, %xmm15 #161.67 + vmulsd %xmm15, %xmm14, %xmm13 #162.31 + vmulsd %xmm15, %xmm9, %xmm9 #163.31 + vmulsd %xmm15, %xmm8, %xmm8 #164.31 + vaddsd %xmm13, %xmm10, %xmm10 #162.17 + vaddsd %xmm9, %xmm11, %xmm11 #163.17 + vaddsd %xmm8, %xmm12, %xmm12 #164.17 + cmpl %eax, %r8d #167.24 + jge ..B2.45 # Prob 50% #167.24 + # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 r15 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.44: # Preds ..B2.43 + # Execution count [3.12e+00] + vmovsd 8(%r14,%r15,8), %xmm15 #169.21 + vmovsd (%r14,%r15,8), %xmm14 #168.21 + vsubsd %xmm9, %xmm15, %xmm9 #169.21 + vsubsd %xmm13, %xmm14, %xmm13 #168.21 + vmovsd %xmm9, 8(%r14,%r15,8) #169.21 + vmovsd 16(%r14,%r15,8), %xmm9 #170.21 + vmovsd %xmm13, (%r14,%r15,8) #168.21 + vsubsd %xmm8, %xmm9, %xmm8 #170.21 + vmovsd %xmm8, 16(%r14,%r15,8) #170.21 + # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.45: # Preds ..B2.44 ..B2.43 ..B2.42 + # Execution count [1.25e+01] + incq %r12 #143.9 + cmpq %r13, %r12 #143.9 + jb ..B2.42 # Prob 82% #143.9 + # LOE rdx rcx rbx rsi r9 r10 r11 r12 r13 r14 eax edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.46: # Preds ..B2.45 + # Execution count [2.25e+00] + movq 120(%rsp), %r8 #[spill] + movq 40(%rsp), %rax #[spill] + jmp ..B2.49 # Prob 100% # + # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 edi xmm0 xmm2 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.48: # Preds ..B2.9 ..B2.8 + # Execution count [2.50e+00] + movslq %edi, %r13 #179.9 + # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 edi xmm0 xmm2 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.49: # Preds ..B2.46 ..B2.40 ..B2.48 + # Execution count [5.00e+00] + addq %r13, %r11 #179.9 + lea 3(%rdi), %r9d #180.9 + sarl $1, %r9d #180.9 + vaddsd (%rsi,%r14), %xmm10, %xmm1 #175.9 + vaddsd 8(%rsi,%r14), %xmm11, %xmm3 #176.9 + vaddsd 16(%rsi,%r14), %xmm12, %xmm4 #177.9 + shrl $30, %r9d #180.9 + vmovsd %xmm1, (%rsi,%r14) #175.9 + vmovsd %xmm3, 8(%rsi,%r14) #176.9 + vmovsd %xmm4, 16(%rsi,%r14) #177.9 + addq $24, %rsi #124.5 + lea 3(%r9,%rdi), %edi #180.9 + movslq %r8d, %r9 #124.32 + sarl $2, %edi #180.9 + incq %r8 #124.5 + movslq %edi, %rdi #180.9 + incq %r9 #124.32 + addq %rdi, %r10 #180.9 + cmpq %rax, %r8 #124.5 + jb ..B2.8 # Prob 82% #124.5 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r14 xmm0 xmm2 xmm5 xmm6 xmm7 +..B2.50: # Preds ..B2.49 + # Execution count [9.00e-01] + movq 8(%rsp), %r15 #[spill] + movq (%rsp), %r12 #[spill] + movq %r11, (%r15) #179.9 + movq %r10, 8(%r15) #180.9 + jmp ..B2.54 # Prob 100% #180.9 + # LOE rbx r12 +..B2.51: # Preds ..B2.1 + # Execution count [5.00e-01] + xorl %ebx, %ebx #120.22 + xorl %eax, %eax #121.16 +..___tag_value_computeForceLJHalfNeigh.161: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.162: + # LOE rbx r12 xmm0 +..B2.71: # Preds ..B2.51 + # Execution count [5.00e-01] + vmovsd %xmm0, 16(%rsp) #121.16[spill] + # LOE rbx r12 +..B2.52: # Preds ..B2.71 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.164: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.165: + # LOE rbx r12 +..B2.54: # Preds ..B2.52 ..B2.50 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #183.5 +..___tag_value_computeForceLJHalfNeigh.166: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #183.5 +..___tag_value_computeForceLJHalfNeigh.167: + # LOE rbx r12 +..B2.55: # Preds ..B2.54 + # Execution count [1.00e+00] + xorl %eax, %eax #184.16 +..___tag_value_computeForceLJHalfNeigh.168: +# getTimeStamp() + call getTimeStamp #184.16 +..___tag_value_computeForceLJHalfNeigh.169: + # LOE rbx r12 xmm0 +..B2.56: # Preds ..B2.55 + # Execution count [1.00e+00] + vxorpd %xmm4, %xmm4, %xmm4 #185.5 + vcvtsi2sdq %rbx, %xmm4, %xmm4 #185.5 + vsubsd 16(%rsp), %xmm0, %xmm1 #185.94[spill] + vmovsd .L_2il0floatpacket.9(%rip), %xmm3 #185.5 + movl $.L_2__STRING.2, %edi #185.5 + vdivsd %xmm4, %xmm3, %xmm5 #185.5 + vmulsd %xmm1, %xmm5, %xmm6 #185.5 + movl %ebx, %esi #185.5 + vmovsd 264(%r12), %xmm7 #185.74 + movl $3, %eax #185.5 + vmulsd %xmm7, %xmm6, %xmm2 #185.5 + vmovapd %xmm7, %xmm0 #185.5 + vmovsd %xmm1, (%rsp) #185.5[spill] +..___tag_value_computeForceLJHalfNeigh.171: +# printf(const char *__restrict__, ...) + call printf #185.5 +..___tag_value_computeForceLJHalfNeigh.172: + # LOE +..B2.57: # Preds ..B2.56 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm1 #[spill] + vmovapd %xmm1, %xmm0 #186.14 + addq $248, %rsp #186.14 + .cfi_restore 3 + popq %rbx #186.14 + .cfi_restore 15 + popq %r15 #186.14 + .cfi_restore 14 + popq %r14 #186.14 + .cfi_restore 13 + popq %r13 #186.14 + .cfi_restore 12 + popq %r12 #186.14 + movq %rbp, %rsp #186.14 + popq %rbp #186.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #186.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B2.58: # Preds ..B2.10 + # Execution count [2.25e-01]: Infreq + xorl %r12d, %r12d #143.9 + jmp ..B2.40 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 edi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.59: # Preds ..B2.2 + # Execution count [1.00e+00]: Infreq + movq %r13, %rax #106.18 + lea (%rax,%rax,2), %rcx #106.18 + cmpq $8, %rcx #114.5 + jl ..B2.67 # Prob 10% #114.5 + # LOE rcx rdi r12 r14 r15 r13d +..B2.60: # Preds ..B2.59 + # Execution count [1.00e+00]: Infreq + movl %ecx, %eax #114.5 + xorl %edx, %edx #114.5 + andl $-8, %eax #114.5 + movslq %eax, %rax #114.5 + vxorpd %ymm0, %ymm0, %ymm0 #115.22 + # LOE rax rdx rcx rdi r12 r14 r15 r13d ymm0 +..B2.61: # Preds ..B2.61 ..B2.60 + # Execution count [5.56e+00]: Infreq + vmovupd %ymm0, (%rdi,%rdx,8) #115.9 + vmovupd %ymm0, 32(%rdi,%rdx,8) #115.9 + addq $8, %rdx #114.5 + cmpq %rax, %rdx #114.5 + jb ..B2.61 # Prob 82% #114.5 + # LOE rax rdx rcx rdi r12 r14 r15 r13d ymm0 +..B2.63: # Preds ..B2.61 ..B2.67 + # Execution count [1.11e+00]: Infreq + cmpq %rcx, %rax #114.5 + jae ..B2.5 # Prob 10% #114.5 + # LOE rax rcx rdi r12 r14 r15 r13d +..B2.64: # Preds ..B2.63 + # Execution count [1.00e+00]: Infreq + xorl %edx, %edx # + # LOE rax rdx rcx rdi r12 r14 r15 r13d +..B2.65: # Preds ..B2.64 ..B2.65 + # Execution count [5.56e+00]: Infreq + movq %rdx, (%rdi,%rax,8) #115.9 + incq %rax #114.5 + cmpq %rcx, %rax #114.5 + jb ..B2.65 # Prob 82% #114.5 + jmp ..B2.5 # Prob 100% #114.5 + # LOE rax rdx rcx rdi r12 r14 r15 r13d +..B2.67: # Preds ..B2.59 + # Execution count [1.00e-01]: Infreq + xorl %eax, %eax #114.5 + jmp ..B2.63 # Prob 100% #114.5 + .align 16,0x90 + # LOE rax rcx rdi r12 r14 r15 r13d + .cfi_endproc +# mark_end; + .type computeForceLJHalfNeigh,@function + .size computeForceLJHalfNeigh,.-computeForceLJHalfNeigh +..LNcomputeForceLJHalfNeigh.1: + .data +# -- End computeForceLJHalfNeigh + .text +.L_2__routine_start_computeForceLJFullNeigh_simd_2: +# -- Begin computeForceLJFullNeigh_simd + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_simd +# --- computeForceLJFullNeigh_simd(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_simd: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B3.1: # Preds ..B3.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_simd.190: +..L191: + #189.101 + pushq %rbp #189.101 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #189.101 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-32, %rsp #189.101 + movl 4(%rsi), %edx #190.18 + testl %edx, %edx #196.24 + jle ..B3.4 # Prob 50% #196.24 + # LOE rbx rsi r12 r13 r14 r15 edx +..B3.2: # Preds ..B3.1 + # Execution count [5.00e-03] + movq 64(%rsi), %rdi #197.9 + lea (%rdx,%rdx,2), %eax #190.18 + cmpl $12, %eax #196.5 + jle ..B3.8 # Prob 0% #196.5 + # LOE rbx rdi r12 r13 r14 r15 edx +..B3.3: # Preds ..B3.2 + # Execution count [1.00e+00] + movslq %edx, %rdx #196.5 + xorl %esi, %esi #196.5 + lea (%rdx,%rdx,2), %rdx #196.5 + shlq $3, %rdx #196.5 + call __intel_avx_rep_memset #196.5 + # LOE rbx r12 r13 r14 r15 +..B3.4: # Preds ..B3.14 ..B3.1 ..B3.12 ..B3.3 + # Execution count [1.00e+00] + xorl %eax, %eax #203.16 + vzeroupper #203.16 +..___tag_value_computeForceLJFullNeigh_simd.195: +# getTimeStamp() + call getTimeStamp #203.16 +..___tag_value_computeForceLJFullNeigh_simd.196: + # LOE rbx r12 r13 r14 r15 +..B3.5: # Preds ..B3.4 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #204.5 +..___tag_value_computeForceLJFullNeigh_simd.197: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #204.5 +..___tag_value_computeForceLJFullNeigh_simd.198: + # LOE +..B3.6: # Preds ..B3.5 + # Execution count [1.00e+00] + movl $il0_peep_printf_format_0, %edi #207.5 + movq stderr(%rip), %rsi #207.5 + call fputs #207.5 + # LOE +..B3.7: # Preds ..B3.6 + # Execution count [1.00e+00] + movl $-1, %edi #208.5 +# exit(int) + call exit #208.5 + # LOE +..B3.8: # Preds ..B3.2 + # Execution count [1.00e+00]: Infreq + movslq %edx, %rdx #196.5 + lea (%rdx,%rdx,2), %rsi #190.18 + cmpq $8, %rsi #196.5 + jl ..B3.16 # Prob 10% #196.5 + # LOE rbx rsi rdi r12 r13 r14 r15 +..B3.9: # Preds ..B3.8 + # Execution count [1.00e+00]: Infreq + movl %esi, %edx #196.5 + xorl %ecx, %ecx #196.5 + andl $-8, %edx #196.5 + xorl %eax, %eax #196.5 + movslq %edx, %rdx #196.5 + vxorpd %ymm0, %ymm0, %ymm0 #197.22 + # LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ymm0 +..B3.10: # Preds ..B3.10 ..B3.9 + # Execution count [5.56e+00]: Infreq + vmovupd %ymm0, (%rdi,%rcx,8) #197.9 + vmovupd %ymm0, 32(%rdi,%rcx,8) #197.9 + addq $8, %rcx #196.5 + cmpq %rdx, %rcx #196.5 + jb ..B3.10 # Prob 82% #196.5 + # LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ymm0 +..B3.12: # Preds ..B3.10 ..B3.16 + # Execution count [1.11e+00]: Infreq + cmpq %rsi, %rdx #196.5 + jae ..B3.4 # Prob 10% #196.5 + # LOE rax rdx rbx rsi rdi r12 r13 r14 r15 +..B3.14: # Preds ..B3.12 ..B3.14 + # Execution count [5.56e+00]: Infreq + movq %rax, (%rdi,%rdx,8) #197.9 + incq %rdx #196.5 + cmpq %rsi, %rdx #196.5 + jb ..B3.14 # Prob 82% #196.5 + jmp ..B3.4 # Prob 100% #196.5 + # LOE rax rdx rbx rsi rdi r12 r13 r14 r15 +..B3.16: # Preds ..B3.8 + # Execution count [1.00e-01]: Infreq + xorl %edx, %edx #196.5 + xorl %eax, %eax #196.5 + jmp ..B3.12 # Prob 100% #196.5 + .align 16,0x90 + # LOE rax rdx rbx rsi rdi r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_simd,@function + .size computeForceLJFullNeigh_simd,.-computeForceLJFullNeigh_simd +..LNcomputeForceLJFullNeigh_simd.2: + .section .rodata.str1.32, "aMS",@progbits,1 + .align 32 + .align 32 +il0_peep_printf_format_0: + .long 1869771333 + .long 1394621042 + .long 541347145 + .long 1852990827 + .long 1847618661 + .long 1763734639 + .long 1701605485 + .long 1953391981 + .long 1713398885 + .long 1931506287 + .long 1768121712 + .long 1684367718 + .long 1936615712 + .long 1668641396 + .long 1852795252 + .long 1952805664 + .word 33 + .data +# -- End computeForceLJFullNeigh_simd + .section .rodata, "a" + .align 32 + .align 32 +.L_2il0floatpacket.2: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,32 + .align 32 +.L_2il0floatpacket.3: + .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,32 + .align 16 +.L_2il0floatpacket.5: + .long 0x00000001,0x00000001,0x00000001,0x00000001 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,16 + .align 16 +.L_2il0floatpacket.6: + .long 0x00000002,0x00000002,0x00000002,0x00000002 + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,16 + .align 16 +.L_2il0floatpacket.7: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.7,@object + .size .L_2il0floatpacket.7,16 + .align 16 +.L_2il0floatpacket.8: + .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000 + .type .L_2il0floatpacket.8,@object + .size .L_2il0floatpacket.8,16 + .align 8 +.L_2il0floatpacket.0: + .long 0x00000000,0x40480000 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,8 + .align 8 +.L_2il0floatpacket.1: + .long 0x00000000,0x3fe00000 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,8 + .align 8 +.L_2il0floatpacket.4: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,8 + .align 8 +.L_2il0floatpacket.9: + .long 0x00000000,0x41cdcd65 + .type .L_2il0floatpacket.9,@object + .size .L_2il0floatpacket.9,8 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +.L_2__STRING.0: + .long 1668444006 + .word 101 + .type .L_2__STRING.0,@object + .size .L_2__STRING.0,6 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.1: + .long 1668444006 + .long 759843941 + .long 1718378856 + .long 1734960494 + .word 104 + .type .L_2__STRING.1,@object + .size .L_2__STRING.1,18 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.2: + .long 980644937 + .long 544548128 + .long 1701987872 + .long 622869105 + .long 1411391590 + .long 979725673 + .long 174466336 + .long 1764718915 + .long 622869108 + .long 1747460198 + .long 761687137 + .long 1734960494 + .long 665960 + .type .L_2__STRING.2,@object + .size .L_2__STRING.2,52 + .data + .section .note.GNU-stack, "" +# End diff --git a/static_analysis/jan/icx-icc-lammps-avx512.s b/static_analysis/jan/icx-icc-lammps-avx512.s new file mode 100644 index 0000000..47960ac --- /dev/null +++ b/static_analysis/jan/icx-icc-lammps-avx512.s @@ -0,0 +1,1659 @@ +# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; +# mark_description "0226_000000"; +# mark_description "-I/apps/likwid/5.2.2/include -I././lammps/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GNU"; +# mark_description "_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=8 -D__SIMD_KERNEL__ -D__ISA_AVX5"; +# mark_description "12__ -DENABLE_OMP_SIMD -DALIGNMENT=64 -restrict -Ofast -xCORE-AVX512 -qopt-zmm-usage=high -o build-lammps-IC"; +# mark_description "C-AVX512-DP/force_lj.s"; + .file "force_lj.c" + .text +..TXTST0: +.L_2__routine_start_computeForceLJFullNeigh_plain_c_0: +# -- Begin computeForceLJFullNeigh_plain_c + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_plain_c +# --- computeForceLJFullNeigh_plain_c(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_plain_c: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_plain_c.1: +..L2: + #23.104 + pushq %rbp #23.104 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #23.104 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #23.104 + pushq %r13 #23.104 + pushq %r14 #23.104 + pushq %r15 #23.104 + pushq %rbx #23.104 + subq $96, %rsp #23.104 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + movq %rsi, %r13 #23.104 + vmovsd 144(%rdi), %xmm0 #27.27 + movq %rcx, %r14 #23.104 + vmovsd 56(%rdi), %xmm1 #28.23 + movq %rdx, %rbx #23.104 + vmovsd 40(%rdi), %xmm2 #29.24 + movl 4(%r13), %r15d #24.18 + vmovsd %xmm0, 32(%rsp) #27.27[spill] + vmovsd %xmm1, 16(%rsp) #28.23[spill] + vmovsd %xmm2, 24(%rsp) #29.24[spill] + testl %r15d, %r15d #32.24 + jle ..B1.27 # Prob 50% #32.24 + # LOE rbx r12 r13 r14 r15d +..B1.2: # Preds ..B1.1 + # Execution count [5.00e-03] + movq 64(%r13), %rdi #33.9 + lea (%r15,%r15,2), %esi #24.18 + cmpl $12, %esi #32.5 + jle ..B1.34 # Prob 0% #32.5 + # LOE rbx rdi r12 r13 r14 esi r15d +..B1.3: # Preds ..B1.2 + # Execution count [1.00e+00] + movslq %r15d, %r15 #32.5 + xorl %esi, %esi #32.5 + lea (%r15,%r15,2), %rdx #32.5 + shlq $3, %rdx #32.5 + call __intel_skx_avx512_memset #32.5 + # LOE rbx r12 r13 r14 r15 +..B1.4: # Preds ..B1.3 ..B1.46 ..B1.39 + # Execution count [1.00e+00] + xorl %eax, %eax #38.16 + vzeroupper #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.13: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.14: + # LOE rbx r12 r13 r14 r15 xmm0 +..B1.43: # Preds ..B1.4 + # Execution count [1.00e+00] + vmovsd %xmm0, 40(%rsp) #38.16[spill] + # LOE rbx r12 r13 r14 r15 +..B1.5: # Preds ..B1.43 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.16: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.17: + # LOE rbx r12 r13 r14 r15 +..B1.6: # Preds ..B1.5 + # Execution count [9.00e-01] + vmovsd 32(%rsp), %xmm13 #27.45[spill] + xorl %esi, %esi #41.15 + vmovsd 24(%rsp), %xmm0 #77.41[spill] + xorl %edi, %edi #41.5 + vmulsd %xmm13, %xmm13, %xmm14 #27.45 + xorl %eax, %eax #41.5 + vmovdqu32 .L_2il0floatpacket.0(%rip), %ymm16 #56.9 + vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm1 #77.41 + vmovdqu .L_2il0floatpacket.1(%rip), %ymm15 #56.9 + vmovups .L_2il0floatpacket.4(%rip), %zmm5 #77.54 + vbroadcastsd %xmm14, %zmm14 #27.25 + vbroadcastsd 16(%rsp), %zmm13 #28.21[spill] + vbroadcastsd %xmm1, %zmm12 #77.41 + movq 24(%rbx), %r11 #43.25 + movq 64(%r13), %r10 #89.9 + movq 16(%rbx), %r9 #42.19 + movslq 8(%rbx), %r8 #42.43 + shlq $2, %r8 #25.5 + movq 16(%r13), %rbx #44.25 + movq (%r14), %rdx #93.9 + movq 8(%r14), %rcx #94.9 + movq %r10, 48(%rsp) #41.5[spill] + movq %r11, 56(%rsp) #41.5[spill] + movq %r15, 64(%rsp) #41.5[spill] + movq %r14, (%rsp) #41.5[spill] + movq %r12, 8(%rsp) #41.5[spill] + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22 + # LOE rax rdx rcx rbx rsi rdi r8 r9 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 +..B1.7: # Preds ..B1.25 ..B1.6 + # Execution count [5.00e+00] + movq 56(%rsp), %r10 #43.25[spill] + vxorpd %xmm24, %xmm24, %xmm24 #47.22 + vmovapd %xmm24, %xmm18 #48.22 + movl (%r10,%rdi,4), %r13d #43.25 + vmovapd %xmm18, %xmm4 #49.22 + vmovsd (%rax,%rbx), %xmm11 #44.25 + vmovsd 8(%rax,%rbx), %xmm6 #45.25 + vmovsd 16(%rax,%rbx), %xmm7 #46.25 + testl %r13d, %r13d #56.28 + jle ..B1.25 # Prob 50% #56.28 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r13d xmm4 xmm6 xmm7 xmm11 xmm18 xmm24 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 +..B1.8: # Preds ..B1.7 + # Execution count [4.50e+00] + vpxord %zmm10, %zmm10, %zmm10 #47.22 + vmovaps %zmm10, %zmm9 #48.22 + vmovaps %zmm9, %zmm8 #49.22 + cmpl $8, %r13d #56.9 + jl ..B1.33 # Prob 10% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.9: # Preds ..B1.8 + # Execution count [4.50e+00] + cmpl $1200, %r13d #56.9 + jl ..B1.32 # Prob 10% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.10: # Preds ..B1.9 + # Execution count [4.50e+00] + movq %r8, %r10 #42.43 + imulq %rsi, %r10 #42.43 + addq %r9, %r10 #25.5 + movq %r10, %r12 #56.9 + andq $63, %r12 #56.9 + testl $3, %r12d #56.9 + je ..B1.12 # Prob 50% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.11: # Preds ..B1.10 + # Execution count [2.25e+00] + xorl %r12d, %r12d #56.9 + jmp ..B1.14 # Prob 100% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.12: # Preds ..B1.10 + # Execution count [2.25e+00] + testl %r12d, %r12d #56.9 + je ..B1.14 # Prob 50% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.13: # Preds ..B1.12 + # Execution count [2.50e+01] + negl %r12d #56.9 + addl $64, %r12d #56.9 + shrl $2, %r12d #56.9 + cmpl %r12d, %r13d #56.9 + cmovl %r13d, %r12d #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.14: # Preds ..B1.11 ..B1.13 ..B1.12 + # Execution count [5.00e+00] + movl %r13d, %r11d #56.9 + subl %r12d, %r11d #56.9 + andl $7, %r11d #56.9 + negl %r11d #56.9 + addl %r13d, %r11d #56.9 + cmpl $1, %r12d #56.9 + jb ..B1.18 # Prob 50% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.15: # Preds ..B1.14 + # Execution count [4.50e+00] + vmovdqa32 %ymm16, %ymm4 #56.9 + xorl %r15d, %r15d #56.9 + vpbroadcastd %r12d, %ymm3 #56.9 + vbroadcastsd %xmm11, %zmm2 #44.23 + vbroadcastsd %xmm6, %zmm1 #45.23 + vbroadcastsd %xmm7, %zmm0 #46.23 + movslq %r12d, %r14 #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r15 r11d r12d r13d xmm6 xmm7 xmm11 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# LLVM-MCA-BEGIN +# pointer_increment=64 55c62179dea305ceefda0dbc87792a60 +..B1.16: # Preds ..B1.16 ..B1.15 + # Execution count [2.50e+01] + vpcmpgtd %ymm4, %ymm3, %k5 #56.9 + vpaddd %ymm15, %ymm4, %ymm4 #56.9 + vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z} #57.21 + vpaddd %ymm17, %ymm17, %ymm18 #58.36 + addq $8, %r15 #56.9 + vpaddd %ymm18, %ymm17, %ymm19 #58.36 + kmovw %k5, %k2 #58.36 + kmovw %k5, %k3 #58.36 + kmovw %k5, %k1 #58.36 + vpxord %zmm21, %zmm21, %zmm21 #58.36 + vpxord %zmm20, %zmm20, %zmm20 #58.36 + vpxord %zmm22, %zmm22, %zmm22 #58.36 + vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2} #58.36 + vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3} #58.36 + vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1} #58.36 + vsubpd %zmm21, %zmm1, %zmm18 #59.36 + vsubpd %zmm20, %zmm2, %zmm17 #58.36 + vsubpd %zmm22, %zmm0, %zmm19 #60.36 + vmulpd %zmm18, %zmm18, %zmm31 #61.49 + vfmadd231pd %zmm17, %zmm17, %zmm31 #61.49 + vfmadd231pd %zmm19, %zmm19, %zmm31 #61.63 + vrcp14pd %zmm31, %zmm30 #75.38 + vcmppd $1, %zmm14, %zmm31, %k6{%k5} #71.22 + vfpclasspd $30, %zmm30, %k0 #75.38 + vmovaps %zmm31, %zmm23 #75.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.38 + knotw %k0, %k4 #75.38 + vmulpd %zmm23, %zmm23, %zmm24 #75.38 + vfmadd213pd %zmm30, %zmm23, %zmm30{%k4} #75.38 + vfmadd213pd %zmm30, %zmm24, %zmm30{%k4} #75.38 + vmulpd %zmm13, %zmm30, %zmm25 #76.38 + vmulpd %zmm12, %zmm30, %zmm27 #77.54 + vmulpd %zmm25, %zmm30, %zmm28 #76.44 + vmulpd %zmm28, %zmm30, %zmm26 #76.50 + vfmsub213pd %zmm5, %zmm28, %zmm30 #77.54 + vmulpd %zmm27, %zmm26, %zmm29 #77.61 + vmulpd %zmm30, %zmm29, %zmm23 #77.67 + vfmadd231pd %zmm17, %zmm23, %zmm10{%k6} #78.17 + vfmadd231pd %zmm18, %zmm23, %zmm9{%k6} #79.17 + vfmadd231pd %zmm19, %zmm23, %zmm8{%k6} #80.17 + cmpq %r14, %r15 #56.9 + jb ..B1.16 # Prob 82% #56.9 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r15 r11d r12d r13d xmm6 xmm7 xmm11 ymm3 ymm4 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.17: # Preds ..B1.16 + # Execution count [4.50e+00] + cmpl %r12d, %r13d #56.9 + je ..B1.24 # Prob 10% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.18: # Preds ..B1.17 ..B1.14 ..B1.32 + # Execution count [2.50e+01] + lea 8(%r12), %r10d #56.9 + cmpl %r10d, %r11d #56.9 + jl ..B1.22 # Prob 50% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.19: # Preds ..B1.18 + # Execution count [4.50e+00] + movq %r8, %r10 #42.43 + imulq %rsi, %r10 #42.43 + vbroadcastsd %xmm11, %zmm2 #44.23 + vbroadcastsd %xmm6, %zmm1 #45.23 + vbroadcastsd %xmm7, %zmm0 #46.23 + movslq %r12d, %r14 #56.9 + addq %r9, %r10 #25.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.20: # Preds ..B1.20 ..B1.19 + # Execution count [2.50e+01] + vmovdqu (%r10,%r14,4), %ymm3 #57.21 + addl $8, %r12d #56.9 + vpcmpeqb %xmm0, %xmm0, %k2 #58.36 + vpcmpeqb %xmm0, %xmm0, %k3 #58.36 + vpcmpeqb %xmm0, %xmm0, %k1 #58.36 + vpaddd %ymm3, %ymm3, %ymm4 #58.36 + vpaddd %ymm4, %ymm3, %ymm17 #58.36 + addq $8, %r14 #56.9 + vpxord %zmm19, %zmm19, %zmm19 #58.36 + vpxord %zmm18, %zmm18, %zmm18 #58.36 + vpxord %zmm20, %zmm20, %zmm20 #58.36 + vgatherdpd 8(%rbx,%ymm17,8), %zmm19{%k2} #58.36 + vgatherdpd (%rbx,%ymm17,8), %zmm18{%k3} #58.36 + vgatherdpd 16(%rbx,%ymm17,8), %zmm20{%k1} #58.36 + vsubpd %zmm19, %zmm1, %zmm30 #59.36 + vsubpd %zmm18, %zmm2, %zmm29 #58.36 + vsubpd %zmm20, %zmm0, %zmm3 #60.36 + vmulpd %zmm30, %zmm30, %zmm21 #61.49 + vfmadd231pd %zmm29, %zmm29, %zmm21 #61.49 + vfmadd231pd %zmm3, %zmm3, %zmm21 #61.63 + vrcp14pd %zmm21, %zmm28 #75.38 + vcmppd $1, %zmm14, %zmm21, %k5 #71.22 + vfpclasspd $30, %zmm28, %k0 #75.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm28, %zmm21 #75.38 + knotw %k0, %k4 #75.38 + vmulpd %zmm21, %zmm21, %zmm22 #75.38 + vfmadd213pd %zmm28, %zmm21, %zmm28{%k4} #75.38 + vfmadd213pd %zmm28, %zmm22, %zmm28{%k4} #75.38 + vmulpd %zmm13, %zmm28, %zmm23 #76.38 + vmulpd %zmm12, %zmm28, %zmm25 #77.54 + vmulpd %zmm23, %zmm28, %zmm26 #76.44 + vmulpd %zmm26, %zmm28, %zmm24 #76.50 + vfmsub213pd %zmm5, %zmm26, %zmm28 #77.54 + vmulpd %zmm25, %zmm24, %zmm27 #77.61 + vmulpd %zmm28, %zmm27, %zmm31 #77.67 + vfmadd231pd %zmm29, %zmm31, %zmm10{%k5} #78.17 + vfmadd231pd %zmm30, %zmm31, %zmm9{%k5} #79.17 + vfmadd231pd %zmm3, %zmm31, %zmm8{%k5} #80.17 + cmpl %r11d, %r12d #56.9 + jb ..B1.20 # Prob 82% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r14 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm0 zmm1 zmm2 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.22: # Preds ..B1.20 ..B1.18 ..B1.33 + # Execution count [5.00e+00] + lea 1(%r11), %r10d #56.9 + cmpl %r13d, %r10d #56.9 + ja ..B1.24 # Prob 50% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.23: # Preds ..B1.22 + # Execution count [2.50e+01] + imulq %r8, %rsi #42.43 + vbroadcastsd %xmm7, %zmm17 #46.23 + vbroadcastsd %xmm6, %zmm4 #45.23 + vbroadcastsd %xmm11, %zmm2 #44.23 + movl %r13d, %r10d #56.9 + addq %r9, %rsi #25.5 + subl %r11d, %r10d #56.9 + vpbroadcastd %r10d, %ymm7 #56.9 + vpcmpgtd %ymm16, %ymm7, %k5 #56.9 + movslq %r11d, %r11 #56.9 + kmovw %k5, %k2 #58.36 + kmovw %k5, %k3 #58.36 + kmovw %k5, %k1 #58.36 + vmovdqu32 (%rsi,%r11,4), %ymm6{%k5}{z} #57.21 + vpaddd %ymm6, %ymm6, %ymm0 #58.36 + vpaddd %ymm0, %ymm6, %ymm1 #58.36 + vpxord %zmm11, %zmm11, %zmm11 #58.36 + vpxord %zmm3, %zmm3, %zmm3 #58.36 + vpxord %zmm18, %zmm18, %zmm18 #58.36 + vgatherdpd 8(%rbx,%ymm1,8), %zmm11{%k2} #58.36 + vgatherdpd (%rbx,%ymm1,8), %zmm3{%k3} #58.36 + vgatherdpd 16(%rbx,%ymm1,8), %zmm18{%k1} #58.36 + vsubpd %zmm11, %zmm4, %zmm29 #59.36 + vsubpd %zmm3, %zmm2, %zmm28 #58.36 + vsubpd %zmm18, %zmm17, %zmm31 #60.36 + vmulpd %zmm29, %zmm29, %zmm27 #61.49 + vfmadd231pd %zmm28, %zmm28, %zmm27 #61.49 + vfmadd231pd %zmm31, %zmm31, %zmm27 #61.63 + vrcp14pd %zmm27, %zmm26 #75.38 + vcmppd $1, %zmm14, %zmm27, %k6{%k5} #71.22 + vfpclasspd $30, %zmm26, %k0 #75.38 + vmovaps %zmm27, %zmm19 #75.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #75.38 + knotw %k0, %k4 #75.38 + vmulpd %zmm19, %zmm19, %zmm20 #75.38 + vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #75.38 + vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #75.38 + vmulpd %zmm13, %zmm26, %zmm21 #76.38 + vmulpd %zmm12, %zmm26, %zmm23 #77.54 + vmulpd %zmm21, %zmm26, %zmm24 #76.44 + vmulpd %zmm24, %zmm26, %zmm22 #76.50 + vfmsub213pd %zmm5, %zmm24, %zmm26 #77.54 + vmulpd %zmm23, %zmm22, %zmm25 #77.61 + vmulpd %zmm26, %zmm25, %zmm30 #77.67 + vfmadd231pd %zmm28, %zmm30, %zmm10{%k6} #78.17 + vfmadd231pd %zmm29, %zmm30, %zmm9{%k6} #79.17 + vfmadd231pd %zmm31, %zmm30, %zmm8{%k6} #80.17 + # LOE rax rdx rcx rbx rdi r8 r9 r13d ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.24: # Preds ..B1.17 ..B1.23 ..B1.22 + # Execution count [4.50e+00] + vmovups .L_2il0floatpacket.6(%rip), %zmm19 #49.22 + vpermd %zmm8, %zmm19, %zmm0 #49.22 + vpermd %zmm9, %zmm19, %zmm6 #48.22 + vpermd %zmm10, %zmm19, %zmm20 #47.22 + vaddpd %zmm8, %zmm0, %zmm8 #49.22 + vaddpd %zmm9, %zmm6, %zmm9 #48.22 + vaddpd %zmm10, %zmm20, %zmm10 #47.22 + vpermpd $78, %zmm8, %zmm1 #49.22 + vpermpd $78, %zmm9, %zmm7 #48.22 + vpermpd $78, %zmm10, %zmm21 #47.22 + vaddpd %zmm1, %zmm8, %zmm2 #49.22 + vaddpd %zmm7, %zmm9, %zmm11 #48.22 + vaddpd %zmm21, %zmm10, %zmm22 #47.22 + vpermpd $177, %zmm2, %zmm3 #49.22 + vpermpd $177, %zmm11, %zmm17 #48.22 + vpermpd $177, %zmm22, %zmm23 #47.22 + vaddpd %zmm3, %zmm2, %zmm4 #49.22 + vaddpd %zmm17, %zmm11, %zmm18 #48.22 + vaddpd %zmm23, %zmm22, %zmm24 #47.22 + # LOE rax rdx rcx rbx rdi r8 r9 r13d xmm4 xmm18 xmm24 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 +..B1.25: # Preds ..B1.24 ..B1.7 + # Execution count [5.00e+00] + movslq %r13d, %r13 #93.9 + movq 48(%rsp), %rsi #89.9[spill] + lea 7(%r13), %r10d #94.9 + sarl $2, %r10d #94.9 + addq %r13, %rdx #93.9 + shrl $29, %r10d #94.9 + vaddsd (%rax,%rsi), %xmm24, %xmm0 #89.9 + vaddsd 8(%rax,%rsi), %xmm18, %xmm1 #90.9 + vaddsd 16(%rax,%rsi), %xmm4, %xmm2 #91.9 + vmovsd %xmm0, (%rax,%rsi) #89.9 + lea 7(%r10,%r13), %r11d #94.9 + sarl $3, %r11d #94.9 + vmovsd %xmm1, 8(%rax,%rsi) #90.9 + vmovsd %xmm2, 16(%rax,%rsi) #91.9 + addq $24, %rax #41.5 + movslq %r11d, %r11 #94.9 + movslq %edi, %rsi #41.32 + incq %rdi #41.5 + addq %r11, %rcx #94.9 + incq %rsi #41.32 + cmpq 64(%rsp), %rdi #41.5[spill] + jb ..B1.7 # Prob 82% #41.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 ymm15 ymm16 zmm5 zmm12 zmm13 zmm14 +..B1.26: # Preds ..B1.25 + # Execution count [9.00e-01] + movq (%rsp), %r14 #[spill] + movq 8(%rsp), %r12 #[spill] + .cfi_restore 12 + movq %rdx, (%r14) #93.9 + movq %rcx, 8(%r14) #94.9 + jmp ..B1.29 # Prob 100% #94.9 + # LOE r12 +..B1.27: # Preds ..B1.1 + # Execution count [5.00e-01] + xorl %eax, %eax #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.33: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.34: + # LOE r12 xmm0 +..B1.44: # Preds ..B1.27 + # Execution count [5.00e-01] + vmovsd %xmm0, 40(%rsp) #38.16[spill] + # LOE r12 +..B1.28: # Preds ..B1.44 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.36: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.37: + # LOE r12 +..B1.29: # Preds ..B1.26 ..B1.28 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #97.5 + vzeroupper #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.38: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.39: + # LOE r12 +..B1.30: # Preds ..B1.29 + # Execution count [1.00e+00] + xorl %eax, %eax #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.40: +# getTimeStamp() + call getTimeStamp #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.41: + # LOE r12 xmm0 +..B1.31: # Preds ..B1.30 + # Execution count [1.00e+00] + vsubsd 40(%rsp), %xmm0, %xmm0 #102.14[spill] + addq $96, %rsp #102.14 + .cfi_restore 3 + popq %rbx #102.14 + .cfi_restore 15 + popq %r15 #102.14 + .cfi_restore 14 + popq %r14 #102.14 + .cfi_restore 13 + popq %r13 #102.14 + movq %rbp, %rsp #102.14 + popq %rbp #102.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #102.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x88, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + # LOE +..B1.32: # Preds ..B1.9 + # Execution count [4.50e-01]: Infreq + movl %r13d, %r11d #56.9 + xorl %r12d, %r12d #56.9 + andl $-8, %r11d #56.9 + jmp ..B1.18 # Prob 100% #56.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r12d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.33: # Preds ..B1.8 + # Execution count [4.50e-01]: Infreq + xorl %r11d, %r11d #56.9 + jmp ..B1.22 # Prob 100% #56.9 + .cfi_restore 12 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r11d r13d xmm6 xmm7 xmm11 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm12 zmm13 zmm14 +..B1.34: # Preds ..B1.2 + # Execution count [1.00e+00]: Infreq + cmpl $8, %esi #32.5 + jl ..B1.40 # Prob 10% #32.5 + # LOE rbx rdi r12 r13 r14 esi r15d +..B1.35: # Preds ..B1.34 + # Execution count [1.00e+00]: Infreq + movl %esi, %eax #32.5 + xorl %ecx, %ecx #32.5 + andl $-8, %eax #32.5 + movslq %eax, %rdx #32.5 + vpxord %zmm0, %zmm0, %zmm0 #33.22 + # LOE rdx rcx rbx rdi r12 r13 r14 eax esi r15d zmm0 +..B1.36: # Preds ..B1.36 ..B1.35 + # Execution count [5.56e+00]: Infreq + vmovupd %zmm0, (%rdi,%rcx,8) #33.9 + addq $8, %rcx #32.5 + cmpq %rdx, %rcx #32.5 + jb ..B1.36 # Prob 82% #32.5 + # LOE rdx rcx rbx rdi r12 r13 r14 eax esi r15d zmm0 +..B1.38: # Preds ..B1.36 ..B1.40 + # Execution count [1.11e+00]: Infreq + lea 1(%rax), %edx #32.5 + cmpl %esi, %edx #32.5 + ja ..B1.46 # Prob 50% #32.5 + # LOE rbx rdi r12 r13 r14 eax esi r15d +..B1.39: # Preds ..B1.38 + # Execution count [5.56e+00]: Infreq + subl %eax, %esi #32.5 + vpbroadcastd %esi, %ymm0 #32.5 + vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k1 #32.5 + movslq %eax, %rax #32.5 + movslq %r15d, %r15 #32.5 + vpxord %zmm1, %zmm1, %zmm1 #33.22 + vmovupd %zmm1, (%rdi,%rax,8){%k1} #33.9 + jmp ..B1.4 # Prob 100% #33.9 + # LOE rbx r12 r13 r14 r15 +..B1.40: # Preds ..B1.34 + # Execution count [1.00e-01]: Infreq + xorl %eax, %eax #32.5 + jmp ..B1.38 # Prob 100% #32.5 + # LOE rbx rdi r12 r13 r14 eax esi r15d +..B1.46: # Preds ..B1.38 + # Execution count [5.56e-01]: Infreq + movslq %r15d, %r15 #32.5 + jmp ..B1.4 # Prob 100% #32.5 + .align 16,0x90 + # LOE rbx r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_plain_c,@function + .size computeForceLJFullNeigh_plain_c,.-computeForceLJFullNeigh_plain_c +..LNcomputeForceLJFullNeigh_plain_c.0: + .data +# -- End computeForceLJFullNeigh_plain_c + .text +.L_2__routine_start_computeForceLJHalfNeigh_1: +# -- Begin computeForceLJHalfNeigh + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJHalfNeigh +# --- computeForceLJHalfNeigh(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJHalfNeigh: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B2.1: # Preds ..B2.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJHalfNeigh.58: +..L59: + #105.96 + pushq %rbp #105.96 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #105.96 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #105.96 + pushq %r12 #105.96 + pushq %r13 #105.96 + pushq %r14 #105.96 + pushq %r15 #105.96 + pushq %rbx #105.96 + subq $88, %rsp #105.96 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rsi, %rbx #105.96 + vmovsd 144(%rdi), %xmm0 #109.27 + movq %rcx, %r13 #105.96 + vmovsd 56(%rdi), %xmm1 #110.23 + vmovsd 40(%rdi), %xmm2 #111.24 + movl 4(%rbx), %r15d #106.18 + movq %rdx, 48(%rsp) #105.96[spill] + movq %rdi, 16(%rsp) #105.96[spill] + vmovsd %xmm0, 32(%rsp) #109.27[spill] + vmovsd %xmm1, 24(%rsp) #110.23[spill] + vmovsd %xmm2, 40(%rsp) #111.24[spill] + testl %r15d, %r15d #114.24 + jle ..B2.28 # Prob 50% #114.24 + # LOE rbx r13 r15d +..B2.2: # Preds ..B2.1 + # Execution count [5.00e-03] + movq 64(%rbx), %rdi #115.9 + lea (%r15,%r15,2), %esi #106.18 + cmpl $12, %esi #114.5 + jle ..B2.36 # Prob 0% #114.5 + # LOE rbx rdi r13 esi r15d +..B2.3: # Preds ..B2.2 + # Execution count [1.00e+00] + movslq %r15d, %r14 #114.5 + xorl %esi, %esi #114.5 + lea (%r14,%r14,2), %rdx #114.5 + shlq $3, %rdx #114.5 + call __intel_skx_avx512_memset #114.5 + # LOE rbx r13 r14 r15d +..B2.4: # Preds ..B2.3 ..B2.48 ..B2.41 + # Execution count [1.00e+00] + xorl %r12d, %r12d #120.22 + xorl %eax, %eax #121.16 + vzeroupper #121.16 +..___tag_value_computeForceLJHalfNeigh.73: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.74: + # LOE rbx r13 r14 r12d r15d xmm0 +..B2.45: # Preds ..B2.4 + # Execution count [1.00e+00] + vmovsd %xmm0, 8(%rsp) #121.16[spill] + # LOE rbx r13 r14 r12d r15d +..B2.5: # Preds ..B2.45 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.76: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.77: + # LOE rbx r13 r14 r12d r15d +..B2.6: # Preds ..B2.5 + # Execution count [9.00e-01] + vmovsd 32(%rsp), %xmm9 #109.45[spill] + xorl %edi, %edi #124.15 + vmovsd 40(%rsp), %xmm0 #161.41[spill] + xorl %r9d, %r9d #124.5 + vmulsd %xmm9, %xmm9, %xmm10 #109.45 + vmovdqu .L_2il0floatpacket.0(%rip), %ymm14 #143.9 + vmulsd .L_2il0floatpacket.3(%rip), %xmm0, %xmm1 #161.41 + vmovdqu .L_2il0floatpacket.1(%rip), %ymm13 #143.9 + vmovdqu .L_2il0floatpacket.7(%rip), %ymm12 #146.36 + vmovdqu .L_2il0floatpacket.8(%rip), %ymm11 #147.36 + vmovups .L_2il0floatpacket.4(%rip), %zmm5 #161.54 + vpbroadcastd %r15d, %ymm4 #106.18 + vbroadcastsd %xmm10, %zmm10 #109.25 + vbroadcastsd 24(%rsp), %zmm9 #110.21[spill] + vbroadcastsd %xmm1, %zmm7 #161.41 + movq 48(%rsp), %rax #125.19[spill] + movq 16(%rbx), %r11 #127.25 + movq 64(%rbx), %rdx #168.21 + movq 24(%rax), %r15 #126.25 + movslq 8(%rax), %r8 #125.43 + movq 16(%rax), %r10 #125.19 + xorl %eax, %eax #124.5 + shlq $2, %r8 #107.5 + movq (%r13), %rcx #179.9 + movq 8(%r13), %rbx #180.9 + movq %r15, 56(%rsp) #124.5[spill] + movq %r14, 64(%rsp) #124.5[spill] + movq %r13, (%rsp) #124.5[spill] + vpxord %zmm15, %zmm15, %zmm15 #124.5 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 +..B2.7: # Preds ..B2.26 ..B2.6 + # Execution count [5.00e+00] + movq 56(%rsp), %r13 #126.25[spill] + vxorpd %xmm27, %xmm27, %xmm27 #130.22 + vmovapd %xmm27, %xmm21 #131.22 + movl (%r13,%r9,4), %r13d #126.25 + addl %r13d, %r12d #138.9 + vmovsd (%rax,%r11), %xmm1 #127.25 + vmovapd %xmm21, %xmm3 #132.22 + vmovsd 8(%rax,%r11), %xmm0 #128.25 + vmovsd 16(%rax,%r11), %xmm2 #129.25 + testl %r13d, %r13d #143.9 + jle ..B2.26 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 +..B2.8: # Preds ..B2.7 + # Execution count [2.50e+00] + jbe ..B2.26 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 +..B2.9: # Preds ..B2.8 + # Execution count [2.25e+00] + vmovaps %zmm15, %zmm8 #130.22 + vmovaps %zmm8, %zmm6 #131.22 + vmovaps %zmm6, %zmm3 #132.22 + cmpl $8, %r13d #143.9 + jb ..B2.35 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.10: # Preds ..B2.9 + # Execution count [2.25e+00] + cmpl $1200, %r13d #143.9 + jb ..B2.34 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.11: # Preds ..B2.10 + # Execution count [2.25e+00] + movq %r8, %rsi #125.43 + imulq %rdi, %rsi #125.43 + addq %r10, %rsi #107.5 + movq %rsi, %r14 #143.9 + andq $63, %r14 #143.9 + testl $3, %r14d #143.9 + je ..B2.13 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.12: # Preds ..B2.11 + # Execution count [1.12e+00] + movl %r13d, %r15d #143.9 + xorl %r14d, %r14d #143.9 + andl $7, %r15d #143.9 + negl %r15d #143.9 + addl %r13d, %r15d #143.9 + jmp ..B2.19 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.13: # Preds ..B2.11 + # Execution count [1.12e+00] + testl %r14d, %r14d #143.9 + je ..B2.18 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.14: # Preds ..B2.13 + # Execution count [1.25e+01] + negl %r14d #143.9 + movl %r13d, %r15d #143.9 + addl $64, %r14d #143.9 + shrl $2, %r14d #143.9 + cmpl %r14d, %r13d #143.9 + cmovb %r13d, %r14d #143.9 + subl %r14d, %r15d #143.9 + andl $7, %r15d #143.9 + negl %r15d #143.9 + addl %r13d, %r15d #143.9 + cmpl $1, %r14d #143.9 + jb ..B2.19 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.15: # Preds ..B2.14 + # Execution count [2.25e+00] + vpbroadcastd %r14d, %ymm28 #143.9 + vbroadcastsd %xmm1, %zmm27 #127.23 + vbroadcastsd %xmm0, %zmm26 #128.23 + vbroadcastsd %xmm2, %zmm25 #129.23 + movslq %r14d, %r14 #143.9 + movq $0, 40(%rsp) #143.9[spill] + movq %r9, 24(%rsp) #143.9[spill] + movq %rdi, 32(%rsp) #143.9[spill] + vmovdqa32 %ymm14, %ymm29 #143.9 + movq %r14, %rdi #143.9 + movq 40(%rsp), %r9 #143.9[spill] + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 ymm28 ymm29 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm25 zmm26 zmm27 +..B2.16: # Preds ..B2.16 ..B2.15 + # Execution count [1.25e+01] + vpcmpud $1, %ymm28, %ymm29, %k4 #143.9 + vpaddd %ymm13, %ymm29, %ymm29 #143.9 + vmovdqu32 (%rsi,%r9,4), %ymm21{%k4}{z} #144.21 + vpaddd %ymm21, %ymm21, %ymm30 #145.36 + addq $8, %r9 #143.9 + vpcmpgtd %ymm21, %ymm4, %k6 #167.24 + vpaddd %ymm30, %ymm21, %ymm24 #145.36 + kmovw %k4, %k2 #145.36 + kmovw %k4, %k3 #145.36 + kmovw %k4, %k1 #145.36 + vpxord %zmm16, %zmm16, %zmm16 #145.36 + vpxord %zmm31, %zmm31, %zmm31 #145.36 + vpxord %zmm20, %zmm20, %zmm20 #145.36 + vpaddd %ymm12, %ymm24, %ymm17 #146.36 + vgatherdpd 8(%r11,%ymm24,8), %zmm16{%k2} #145.36 + vgatherdpd (%r11,%ymm24,8), %zmm31{%k3} #145.36 + vgatherdpd 16(%r11,%ymm24,8), %zmm20{%k1} #145.36 + vsubpd %zmm16, %zmm26, %zmm22 #146.36 + vsubpd %zmm31, %zmm27, %zmm23 #145.36 + vsubpd %zmm20, %zmm25, %zmm20 #147.36 + vmulpd %zmm22, %zmm22, %zmm18 #148.49 + vpaddd %ymm11, %ymm24, %ymm16 #147.36 + vfmadd231pd %zmm23, %zmm23, %zmm18 #148.49 + vfmadd231pd %zmm20, %zmm20, %zmm18 #148.63 + vrcp14pd %zmm18, %zmm19 #159.38 + vcmppd $1, %zmm10, %zmm18, %k7{%k4} #158.22 + vfpclasspd $30, %zmm19, %k0 #159.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm19, %zmm18 #159.38 + knotw %k0, %k5 #159.38 + kandw %k6, %k7, %k6 #167.24 + vmulpd %zmm18, %zmm18, %zmm21 #159.38 + vfmadd213pd %zmm19, %zmm18, %zmm19{%k5} #159.38 + vfmadd213pd %zmm19, %zmm21, %zmm19{%k5} #159.38 + vmulpd %zmm9, %zmm19, %zmm30 #160.38 + vmulpd %zmm30, %zmm19, %zmm18 #160.44 + vmulpd %zmm18, %zmm19, %zmm31 #160.50 + vfmsub213pd %zmm5, %zmm19, %zmm18 #161.54 + vmulpd %zmm7, %zmm19, %zmm19 #161.54 + vmulpd %zmm19, %zmm31, %zmm19 #161.61 + vmulpd %zmm18, %zmm19, %zmm21 #161.67 + vmovaps %zmm15, %zmm18 #168.21 + kmovw %k6, %k1 #168.21 + vfmadd231pd %zmm23, %zmm21, %zmm8{%k7} #162.17 + vfmadd231pd %zmm22, %zmm21, %zmm6{%k7} #163.17 + vfmadd231pd %zmm20, %zmm21, %zmm3{%k7} #164.17 + .byte 144 #168.21 + vgatherdpd (%rdx,%ymm24,8), %zmm18{%k1} #168.21 + vfnmadd213pd %zmm18, %zmm21, %zmm23 #168.21 + kmovw %k6, %k2 #168.21 + vscatterdpd %zmm23, (%rdx,%ymm24,8){%k2} #168.21 + vmovaps %zmm15, %zmm23 #169.21 + kmovw %k6, %k3 #169.21 + kmovw %k6, %k4 #169.21 + kmovw %k6, %k5 #170.21 + vgatherdpd (%rdx,%ymm17,8), %zmm23{%k3} #169.21 + vfnmadd213pd %zmm23, %zmm21, %zmm22 #169.21 + vscatterdpd %zmm22, (%rdx,%ymm17,8){%k4} #169.21 + vmovaps %zmm15, %zmm17 #170.21 + vgatherdpd (%rdx,%ymm16,8), %zmm17{%k5} #170.21 + vfnmadd213pd %zmm17, %zmm21, %zmm20 #170.21 + vscatterdpd %zmm20, (%rdx,%ymm16,8){%k6} #170.21 + cmpq %rdi, %r9 #143.9 + jb ..B2.16 # Prob 82% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 ymm28 ymm29 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm25 zmm26 zmm27 +..B2.17: # Preds ..B2.16 + # Execution count [2.25e+00] + movq 24(%rsp), %r9 #[spill] + movq 32(%rsp), %rdi #[spill] + cmpl %r14d, %r13d #143.9 + je ..B2.25 # Prob 10% #143.9 + jmp ..B2.19 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.18: # Preds ..B2.13 + # Execution count [5.62e-01] + movl %r13d, %r15d #143.9 + andl $7, %r15d #143.9 + negl %r15d #143.9 + addl %r13d, %r15d #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.19: # Preds ..B2.12 ..B2.18 ..B2.17 ..B2.14 ..B2.34 + # + # Execution count [1.25e+01] + lea 8(%r14), %esi #143.9 + cmpl %esi, %r15d #143.9 + jb ..B2.23 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.20: # Preds ..B2.19 + # Execution count [2.25e+00] + movq %r8, %rsi #125.43 + imulq %rdi, %rsi #125.43 + vbroadcastsd %xmm1, %zmm26 #127.23 + vbroadcastsd %xmm0, %zmm25 #128.23 + vbroadcastsd %xmm2, %zmm23 #129.23 + movslq %r14d, %r14 #143.9 + addq %r10, %rsi #107.5 + movq %rdi, 32(%rsp) #107.5[spill] + movq %r14, %rdi #107.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm23 zmm25 zmm26 +..B2.21: # Preds ..B2.21 ..B2.20 + # Execution count [1.25e+01] + vmovdqu32 (%rsi,%rdi,4), %ymm24 #144.21 + addl $8, %r14d #143.9 + vpcmpeqb %xmm0, %xmm0, %k2 #145.36 + vpcmpeqb %xmm0, %xmm0, %k3 #145.36 + vpcmpeqb %xmm0, %xmm0, %k1 #145.36 + vpcmpgtd %ymm24, %ymm4, %k6 #167.24 + vpaddd %ymm24, %ymm24, %ymm27 #145.36 + vpaddd %ymm27, %ymm24, %ymm20 #145.36 + addq $8, %rdi #143.9 + vpxord %zmm29, %zmm29, %zmm29 #145.36 + vpxord %zmm28, %zmm28, %zmm28 #145.36 + vpxord %zmm30, %zmm30, %zmm30 #145.36 + vpaddd %ymm20, %ymm12, %ymm21 #146.36 + vpaddd %ymm20, %ymm11, %ymm18 #147.36 + vgatherdpd 8(%r11,%ymm20,8), %zmm29{%k2} #145.36 + vgatherdpd (%r11,%ymm20,8), %zmm28{%k3} #145.36 + vgatherdpd 16(%r11,%ymm20,8), %zmm30{%k1} #145.36 + vsubpd %zmm29, %zmm25, %zmm19 #146.36 + vsubpd %zmm28, %zmm26, %zmm22 #145.36 + vsubpd %zmm30, %zmm23, %zmm17 #147.36 + vmulpd %zmm19, %zmm19, %zmm31 #148.49 + vfmadd231pd %zmm22, %zmm22, %zmm31 #148.49 + vfmadd231pd %zmm17, %zmm17, %zmm31 #148.63 + vrcp14pd %zmm31, %zmm16 #159.38 + vcmppd $1, %zmm10, %zmm31, %k5 #158.22 + vfpclasspd $30, %zmm16, %k0 #159.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm16, %zmm31 #159.38 + knotw %k0, %k4 #159.38 + vmulpd %zmm31, %zmm31, %zmm27 #159.38 + vfmadd213pd %zmm16, %zmm31, %zmm16{%k4} #159.38 + vfmadd213pd %zmm16, %zmm27, %zmm16{%k4} #159.38 + vmulpd %zmm9, %zmm16, %zmm28 #160.38 + vmulpd %zmm7, %zmm16, %zmm24 #161.54 + vmulpd %zmm28, %zmm16, %zmm30 #160.44 + vmulpd %zmm30, %zmm16, %zmm29 #160.50 + vfmsub213pd %zmm5, %zmm30, %zmm16 #161.54 + vmulpd %zmm24, %zmm29, %zmm31 #161.61 + vmulpd %zmm16, %zmm31, %zmm24 #161.67 + vfmadd231pd %zmm22, %zmm24, %zmm8{%k5} #162.17 + vfmadd231pd %zmm19, %zmm24, %zmm6{%k5} #163.17 + vfmadd231pd %zmm17, %zmm24, %zmm3{%k5} #164.17 + kandw %k6, %k5, %k5 #167.24 + vmovaps %zmm15, %zmm16 #168.21 + kmovw %k5, %k7 #168.21 + kmovw %k5, %k1 #168.21 + kmovw %k5, %k2 #169.21 + kmovw %k5, %k3 #169.21 + kmovw %k5, %k4 #170.21 + vgatherdpd (%rdx,%ymm20,8), %zmm16{%k7} #168.21 + vfnmadd213pd %zmm16, %zmm24, %zmm22 #168.21 + vscatterdpd %zmm22, (%rdx,%ymm20,8){%k1} #168.21 + vmovaps %zmm15, %zmm20 #169.21 + vgatherdpd (%rdx,%ymm21,8), %zmm20{%k2} #169.21 + vfnmadd213pd %zmm20, %zmm24, %zmm19 #169.21 + vscatterdpd %zmm19, (%rdx,%ymm21,8){%k3} #169.21 + vmovaps %zmm15, %zmm19 #170.21 + vgatherdpd (%rdx,%ymm18,8), %zmm19{%k4} #170.21 + vfnmadd213pd %zmm19, %zmm24, %zmm17 #170.21 + vscatterdpd %zmm17, (%rdx,%ymm18,8){%k5} #170.21 + cmpl %r15d, %r14d #143.9 + jb ..B2.21 # Prob 82% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 zmm23 zmm25 zmm26 +..B2.22: # Preds ..B2.21 + # Execution count [2.25e+00] + movq 32(%rsp), %rdi #[spill] + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.23: # Preds ..B2.22 ..B2.19 ..B2.35 + # Execution count [2.50e+00] + lea 1(%r15), %r14d #143.9 + cmpl %r13d, %r14d #143.9 + ja ..B2.25 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.24: # Preds ..B2.23 + # Execution count [1.25e+01] + imulq %r8, %rdi #125.43 + vbroadcastsd %xmm0, %zmm24 #128.23 + vbroadcastsd %xmm1, %zmm22 #127.23 + vbroadcastsd %xmm2, %zmm26 #129.23 + movl %r13d, %r14d #143.9 + addq %r10, %rdi #107.5 + subl %r15d, %r14d #143.9 + vpbroadcastd %r14d, %ymm20 #143.9 + vpcmpud $1, %ymm20, %ymm14, %k5 #143.9 + movslq %r15d, %r15 #143.9 + kmovw %k5, %k2 #145.36 + kmovw %k5, %k3 #145.36 + kmovw %k5, %k1 #145.36 + vmovdqu32 (%rdi,%r15,4), %ymm19{%k5}{z} #144.21 + vpaddd %ymm19, %ymm19, %ymm21 #145.36 + vpcmpgtd %ymm19, %ymm4, %k7 #167.24 + vpaddd %ymm21, %ymm19, %ymm18 #145.36 + vmovaps %zmm15, %zmm19 #168.21 + vpxord %zmm25, %zmm25, %zmm25 #145.36 + vpxord %zmm23, %zmm23, %zmm23 #145.36 + vpxord %zmm27, %zmm27, %zmm27 #145.36 + vpaddd %ymm18, %ymm12, %ymm16 #146.36 + vpaddd %ymm18, %ymm11, %ymm0 #147.36 + vgatherdpd 8(%r11,%ymm18,8), %zmm25{%k2} #145.36 + vgatherdpd (%r11,%ymm18,8), %zmm23{%k3} #145.36 + vgatherdpd 16(%r11,%ymm18,8), %zmm27{%k1} #145.36 + vsubpd %zmm25, %zmm24, %zmm1 #146.36 + vsubpd %zmm23, %zmm22, %zmm17 #145.36 + vsubpd %zmm27, %zmm26, %zmm2 #147.36 + vmulpd %zmm1, %zmm1, %zmm21 #148.49 + vfmadd231pd %zmm17, %zmm17, %zmm21 #148.49 + vfmadd231pd %zmm2, %zmm2, %zmm21 #148.63 + vrcp14pd %zmm21, %zmm20 #159.38 + vcmppd $1, %zmm10, %zmm21, %k6{%k5} #158.22 + vfpclasspd $30, %zmm20, %k0 #159.38 + vmovaps %zmm21, %zmm28 #159.38 + vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm20, %zmm28 #159.38 + knotw %k0, %k4 #159.38 + vmulpd %zmm28, %zmm28, %zmm29 #159.38 + vfmadd213pd %zmm20, %zmm28, %zmm20{%k4} #159.38 + vfmadd213pd %zmm20, %zmm29, %zmm20{%k4} #159.38 + vmulpd %zmm9, %zmm20, %zmm30 #160.38 + vmulpd %zmm7, %zmm20, %zmm28 #161.54 + vmulpd %zmm30, %zmm20, %zmm29 #160.44 + vmulpd %zmm29, %zmm20, %zmm31 #160.50 + vfmsub213pd %zmm5, %zmm29, %zmm20 #161.54 + vmulpd %zmm28, %zmm31, %zmm30 #161.61 + vmulpd %zmm20, %zmm30, %zmm22 #161.67 + vfmadd231pd %zmm17, %zmm22, %zmm8{%k6} #162.17 + vfmadd231pd %zmm1, %zmm22, %zmm6{%k6} #163.17 + vfmadd231pd %zmm2, %zmm22, %zmm3{%k6} #164.17 + kandw %k7, %k6, %k6 #167.24 + kmovw %k6, %k1 #168.21 + kmovw %k6, %k2 #168.21 + kmovw %k6, %k3 #169.21 + kmovw %k6, %k4 #169.21 + kmovw %k6, %k5 #170.21 + vgatherdpd (%rdx,%ymm18,8), %zmm19{%k1} #168.21 + vfnmadd213pd %zmm19, %zmm22, %zmm17 #168.21 + vscatterdpd %zmm17, (%rdx,%ymm18,8){%k2} #168.21 + vmovaps %zmm15, %zmm17 #169.21 + vgatherdpd (%rdx,%ymm16,8), %zmm17{%k3} #169.21 + vfnmadd213pd %zmm17, %zmm22, %zmm1 #169.21 + vscatterdpd %zmm1, (%rdx,%ymm16,8){%k4} #169.21 + vmovaps %zmm15, %zmm1 #170.21 + vgatherdpd (%rdx,%ymm0,8), %zmm1{%k5} #170.21 + vfnmadd213pd %zmm1, %zmm22, %zmm2 #170.21 + vscatterdpd %zmm2, (%rdx,%ymm0,8){%k6} #170.21 + # LOE rax rdx rcx rbx r8 r9 r10 r11 r12d r13d ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.25: # Preds ..B2.17 ..B2.24 ..B2.23 + # Execution count [2.25e+00] + vmovups .L_2il0floatpacket.6(%rip), %zmm22 #132.22 + vpermd %zmm3, %zmm22, %zmm0 #132.22 + vpermd %zmm6, %zmm22, %zmm17 #131.22 + vpermd %zmm8, %zmm22, %zmm23 #130.22 + vaddpd %zmm3, %zmm0, %zmm3 #132.22 + vaddpd %zmm6, %zmm17, %zmm6 #131.22 + vaddpd %zmm8, %zmm23, %zmm8 #130.22 + vpermpd $78, %zmm3, %zmm1 #132.22 + vpermpd $78, %zmm6, %zmm18 #131.22 + vpermpd $78, %zmm8, %zmm24 #130.22 + vaddpd %zmm1, %zmm3, %zmm2 #132.22 + vaddpd %zmm18, %zmm6, %zmm19 #131.22 + vaddpd %zmm24, %zmm8, %zmm25 #130.22 + vpermpd $177, %zmm2, %zmm16 #132.22 + vpermpd $177, %zmm19, %zmm20 #131.22 + vpermpd $177, %zmm25, %zmm26 #130.22 + vaddpd %zmm16, %zmm2, %zmm3 #132.22 + vaddpd %zmm20, %zmm19, %zmm21 #131.22 + vaddpd %zmm26, %zmm25, %zmm27 #130.22 + # LOE rax rdx rcx rbx r8 r9 r10 r11 r12d r13d xmm3 xmm21 xmm27 ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 +..B2.26: # Preds ..B2.25 ..B2.8 ..B2.7 + # Execution count [5.00e+00] + movslq %r13d, %r13 #179.9 + vaddsd (%rax,%rdx), %xmm27, %xmm0 #175.9 + vaddsd 8(%rax,%rdx), %xmm21, %xmm1 #176.9 + vaddsd 16(%rax,%rdx), %xmm3, %xmm2 #177.9 + vmovsd %xmm0, (%rax,%rdx) #175.9 + lea 7(%r13), %edi #180.9 + sarl $2, %edi #180.9 + addq %r13, %rcx #179.9 + shrl $29, %edi #180.9 + vmovsd %xmm1, 8(%rax,%rdx) #176.9 + vmovsd %xmm2, 16(%rax,%rdx) #177.9 + addq $24, %rax #124.5 + lea 7(%rdi,%r13), %r14d #180.9 + movslq %r9d, %rdi #124.32 + sarl $3, %r14d #180.9 + incq %r9 #124.5 + movslq %r14d, %r14 #180.9 + incq %rdi #124.32 + addq %r14, %rbx #180.9 + cmpq 64(%rsp), %r9 #124.5[spill] + jb ..B2.7 # Prob 82% #124.5 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d ymm4 ymm11 ymm12 ymm13 ymm14 zmm5 zmm7 zmm9 zmm10 zmm15 +..B2.27: # Preds ..B2.26 + # Execution count [9.00e-01] + movq (%rsp), %r13 #[spill] + movq %rcx, (%r13) #179.9 + movq %rbx, 8(%r13) #180.9 + jmp ..B2.30 # Prob 100% #180.9 + # LOE r12d +..B2.28: # Preds ..B2.1 + # Execution count [5.00e-01] + xorl %r12d, %r12d #120.22 + xorl %eax, %eax #121.16 +..___tag_value_computeForceLJHalfNeigh.96: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.97: + # LOE r12d xmm0 +..B2.46: # Preds ..B2.28 + # Execution count [5.00e-01] + vmovsd %xmm0, 8(%rsp) #121.16[spill] + # LOE r12d +..B2.29: # Preds ..B2.46 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.99: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.100: + # LOE r12d +..B2.30: # Preds ..B2.27 ..B2.29 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #183.5 + vzeroupper #183.5 +..___tag_value_computeForceLJHalfNeigh.101: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #183.5 +..___tag_value_computeForceLJHalfNeigh.102: + # LOE r12d +..B2.31: # Preds ..B2.30 + # Execution count [1.00e+00] + xorl %eax, %eax #184.16 +..___tag_value_computeForceLJHalfNeigh.103: +# getTimeStamp() + call getTimeStamp #184.16 +..___tag_value_computeForceLJHalfNeigh.104: + # LOE r12d xmm0 +..B2.32: # Preds ..B2.31 + # Execution count [1.00e+00] + vxorpd %xmm4, %xmm4, %xmm4 #185.5 + movl $.L_2__STRING.2, %edi #185.5 + vmovsd .L_2il0floatpacket.9(%rip), %xmm3 #185.5 + movl %r12d, %esi #185.5 + movq 16(%rsp), %rax #185.74[spill] + vsubsd 8(%rsp), %xmm0, %xmm1 #185.94[spill] + vmovsd 264(%rax), %xmm7 #185.74 + movl $3, %eax #185.5 + vcvtusi2sdl %r12d, %xmm4, %xmm4 #185.5 + vdivsd %xmm4, %xmm3, %xmm5 #185.5 + vmulsd %xmm1, %xmm5, %xmm6 #185.5 + vmulsd %xmm7, %xmm6, %xmm2 #185.5 + vmovapd %xmm7, %xmm0 #185.5 + vmovsd %xmm1, (%rsp) #185.5[spill] +..___tag_value_computeForceLJHalfNeigh.107: +# printf(const char *__restrict__, ...) + call printf #185.5 +..___tag_value_computeForceLJHalfNeigh.108: + # LOE +..B2.33: # Preds ..B2.32 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm1 #[spill] + vmovapd %xmm1, %xmm0 #186.14 + addq $88, %rsp #186.14 + .cfi_restore 3 + popq %rbx #186.14 + .cfi_restore 15 + popq %r15 #186.14 + .cfi_restore 14 + popq %r14 #186.14 + .cfi_restore 13 + popq %r13 #186.14 + .cfi_restore 12 + popq %r12 #186.14 + movq %rbp, %rsp #186.14 + popq %rbp #186.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #186.14 + .cfi_def_cfa 6, 16 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_offset 6, -16 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + # LOE +..B2.34: # Preds ..B2.10 + # Execution count [2.25e-01]: Infreq + movl %r13d, %r15d #143.9 + xorl %r14d, %r14d #143.9 + andl $-8, %r15d #143.9 + jmp ..B2.19 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r14d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.35: # Preds ..B2.9 + # Execution count [2.25e-01]: Infreq + xorl %r15d, %r15d #143.9 + jmp ..B2.23 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rdi r8 r9 r10 r11 r12d r13d r15d xmm0 xmm1 xmm2 ymm4 ymm11 ymm12 ymm13 ymm14 zmm3 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm15 +..B2.36: # Preds ..B2.2 + # Execution count [1.00e+00]: Infreq + cmpl $8, %esi #114.5 + jl ..B2.42 # Prob 10% #114.5 + # LOE rbx rdi r13 esi r15d +..B2.37: # Preds ..B2.36 + # Execution count [1.00e+00]: Infreq + movl %esi, %eax #114.5 + xorl %ecx, %ecx #114.5 + andl $-8, %eax #114.5 + movslq %eax, %rdx #114.5 + vpxord %zmm0, %zmm0, %zmm0 #114.5 + # LOE rdx rcx rbx rdi r13 eax esi r15d zmm0 +..B2.38: # Preds ..B2.38 ..B2.37 + # Execution count [5.56e+00]: Infreq + vmovupd %zmm0, (%rdi,%rcx,8) #115.9 + addq $8, %rcx #114.5 + cmpq %rdx, %rcx #114.5 + jb ..B2.38 # Prob 82% #114.5 + # LOE rdx rcx rbx rdi r13 eax esi r15d zmm0 +..B2.40: # Preds ..B2.38 ..B2.42 + # Execution count [1.11e+00]: Infreq + lea 1(%rax), %edx #114.5 + cmpl %esi, %edx #114.5 + ja ..B2.48 # Prob 50% #114.5 + # LOE rbx rdi r13 eax esi r15d +..B2.41: # Preds ..B2.40 + # Execution count [5.56e+00]: Infreq + subl %eax, %esi #114.5 + vpbroadcastd %esi, %ymm0 #114.5 + vpcmpgtd .L_2il0floatpacket.0(%rip), %ymm0, %k1 #114.5 + movslq %eax, %rax #114.5 + movslq %r15d, %r14 #114.5 + vpxord %zmm1, %zmm1, %zmm1 #115.9 + vmovupd %zmm1, (%rdi,%rax,8){%k1} #115.9 + jmp ..B2.4 # Prob 100% #115.9 + # LOE rbx r13 r14 r15d +..B2.42: # Preds ..B2.36 + # Execution count [1.00e-01]: Infreq + xorl %eax, %eax #114.5 + jmp ..B2.40 # Prob 100% #114.5 + # LOE rbx rdi r13 eax esi r15d +..B2.48: # Preds ..B2.40 + # Execution count [5.56e-01]: Infreq + movslq %r15d, %r14 #114.5 + jmp ..B2.4 # Prob 100% #114.5 + .align 16,0x90 + # LOE rbx r13 r14 r15d + .cfi_endproc +# mark_end; + .type computeForceLJHalfNeigh,@function + .size computeForceLJHalfNeigh,.-computeForceLJHalfNeigh +..LNcomputeForceLJHalfNeigh.1: + .data +# -- End computeForceLJHalfNeigh + .text +.L_2__routine_start_computeForceLJFullNeigh_simd_2: +# -- Begin computeForceLJFullNeigh_simd + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_simd +# --- computeForceLJFullNeigh_simd(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_simd: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B3.1: # Preds ..B3.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_simd.126: +..L127: + #189.101 + pushq %rbp #189.101 + .cfi_def_cfa_offset 16 + movq %rsp, %rbp #189.101 + .cfi_def_cfa 6, 16 + .cfi_offset 6, -16 + andq $-64, %rsp #189.101 + pushq %r12 #189.101 + pushq %r13 #189.101 + pushq %r14 #189.101 + pushq %r15 #189.101 + pushq %rbx #189.101 + subq $216, %rsp #189.101 + .cfi_escape 0x10, 0x03, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xd8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xf0, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe8, 0xff, 0xff, 0xff, 0x22 + .cfi_escape 0x10, 0x0f, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x22 + movq %rdi, %rbx #189.101 + movq %rsi, %r13 #189.101 + movq %rdx, %r12 #189.101 + vmovsd 144(%rbx), %xmm0 #192.27 + vmovsd 56(%rbx), %xmm1 #193.23 + vmovsd 40(%rbx), %xmm2 #194.24 + movl 4(%r13), %r14d #190.18 + vmovsd %xmm0, 16(%rsp) #192.27[spill] + vmovsd %xmm1, 8(%rsp) #193.23[spill] + vmovsd %xmm2, (%rsp) #194.24[spill] + testl %r14d, %r14d #196.24 + jle ..B3.9 # Prob 50% #196.24 + # LOE rbx r12 r13 r14d +..B3.2: # Preds ..B3.1 + # Execution count [1.00e+00] + movl %r14d, %ecx #196.5 + xorl %edx, %edx #196.5 + movl $1, %esi #196.5 + xorl %eax, %eax #196.5 + shrl $1, %ecx #196.5 + je ..B3.6 # Prob 9% #196.5 + # LOE rax rdx rcx rbx r12 r13 esi r14d +..B3.3: # Preds ..B3.2 + # Execution count [9.00e-01] + xorl %r15d, %r15d #196.5 + .align 16,0x90 + # LOE rax rdx rcx rbx r12 r13 r15 r14d +..B3.4: # Preds ..B3.4 ..B3.3 + # Execution count [2.50e+00] + movq 64(%r13), %rsi #197.9 + incq %rdx #196.5 + movq %r15, (%rsi,%rax) #197.9 + movq 64(%r13), %rdi #198.9 + movq %r15, 8(%rdi,%rax) #198.9 + movq 64(%r13), %r8 #199.9 + movq %r15, 16(%r8,%rax) #199.9 + movq 64(%r13), %r9 #197.9 + movq %r15, 24(%r9,%rax) #197.9 + movq 64(%r13), %r10 #198.9 + movq %r15, 32(%r10,%rax) #198.9 + movq 64(%r13), %r11 #199.9 + movq %r15, 40(%r11,%rax) #199.9 + addq $48, %rax #196.5 + cmpq %rcx, %rdx #196.5 + jb ..B3.4 # Prob 63% #196.5 + # LOE rax rdx rcx rbx r12 r13 r15 r14d +..B3.5: # Preds ..B3.4 + # Execution count [9.00e-01] + lea 1(%rdx,%rdx), %esi #197.9 + # LOE rbx r12 r13 esi r14d +..B3.6: # Preds ..B3.5 ..B3.2 + # Execution count [1.00e+00] + lea -1(%rsi), %eax #196.5 + cmpl %r14d, %eax #196.5 + jae ..B3.9 # Prob 9% #196.5 + # LOE rbx r12 r13 esi r14d +..B3.7: # Preds ..B3.6 + # Execution count [9.00e-01] + movslq %esi, %rsi #197.9 + xorl %ecx, %ecx #197.9 + movq 64(%r13), %rax #197.9 + lea (%rsi,%rsi,2), %r8 #197.9 + movq %rcx, -24(%rax,%r8,8) #197.9 + movq 64(%r13), %rdx #198.9 + movq %rcx, -16(%rdx,%r8,8) #198.9 + movq 64(%r13), %rdi #199.9 + movq %rcx, -8(%rdi,%r8,8) #199.9 + # LOE rbx r12 r13 r14d +..B3.9: # Preds ..B3.7 ..B3.6 ..B3.1 + # Execution count [5.00e-01] + xorl %eax, %eax #203.16 +..___tag_value_computeForceLJFullNeigh_simd.139: +# getTimeStamp() + call getTimeStamp #203.16 +..___tag_value_computeForceLJFullNeigh_simd.140: + # LOE rbx r12 r13 r14d xmm0 +..B3.26: # Preds ..B3.9 + # Execution count [5.00e-01] + vmovsd %xmm0, 192(%rsp) #203.16[spill] + # LOE rbx r12 r13 r14d +..B3.10: # Preds ..B3.26 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #204.5 + xorl %r15d, %r15d #204.5 +..___tag_value_computeForceLJFullNeigh_simd.142: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #204.5 +..___tag_value_computeForceLJFullNeigh_simd.143: + # LOE rbx r12 r13 r14d r15d +..B3.11: # Preds ..B3.10 + # Execution count [1.00e+00] + vmovsd 16(%rsp), %xmm0 #210.36[spill] + xorl %edi, %edi #217.9 + vmulsd %xmm0, %xmm0, %xmm1 #210.36 + xorl %r11d, %r11d #216.5 + vbroadcastsd 8(%rsp), %zmm10 #211.32[spill] + vbroadcastsd (%rsp), %zmm9 #212.29[spill] + vbroadcastsd %xmm1, %zmm11 #210.36 + vbroadcastsd .L_2il0floatpacket.3(%rip), %zmm8 #213.29 + vbroadcastsd .L_2il0floatpacket.10(%rip), %zmm13 #214.29 + xorl %edx, %edx #217.9 + testl %r14d, %r14d #216.24 + jle ..B3.19 # Prob 9% #216.24 + # LOE rdx rbx rdi r12 r13 r11d r14d r15d zmm8 zmm9 zmm10 zmm11 zmm13 +..B3.12: # Preds ..B3.11 + # Execution count [9.00e-01] + vmovdqu .L_2il0floatpacket.11(%rip), %ymm12 #230.101 + vmovups .L_2il0floatpacket.6(%rip), %zmm0 #253.23 + vpxord %zmm1, %zmm1, %zmm1 #223.29 + # LOE rdx rbx rdi r12 r13 r11d r14d r15d ymm12 zmm0 zmm1 zmm8 zmm9 zmm10 zmm11 zmm13 +..B3.13: # Preds ..B3.17 ..B3.12 + # Execution count [5.00e+00] + movl %r11d, %r8d #217.43 + xorl %r10d, %r10d #228.9 + imull 8(%r12), %r8d #217.43 + movslq %r8d, %r8 #217.19 + movq 24(%r12), %rcx #218.25 + movq 16(%r12), %rax #217.19 + movq 16(%r13), %r9 #220.45 + vmovaps %zmm1, %zmm3 #223.29 + vmovaps %zmm3, %zmm2 #224.29 + lea (%rax,%r8,4), %r8 #217.19 + movl (%rcx,%rdi,4), %ecx #218.25 + addl %ecx, %r15d #227.9 + vmovaps %zmm2, %zmm7 #225.29 + xorl %eax, %eax #230.78 + vpbroadcastd %ecx, %ymm6 #219.37 + vbroadcastsd (%r9,%rdx,8), %zmm5 #220.30 + vbroadcastsd 8(%r9,%rdx,8), %zmm4 #221.30 + vbroadcastsd 16(%r9,%rdx,8), %zmm14 #222.30 + testl %ecx, %ecx #228.28 + jle ..B3.17 # Prob 10% #228.28 + # LOE rdx rbx rdi r8 r9 r12 r13 eax ecx r10d r11d r14d r15d ymm6 ymm12 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 +..B3.14: # Preds ..B3.13 + # Execution count [4.50e+00] + addl $7, %ecx #218.25 + shrl $3, %ecx #218.25 + # LOE rdx rbx rdi r8 r9 r12 r13 eax ecx r10d r11d r14d r15d ymm6 ymm12 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 +..B3.15: # Preds ..B3.15 ..B3.14 + # Execution count [2.50e+01] + vpbroadcastd %eax, %ymm15 #230.78 + incl %r10d #228.9 + vpcmpeqb %xmm0, %xmm0, %k4 #236.41 + vpcmpeqb %xmm0, %xmm0, %k3 #235.41 + vpcmpeqb %xmm0, %xmm0, %k2 #234.41 + vpaddd %ymm12, %ymm15, %ymm16 #230.65 + vpcmpgtd %ymm16, %ymm6, %k1 #230.43 + movslq %eax, %rax #231.29 + kmovw %k1, %esi #230.43 + kmovb %esi, %k5 #243.40 + vmovdqu32 (%r8,%rax,4), %ymm18{%k1}{z} #231.29 + addl $8, %eax #228.9 + vpaddd %ymm18, %ymm18, %ymm17 #233.43 + vpaddd %ymm18, %ymm17, %ymm19 #233.30 + vpxord %zmm22, %zmm22, %zmm22 #236.41 + vpxord %zmm21, %zmm21, %zmm21 #235.41 + vpxord %zmm20, %zmm20, %zmm20 #234.41 + vgatherdpd 16(%r9,%ymm19,8), %zmm22{%k4} #236.41 + vgatherdpd 8(%r9,%ymm19,8), %zmm21{%k3} #235.41 + vgatherdpd (%r9,%ymm19,8), %zmm20{%k2} #234.41 + vsubpd %zmm22, %zmm14, %zmm16 #236.41 + vsubpd %zmm21, %zmm4, %zmm15 #235.41 + vsubpd %zmm20, %zmm5, %zmm31 #234.41 + vmulpd %zmm16, %zmm16, %zmm23 #242.75 + vfmadd231pd %zmm15, %zmm15, %zmm23 #242.54 + vfmadd231pd %zmm31, %zmm31, %zmm23 #242.33 + vrcp14pd %zmm23, %zmm25 #244.33 + vcmppd $17, %zmm11, %zmm23, %k0 #243.70 + vmulpd %zmm10, %zmm25, %zmm24 #245.61 + vmulpd %zmm9, %zmm25, %zmm27 #246.100 + kmovw %k0, %esi #243.70 + vmulpd %zmm24, %zmm25, %zmm26 #245.47 + vmulpd %zmm26, %zmm25, %zmm28 #245.33 + vfmsub213pd %zmm13, %zmm25, %zmm26 #246.76 + vmulpd %zmm27, %zmm26, %zmm29 #246.67 + vmulpd %zmm29, %zmm28, %zmm30 #246.53 + vmulpd %zmm30, %zmm8, %zmm23 #246.35 + kmovb %esi, %k6 #243.40 + kandb %k6, %k5, %k7 #243.40 + kmovb %k7, %esi #243.40 + kmovw %esi, %k1 #248.19 + vfmadd231pd %zmm31, %zmm23, %zmm3{%k1} #248.19 + vfmadd231pd %zmm15, %zmm23, %zmm2{%k1} #249.19 + vfmadd231pd %zmm16, %zmm23, %zmm7{%k1} #250.19 + cmpl %ecx, %r10d #228.9 + jb ..B3.15 # Prob 82% #228.9 + # LOE rdx rbx rdi r8 r9 r12 r13 eax ecx r10d r11d r14d r15d ymm6 ymm12 zmm0 zmm1 zmm2 zmm3 zmm4 zmm5 zmm7 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 +..B3.17: # Preds ..B3.15 ..B3.13 + # Execution count [5.00e+00] + vpermd %zmm3, %zmm0, %zmm4 #253.23 + incl %r11d #216.5 + vpermd %zmm2, %zmm0, %zmm16 #254.23 + vpermd %zmm7, %zmm0, %zmm21 #255.23 + vaddpd %zmm3, %zmm4, %zmm5 #253.23 + vaddpd %zmm2, %zmm16, %zmm17 #254.23 + vaddpd %zmm7, %zmm21, %zmm22 #255.23 + vshuff64x2 $17, %zmm5, %zmm5, %zmm3 #253.23 + vshuff64x2 $17, %zmm17, %zmm17, %zmm2 #254.23 + vshuff64x2 $17, %zmm22, %zmm22, %zmm7 #255.23 + vaddpd %zmm5, %zmm3, %zmm14 #253.23 + vaddpd %zmm17, %zmm2, %zmm19 #254.23 + vaddpd %zmm22, %zmm7, %zmm24 #255.23 + vpermilpd $1, %zmm14, %zmm6 #253.23 + incq %rdi #216.5 + vaddpd %zmm14, %zmm6, %zmm15 #253.23 + vmovups %zmm15, (%rsp) #253.23 + movq 64(%r13), %rax #253.9 + vpermilpd $1, %zmm19, %zmm18 #254.23 + vaddpd %zmm19, %zmm18, %zmm20 #254.23 + vmovsd (%rax,%rdx,8), %xmm26 #253.9 + vaddsd (%rsp), %xmm26, %xmm27 #253.9 + vmovups %zmm20, 64(%rsp) #254.23 + vmovsd %xmm27, (%rax,%rdx,8) #253.9 + movq 64(%r13), %rcx #254.9 + vpermilpd $1, %zmm24, %zmm23 #255.23 + vaddpd %zmm24, %zmm23, %zmm25 #255.23 + vmovsd 8(%rcx,%rdx,8), %xmm28 #254.9 + vaddsd 64(%rsp), %xmm28, %xmm29 #254.9 + vmovups %zmm25, 128(%rsp) #255.23 + vmovsd %xmm29, 8(%rcx,%rdx,8) #254.9 + movq 64(%r13), %r8 #255.9 + vmovsd 16(%r8,%rdx,8), %xmm30 #255.9 + vaddsd 128(%rsp), %xmm30, %xmm31 #255.9 + vmovsd %xmm31, 16(%r8,%rdx,8) #255.9 + addq $3, %rdx #216.5 + cmpl %r14d, %r11d #216.5 + jb ..B3.13 # Prob 82% #216.5 + # LOE rdx rbx rdi r12 r13 r11d r14d r15d ymm12 zmm0 zmm1 zmm8 zmm9 zmm10 zmm11 zmm13 +..B3.19: # Preds ..B3.17 ..B3.11 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #259.5 + vzeroupper #259.5 +..___tag_value_computeForceLJFullNeigh_simd.147: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #259.5 +..___tag_value_computeForceLJFullNeigh_simd.148: + # LOE rbx r15d +..B3.20: # Preds ..B3.19 + # Execution count [1.00e+00] + xorl %eax, %eax #260.16 +..___tag_value_computeForceLJFullNeigh_simd.149: +# getTimeStamp() + call getTimeStamp #260.16 +..___tag_value_computeForceLJFullNeigh_simd.150: + # LOE rbx r15d xmm0 +..B3.21: # Preds ..B3.20 + # Execution count [1.00e+00] + vxorpd %xmm4, %xmm4, %xmm4 #261.5 + movl $.L_2__STRING.3, %edi #261.5 + vmovsd .L_2il0floatpacket.9(%rip), %xmm3 #261.5 + movl %r15d, %esi #261.5 + vmovsd 264(%rbx), %xmm7 #261.68 + movl $3, %eax #261.5 + vsubsd 192(%rsp), %xmm0, %xmm1 #261.88[spill] + vcvtusi2sdl %r15d, %xmm4, %xmm4 #261.5 + vdivsd %xmm4, %xmm3, %xmm5 #261.5 + vmulsd %xmm1, %xmm5, %xmm6 #261.5 + vmulsd %xmm7, %xmm6, %xmm2 #261.5 + vmovapd %xmm7, %xmm0 #261.5 + vmovsd %xmm1, (%rsp) #261.5[spill] +..___tag_value_computeForceLJFullNeigh_simd.152: +# printf(const char *__restrict__, ...) + call printf #261.5 +..___tag_value_computeForceLJFullNeigh_simd.153: + # LOE +..B3.22: # Preds ..B3.21 + # Execution count [1.00e+00] + vmovsd (%rsp), %xmm1 #[spill] + vmovapd %xmm1, %xmm0 #262.14 + addq $216, %rsp #262.14 + .cfi_restore 3 + popq %rbx #262.14 + .cfi_restore 15 + popq %r15 #262.14 + .cfi_restore 14 + popq %r14 #262.14 + .cfi_restore 13 + popq %r13 #262.14 + .cfi_restore 12 + popq %r12 #262.14 + movq %rbp, %rsp #262.14 + popq %rbp #262.14 + .cfi_def_cfa 7, 8 + .cfi_restore 6 + ret #262.14 + .align 16,0x90 + # LOE + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_simd,@function + .size computeForceLJFullNeigh_simd,.-computeForceLJFullNeigh_simd +..LNcomputeForceLJFullNeigh_simd.2: + .data +# -- End computeForceLJFullNeigh_simd + .section .rodata, "a" + .align 64 + .align 64 +.L_2il0floatpacket.2: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,64 + .align 64 +.L_2il0floatpacket.4: + .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000,0x00000000,0x3fe00000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,64 + .align 64 +.L_2il0floatpacket.6: + .long 0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f,0x00000008,0x00000009,0x0000000a,0x0000000b,0x0000000c,0x0000000d,0x0000000e,0x0000000f + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,64 + .align 32 +.L_2il0floatpacket.0: + .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,32 + .align 32 +.L_2il0floatpacket.1: + .long 0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008,0x00000008 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,32 + .align 32 +.L_2il0floatpacket.7: + .long 0x00000001,0x00000001,0x00000001,0x00000001,0x00000001,0x00000001,0x00000001,0x00000001 + .type .L_2il0floatpacket.7,@object + .size .L_2il0floatpacket.7,32 + .align 32 +.L_2il0floatpacket.8: + .long 0x00000002,0x00000002,0x00000002,0x00000002,0x00000002,0x00000002,0x00000002,0x00000002 + .type .L_2il0floatpacket.8,@object + .size .L_2il0floatpacket.8,32 + .align 32 +.L_2il0floatpacket.11: + .long 0x00000000,0x00000001,0x00000002,0x00000003,0x00000004,0x00000005,0x00000006,0x00000007 + .type .L_2il0floatpacket.11,@object + .size .L_2il0floatpacket.11,32 + .align 8 +.L_2il0floatpacket.3: + .long 0x00000000,0x40480000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,8 + .align 8 +.L_2il0floatpacket.5: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,8 + .align 8 +.L_2il0floatpacket.9: + .long 0x00000000,0x41cdcd65 + .type .L_2il0floatpacket.9,@object + .size .L_2il0floatpacket.9,8 + .align 8 +.L_2il0floatpacket.10: + .long 0x00000000,0x3fe00000 + .type .L_2il0floatpacket.10,@object + .size .L_2il0floatpacket.10,8 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +.L_2__STRING.0: + .long 1668444006 + .word 101 + .type .L_2__STRING.0,@object + .size .L_2__STRING.0,6 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.1: + .long 1668444006 + .long 759843941 + .long 1718378856 + .long 1734960494 + .word 104 + .type .L_2__STRING.1,@object + .size .L_2__STRING.1,18 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.2: + .long 980644937 + .long 544548128 + .long 1701987872 + .long 622869105 + .long 1411391590 + .long 979725673 + .long 174466336 + .long 1764718915 + .long 622869108 + .long 1747460198 + .long 761687137 + .long 1734960494 + .long 665960 + .type .L_2__STRING.2,@object + .size .L_2__STRING.2,52 + .align 4 +.L_2__STRING.3: + .long 980644937 + .long 544548128 + .long 1701987872 + .long 622869105 + .long 1411391590 + .long 979725673 + .long 174466336 + .long 1764718915 + .long 622869108 + .long 1932009574 + .long 694447465 + .word 10 + .type .L_2__STRING.3,@object + .size .L_2__STRING.3,46 + .data + .section .note.GNU-stack, "" +# End diff --git a/static_analysis/jan/icx-icc-lammps-novec.s b/static_analysis/jan/icx-icc-lammps-novec.s new file mode 100644 index 0000000..0b316cc --- /dev/null +++ b/static_analysis/jan/icx-icc-lammps-novec.s @@ -0,0 +1,1310 @@ +# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; +# mark_description "0226_000000"; +# mark_description "-I/apps/likwid/5.2.2/include -I././lammps/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GNU"; +# mark_description "_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=1 -DENABLE_OMP_SIMD -DALIGNMENT="; +# mark_description "64 -restrict -Ofast -no-vec -o build-lammps-ICC-NOVEC-DP/force_lj.s"; + .file "force_lj.c" + .text +..TXTST0: +.L_2__routine_start_computeForceLJFullNeigh_plain_c_0: +# -- Begin computeForceLJFullNeigh_plain_c + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_plain_c +# --- computeForceLJFullNeigh_plain_c(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_plain_c: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_plain_c.1: +..L2: + #23.104 + pushq %r12 #23.104 + .cfi_def_cfa_offset 16 + .cfi_offset 12, -16 + pushq %r13 #23.104 + .cfi_def_cfa_offset 24 + .cfi_offset 13, -24 + pushq %r14 #23.104 + .cfi_def_cfa_offset 32 + .cfi_offset 14, -32 + pushq %r15 #23.104 + .cfi_def_cfa_offset 40 + .cfi_offset 15, -40 + pushq %rbx #23.104 + .cfi_def_cfa_offset 48 + .cfi_offset 3, -48 + pushq %rbp #23.104 + .cfi_def_cfa_offset 56 + .cfi_offset 6, -56 + subq $56, %rsp #23.104 + .cfi_def_cfa_offset 112 + movq %rdi, %rbp #23.104 + movq %rsi, %r15 #23.104 + movq %rcx, %r13 #23.104 + movq %rdx, %r12 #23.104 + movsd 144(%rbp), %xmm0 #27.27 + mulsd %xmm0, %xmm0 #27.45 + movsd 56(%rbp), %xmm1 #28.23 + movsd 40(%rbp), %xmm2 #29.24 + movl 4(%r15), %r14d #24.18 + movsd %xmm0, 32(%rsp) #27.45[spill] + movsd %xmm1, 24(%rsp) #28.23[spill] + movsd %xmm2, 40(%rsp) #29.24[spill] + testl %r14d, %r14d #32.24 + jle ..B1.16 # Prob 50% #32.24 + # LOE rbp r12 r13 r15 r14d +..B1.2: # Preds ..B1.1 + # Execution count [5.00e-03] + movq 64(%r15), %rdi #33.9 + lea (%r14,%r14,2), %eax #24.18 + cmpl $12, %eax #32.5 + jle ..B1.23 # Prob 0% #32.5 + # LOE rbp rdi r12 r13 r15 eax r14d +..B1.3: # Preds ..B1.2 + # Execution count [1.00e+00] + movslq %r14d, %r14 #32.5 + xorl %esi, %esi #32.5 + lea (%r14,%r14,2), %rdx #32.5 + shlq $3, %rdx #32.5 + call _intel_fast_memset #32.5 + # LOE rbp r12 r13 r14 r15 +..B1.5: # Preds ..B1.3 ..B1.28 ..B1.34 + # Execution count [1.00e+00] + xorl %ebx, %ebx #37.22 + xorl %eax, %eax #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.19: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.20: + # LOE rbx rbp r12 r13 r14 r15 xmm0 +..B1.31: # Preds ..B1.5 + # Execution count [1.00e+00] + movsd %xmm0, 16(%rsp) #38.16[spill] + # LOE rbx rbp r12 r13 r14 r15 +..B1.6: # Preds ..B1.31 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.22: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.23: + # LOE rbx rbp r12 r13 r14 r15 +..B1.7: # Preds ..B1.6 + # Execution count [9.00e-01] + xorl %ecx, %ecx #41.15 + movsd .L_2il0floatpacket.0(%rip), %xmm11 #77.41 + xorl %edx, %edx #41.5 + movq 16(%r12), %rax #42.19 + xorl %r11d, %r11d #41.5 + movslq 8(%r12), %rdi #42.43 + movq 24(%r12), %rsi #43.25 + mulsd 40(%rsp), %xmm11 #77.41[spill] + movsd 24(%rsp), %xmm12 #41.5[spill] + movsd 32(%rsp), %xmm13 #41.5[spill] + movsd .L_2il0floatpacket.1(%rip), %xmm1 #77.54 + shlq $2, %rdi #25.5 + movq 16(%r15), %r12 #44.25 + movq 64(%r15), %r8 #89.9 + xorl %r15d, %r15d #41.5 + movq (%r13), %r9 #93.9 + movq 8(%r13), %r10 #94.9 + movq %rbp, 8(%rsp) #41.5[spill] + movq %r13, (%rsp) #41.5[spill] + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 xmm1 xmm11 xmm12 xmm13 +..B1.8: # Preds ..B1.14 ..B1.7 + # Execution count [5.00e+00] + movslq (%rsi,%rdx,4), %r13 #43.25 + movq %r15, %rbp #56.9 + pxor %xmm2, %xmm2 #47.22 + movaps %xmm2, %xmm4 #48.22 + movsd (%r11,%r12), %xmm9 #44.25 + movaps %xmm4, %xmm10 #49.22 + movsd 8(%r11,%r12), %xmm8 #45.25 + movsd 16(%r11,%r12), %xmm5 #46.25 + testq %r13, %r13 #56.28 + jle ..B1.14 # Prob 10% #56.28 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.9: # Preds ..B1.8 + # Execution count [4.50e+00] + imulq %rdi, %rcx #42.43 + addq %rax, %rcx #25.5 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +# LLVM-MCA-BEGIN +# OSACA-BEGIN +..B1.10: # Preds ..B1.12 ..B1.9 + # Execution count [2.50e+01] + movl (%rcx,%rbp,4), %r15d #57.21 + movaps %xmm9, %xmm3 #58.36 + movaps %xmm8, %xmm7 #59.36 + movaps %xmm5, %xmm6 #60.36 + lea (%r15,%r15,2), %r15d #58.36 + movslq %r15d, %r15 #58.36 + subsd (%r12,%r15,8), %xmm3 #58.36 + subsd 8(%r12,%r15,8), %xmm7 #59.36 + subsd 16(%r12,%r15,8), %xmm6 #60.36 + movaps %xmm3, %xmm0 #61.35 + movaps %xmm7, %xmm14 #61.49 + mulsd %xmm3, %xmm0 #61.35 + movaps %xmm6, %xmm15 #61.63 + mulsd %xmm7, %xmm14 #61.49 + mulsd %xmm6, %xmm15 #61.63 + addsd %xmm14, %xmm0 #61.49 + addsd %xmm15, %xmm0 #61.63 + comisd %xmm0, %xmm13 #71.22 + jbe ..B1.12 # Prob 50% #71.22 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.11: # Preds ..B1.10 + # Execution count [1.25e+01] + movsd .L_2il0floatpacket.3(%rip), %xmm14 #75.38 + incl %ebx #73.17 + divsd %xmm0, %xmm14 #75.38 + movaps %xmm12, %xmm0 #76.38 + mulsd %xmm14, %xmm0 #76.38 + mulsd %xmm14, %xmm0 #76.44 + mulsd %xmm14, %xmm0 #76.50 + mulsd %xmm11, %xmm14 #77.54 + mulsd %xmm0, %xmm14 #77.61 + subsd %xmm1, %xmm0 #77.54 + mulsd %xmm0, %xmm14 #77.67 + mulsd %xmm14, %xmm3 #78.31 + mulsd %xmm14, %xmm7 #79.31 + mulsd %xmm14, %xmm6 #80.31 + addsd %xmm3, %xmm2 #78.17 + addsd %xmm7, %xmm4 #79.17 + addsd %xmm6, %xmm10 #80.17 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.12: # Preds ..B1.11 ..B1.10 + # Execution count [2.50e+01] + incq %rbp #56.9 + cmpq %r13, %rbp #56.9 + jb ..B1.10 # Prob 82% #56.9 +# OSACA-END +# LLVM-MCA-END + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 xmm1 xmm2 xmm4 xmm5 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.13: # Preds ..B1.12 + # Execution count [4.50e+00] + xorl %r15d, %r15d # + # LOE rax rdx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 xmm1 xmm2 xmm4 xmm10 xmm11 xmm12 xmm13 +..B1.14: # Preds ..B1.13 ..B1.8 + # Execution count [5.00e+00] + movslq %edx, %rcx #41.32 + incq %rdx #41.5 + addq %r13, %r9 #93.9 + addq %r13, %r10 #94.9 + incq %rcx #41.32 + addsd (%r11,%r8), %xmm2 #89.9 + addsd 8(%r11,%r8), %xmm4 #90.9 + addsd 16(%r11,%r8), %xmm10 #91.9 + movsd %xmm2, (%r11,%r8) #89.9 + movsd %xmm4, 8(%r11,%r8) #90.9 + movsd %xmm10, 16(%r11,%r8) #91.9 + addq $24, %r11 #41.5 + cmpq %r14, %rdx #41.5 + jb ..B1.8 # Prob 82% #41.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r14 r15 xmm1 xmm11 xmm12 xmm13 +..B1.15: # Preds ..B1.14 + # Execution count [9.00e-01] + movq (%rsp), %r13 #[spill] + movq 8(%rsp), %rbp #[spill] + movq %r9, (%r13) #93.9 + movq %r10, 8(%r13) #94.9 + jmp ..B1.19 # Prob 100% #94.9 + # LOE rbx rbp +..B1.16: # Preds ..B1.1 + # Execution count [5.00e-01] + xorl %ebx, %ebx #37.22 + xorl %eax, %eax #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.31: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.32: + # LOE rbx rbp xmm0 +..B1.32: # Preds ..B1.16 + # Execution count [5.00e-01] + movsd %xmm0, 16(%rsp) #38.16[spill] + # LOE rbx rbp +..B1.17: # Preds ..B1.32 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.34: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.35: + # LOE rbx rbp +..B1.19: # Preds ..B1.17 ..B1.15 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.36: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.37: + # LOE rbx rbp +..B1.20: # Preds ..B1.19 + # Execution count [1.00e+00] + xorl %eax, %eax #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.38: +# getTimeStamp() + call getTimeStamp #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.39: + # LOE rbx rbp xmm0 +..B1.33: # Preds ..B1.20 + # Execution count [1.00e+00] + movaps %xmm0, %xmm1 #98.16 + # LOE rbx rbp xmm1 +..B1.21: # Preds ..B1.33 + # Execution count [1.00e+00] + pxor %xmm3, %xmm3 #100.5 + cvtsi2sdq %rbx, %xmm3 #100.5 + subsd 16(%rsp), %xmm1 #100.91[spill] + movsd .L_2il0floatpacket.2(%rip), %xmm2 #100.5 + movl $.L_2__STRING.1, %edi #100.5 + divsd %xmm3, %xmm2 #100.5 + mulsd %xmm1, %xmm2 #100.5 + movl %ebx, %esi #100.5 + movsd 264(%rbp), %xmm0 #100.71 + movl $3, %eax #100.5 + mulsd %xmm0, %xmm2 #100.5 + movsd %xmm1, (%rsp) #100.5[spill] +..___tag_value_computeForceLJFullNeigh_plain_c.41: +# printf(const char *__restrict__, ...) + call printf #100.5 +..___tag_value_computeForceLJFullNeigh_plain_c.42: + # LOE +..B1.22: # Preds ..B1.21 + # Execution count [1.00e+00] + movsd (%rsp), %xmm1 #[spill] + movaps %xmm1, %xmm0 #102.14 + addq $56, %rsp #102.14 + .cfi_def_cfa_offset 56 + .cfi_restore 6 + popq %rbp #102.14 + .cfi_def_cfa_offset 48 + .cfi_restore 3 + popq %rbx #102.14 + .cfi_def_cfa_offset 40 + .cfi_restore 15 + popq %r15 #102.14 + .cfi_def_cfa_offset 32 + .cfi_restore 14 + popq %r14 #102.14 + .cfi_def_cfa_offset 24 + .cfi_restore 13 + popq %r13 #102.14 + .cfi_def_cfa_offset 16 + .cfi_restore 12 + popq %r12 #102.14 + .cfi_def_cfa_offset 8 + ret #102.14 + .cfi_def_cfa_offset 112 + .cfi_offset 3, -48 + .cfi_offset 6, -56 + .cfi_offset 12, -16 + .cfi_offset 13, -24 + .cfi_offset 14, -32 + .cfi_offset 15, -40 + # LOE +..B1.23: # Preds ..B1.2 + # Execution count [1.11e+00]: Infreq + movl %eax, %edx #32.5 + xorl %ebx, %ebx #32.5 + movl $1, %esi #32.5 + xorl %ecx, %ecx #32.5 + shrl $1, %edx #32.5 + je ..B1.27 # Prob 10% #32.5 + # LOE rdx rcx rbx rbp rdi r12 r13 r15 eax esi r14d +..B1.24: # Preds ..B1.23 + # Execution count [1.00e+00]: Infreq + xorl %esi, %esi #32.5 + # LOE rdx rcx rbx rbp rsi rdi r12 r13 r15 eax r14d +..B1.25: # Preds ..B1.25 ..B1.24 + # Execution count [2.78e+00]: Infreq + incq %rbx #32.5 + movq %rsi, (%rcx,%rdi) #33.9 + movq %rsi, 8(%rcx,%rdi) #33.9 + addq $16, %rcx #32.5 + cmpq %rdx, %rbx #32.5 + jb ..B1.25 # Prob 64% #32.5 + # LOE rdx rcx rbx rbp rsi rdi r12 r13 r15 eax r14d +..B1.26: # Preds ..B1.25 + # Execution count [1.00e+00]: Infreq + lea 1(%rbx,%rbx), %esi #33.9 + # LOE rbp rdi r12 r13 r15 eax esi r14d +..B1.27: # Preds ..B1.23 ..B1.26 + # Execution count [1.11e+00]: Infreq + lea -1(%rsi), %edx #32.5 + cmpl %eax, %edx #32.5 + jae ..B1.34 # Prob 10% #32.5 + # LOE rbp rdi r12 r13 r15 esi r14d +..B1.28: # Preds ..B1.27 + # Execution count [1.00e+00]: Infreq + movslq %esi, %rsi #32.5 + movslq %r14d, %r14 #32.5 + movq $0, -8(%rdi,%rsi,8) #33.9 + jmp ..B1.5 # Prob 100% #33.9 + # LOE rbp r12 r13 r14 r15 +..B1.34: # Preds ..B1.27 + # Execution count [1.11e-01]: Infreq + movslq %r14d, %r14 #32.5 + jmp ..B1.5 # Prob 100% #32.5 + .align 16,0x90 + # LOE rbp r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_plain_c,@function + .size computeForceLJFullNeigh_plain_c,.-computeForceLJFullNeigh_plain_c +..LNcomputeForceLJFullNeigh_plain_c.0: + .data +# -- End computeForceLJFullNeigh_plain_c + .text +.L_2__routine_start_computeForceLJHalfNeigh_1: +# -- Begin computeForceLJHalfNeigh + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJHalfNeigh +# --- computeForceLJHalfNeigh(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJHalfNeigh: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B2.1: # Preds ..B2.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJHalfNeigh.66: +..L67: + #105.96 + pushq %r12 #105.96 + .cfi_def_cfa_offset 16 + .cfi_offset 12, -16 + pushq %r13 #105.96 + .cfi_def_cfa_offset 24 + .cfi_offset 13, -24 + pushq %r14 #105.96 + .cfi_def_cfa_offset 32 + .cfi_offset 14, -32 + pushq %r15 #105.96 + .cfi_def_cfa_offset 40 + .cfi_offset 15, -40 + pushq %rbx #105.96 + .cfi_def_cfa_offset 48 + .cfi_offset 3, -48 + pushq %rbp #105.96 + .cfi_def_cfa_offset 56 + .cfi_offset 6, -56 + subq $216, %rsp #105.96 + .cfi_def_cfa_offset 272 + movq %rdi, %r14 #105.96 + movq %rsi, %r15 #105.96 + movq %rcx, %r12 #105.96 + movq %rdx, 32(%rsp) #105.96[spill] + movsd 144(%r14), %xmm0 #109.27 + mulsd %xmm0, %xmm0 #109.45 + movsd 56(%r14), %xmm1 #110.23 + movsd 40(%r14), %xmm2 #111.24 + movl 4(%r15), %ebp #106.18 + movsd %xmm0, 48(%rsp) #109.45[spill] + movsd %xmm1, 40(%rsp) #110.23[spill] + movsd %xmm2, 24(%rsp) #111.24[spill] + testl %ebp, %ebp #114.24 + jle ..B2.51 # Prob 50% #114.24 + # LOE r12 r14 r15 ebp +..B2.2: # Preds ..B2.1 + # Execution count [5.00e-03] + movq 64(%r15), %rdi #115.9 + lea (%rbp,%rbp,2), %ebx #106.18 + cmpl $12, %ebx #114.5 + jle ..B2.59 # Prob 0% #114.5 + # LOE rdi r12 r14 r15 ebx ebp +..B2.3: # Preds ..B2.2 + # Execution count [1.00e+00] + movslq %ebp, %r13 #114.5 + xorl %esi, %esi #114.5 + lea (%r13,%r13,2), %rdx #114.5 + shlq $3, %rdx #114.5 + call _intel_fast_memset #114.5 + # LOE r12 r13 r14 r15 ebp +..B2.5: # Preds ..B2.3 ..B2.64 ..B2.70 + # Execution count [1.00e+00] + xorl %ebx, %ebx #120.22 + xorl %eax, %eax #121.16 +..___tag_value_computeForceLJHalfNeigh.85: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.86: + # LOE r12 r13 r14 r15 ebx ebp xmm0 +..B2.67: # Preds ..B2.5 + # Execution count [1.00e+00] + movsd %xmm0, 16(%rsp) #121.16[spill] + # LOE r12 r13 r14 r15 ebx ebp +..B2.6: # Preds ..B2.67 + # Execution count [5.00e-01] + movl $.L_2__STRING.2, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.88: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.89: + # LOE r12 r13 r14 r15 ebx ebp +..B2.7: # Preds ..B2.6 + # Execution count [9.00e-01] + movsd .L_2il0floatpacket.0(%rip), %xmm13 #161.41 + movd %ebp, %xmm0 #106.18 + mulsd 24(%rsp), %xmm13 #161.41[spill] + xorl %r9d, %r9d #124.15 + movq 32(%rsp), %rdx #125.19[spill] + movaps %xmm13, %xmm2 #161.41 + movsd 40(%rsp), %xmm3 #110.21[spill] + xorl %r8d, %r8d #124.5 + movsd 48(%rsp), %xmm6 #109.25[spill] + xorl %esi, %esi #124.5 + unpcklpd %xmm3, %xmm3 #110.21 + unpcklpd %xmm2, %xmm2 #161.41 + pshufd $0, %xmm0, %xmm0 #106.18 + movq 16(%rdx), %rdi #125.19 + movslq 8(%rdx), %rax #125.43 + movq 24(%rdx), %rcx #126.25 + movq 16(%r15), %rdx #127.25 + movq 64(%r15), %r15 #168.21 + unpcklpd %xmm6, %xmm6 #109.25 + movups .L_2il0floatpacket.7(%rip), %xmm1 #161.54 + movsd .L_2il0floatpacket.1(%rip), %xmm7 #161.54 + shlq $2, %rax #107.5 + movq (%r12), %r10 #179.9 + movq 8(%r12), %r11 #180.9 + movdqu %xmm0, 160(%rsp) #124.5[spill] + movups %xmm2, 192(%rsp) #124.5[spill] + movups %xmm3, 176(%rsp) #124.5[spill] + movq %rdi, 56(%rsp) #124.5[spill] + movl %ebp, 64(%rsp) #124.5[spill] + movq %r14, (%rsp) #124.5[spill] + movq %r12, 8(%rsp) #124.5[spill] + movsd 40(%rsp), %xmm10 #124.5[spill] + movsd 48(%rsp), %xmm12 #124.5[spill] + # LOE rax rdx rcx rsi r8 r9 r10 r11 r13 r15 ebx xmm6 xmm7 xmm10 xmm12 xmm13 +..B2.8: # Preds ..B2.49 ..B2.7 + # Execution count [5.00e+00] + movl (%rcx,%r8,4), %r14d #126.25 + addl %r14d, %ebx #138.9 + pxor %xmm5, %xmm5 #130.22 + movaps %xmm5, %xmm4 #131.22 + movsd (%rsi,%rdx), %xmm9 #127.25 + movaps %xmm4, %xmm0 #132.22 + movsd 8(%rsi,%rdx), %xmm8 #128.25 + movsd 16(%rsi,%rdx), %xmm11 #129.25 + testl %r14d, %r14d #143.9 + jle ..B2.48 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.9: # Preds ..B2.8 + # Execution count [2.50e+00] + jbe ..B2.48 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.10: # Preds ..B2.9 + # Execution count [2.25e+00] + cmpl $2, %r14d #143.9 + jb ..B2.58 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.11: # Preds ..B2.10 + # Execution count [2.25e+00] + movq %rax, %rdi #125.43 + movl %r14d, %r12d #143.9 + imulq %r9, %rdi #125.43 + pxor %xmm5, %xmm5 #130.22 + movaps %xmm9, %xmm1 #127.23 + movaps %xmm5, %xmm4 #131.22 + movaps %xmm8, %xmm2 #128.23 + movaps %xmm11, %xmm3 #129.23 + andl $-2, %r12d #143.9 + movsd %xmm11, 120(%rsp) #143.9[spill] + addq 56(%rsp), %rdi #107.5[spill] + xorl %ebp, %ebp #143.9 + unpcklpd %xmm1, %xmm1 #127.23 + movaps %xmm4, %xmm0 #132.22 + unpcklpd %xmm2, %xmm2 #128.23 + unpcklpd %xmm3, %xmm3 #129.23 + movslq %r12d, %r12 #143.9 + movsd %xmm8, 128(%rsp) #143.9[spill] + movsd %xmm9, 136(%rsp) #143.9[spill] + movsd %xmm13, 144(%rsp) #143.9[spill] + movl %r14d, 24(%rsp) #143.9[spill] + movq %rsi, 32(%rsp) #143.9[spill] + movq %rax, 72(%rsp) #143.9[spill] + movq %r11, 80(%rsp) #143.9[spill] + movq %r10, 88(%rsp) #143.9[spill] + movq %rcx, 96(%rsp) #143.9[spill] + movq %r8, 104(%rsp) #143.9[spill] + movq %r13, 112(%rsp) #143.9[spill] + movdqu .L_2il0floatpacket.5(%rip), %xmm11 #143.9 + movdqu .L_2il0floatpacket.4(%rip), %xmm12 #143.9 + # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.12: # Preds ..B2.38 ..B2.11 + # Execution count [1.25e+01] + movq (%rdi,%rbp,4), %xmm10 #144.21 + movdqa %xmm12, %xmm15 #146.36 + movdqa %xmm10, %xmm7 #145.36 + paddd %xmm10, %xmm7 #145.36 + paddd %xmm10, %xmm7 #145.36 + movdqa %xmm7, %xmm9 #145.36 + paddd %xmm7, %xmm15 #146.36 + movd %xmm7, %r13d #145.36 + paddd %xmm11, %xmm7 #147.36 + psrldq $4, %xmm9 #145.36 + movd %xmm9, %r11d #145.36 + movaps %xmm1, %xmm9 #145.36 + movd %xmm15, %r10d #146.36 + psrldq $4, %xmm15 #146.36 + movd %xmm15, %r8d #146.36 + movd %xmm7, %ecx #147.36 + psrldq $4, %xmm7 #147.36 + movd %xmm7, %eax #147.36 + movaps %xmm3, %xmm7 #147.36 + movslq %r13d, %r13 #145.36 + movslq %r11d, %r11 #145.36 + movslq %r10d, %r10 #146.36 + movslq %r8d, %r8 #146.36 + movsd (%rdx,%r13,8), %xmm8 #145.36 + movhpd (%rdx,%r11,8), %xmm8 #145.36 + movsd (%rdx,%r10,8), %xmm13 #146.36 + subpd %xmm8, %xmm9 #145.36 + movhpd (%rdx,%r8,8), %xmm13 #146.36 + movaps %xmm2, %xmm8 #146.36 + movslq %ecx, %rcx #147.36 + movaps %xmm9, %xmm15 #148.35 + subpd %xmm13, %xmm8 #146.36 + mulpd %xmm9, %xmm15 #148.35 + movslq %eax, %rax #147.36 + movaps %xmm8, %xmm13 #148.49 + movsd (%rdx,%rcx,8), %xmm14 #147.36 + mulpd %xmm8, %xmm13 #148.49 + movhpd (%rdx,%rax,8), %xmm14 #147.36 + subpd %xmm14, %xmm7 #147.36 + addpd %xmm13, %xmm15 #148.49 + movaps %xmm7, %xmm14 #148.63 + mulpd %xmm7, %xmm14 #148.63 + addpd %xmm14, %xmm15 #148.63 + movaps %xmm15, %xmm13 #158.22 + cmpltpd %xmm6, %xmm13 #158.22 + movmskpd %xmm13, %r14d #158.22 + testl %r14d, %r14d #158.22 + je ..B2.38 # Prob 50% #158.22 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm15 +..B2.13: # Preds ..B2.12 + # Execution count [6.25e+00] + movups .L_2il0floatpacket.6(%rip), %xmm14 #159.38 + divpd %xmm15, %xmm14 #159.38 + movdqu 160(%rsp), %xmm15 #167.24[spill] + pcmpgtd %xmm10, %xmm15 #167.24 + movups 176(%rsp), %xmm10 #160.38[spill] + mulpd %xmm14, %xmm10 #160.38 + mulpd %xmm14, %xmm10 #160.44 + mulpd %xmm14, %xmm10 #160.50 + mulpd 192(%rsp), %xmm14 #161.54[spill] + mulpd %xmm10, %xmm14 #161.61 + subpd .L_2il0floatpacket.7(%rip), %xmm10 #161.54 + mulpd %xmm10, %xmm14 #161.67 + mulpd %xmm14, %xmm9 #162.31 + mulpd %xmm14, %xmm8 #163.31 + mulpd %xmm14, %xmm7 #164.31 + punpckldq %xmm15, %xmm15 #167.24 + movaps %xmm13, %xmm14 #162.31 + andps %xmm13, %xmm15 #167.24 + movaps %xmm13, %xmm10 #163.31 + movmskpd %xmm15, %esi #167.24 + andps %xmm9, %xmm14 #162.31 + andps %xmm8, %xmm10 #163.31 + andps %xmm7, %xmm13 #164.31 + addpd %xmm14, %xmm5 #162.17 + addpd %xmm10, %xmm4 #163.17 + addpd %xmm13, %xmm0 #164.17 + testl %esi, %esi #167.24 + je ..B2.38 # Prob 50% #167.24 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 +..B2.14: # Preds ..B2.13 + # Execution count [3.12e+00] + movl %esi, %r14d #168.21 + andl $2, %r14d #168.21 + andl $1, %esi #168.21 + je ..B2.17 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 +..B2.15: # Preds ..B2.14 + # Execution count [3.12e+00] + movsd (%r15,%r13,8), %xmm10 #168.21 + testl %r14d, %r14d #168.21 + jne ..B2.18 # Prob 60% #168.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 +..B2.16: # Preds ..B2.15 + # Execution count [1.25e+00] + pxor %xmm13, %xmm13 #168.21 + unpcklpd %xmm13, %xmm10 #168.21 + subpd %xmm9, %xmm10 #168.21 + jmp ..B2.31 # Prob 100% #168.21 + # LOE rdx rcx rbx rbp rdi r9 r10 r12 r13 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.17: # Preds ..B2.14 + # Execution count [3.12e+00] + pxor %xmm10, %xmm10 #168.21 + testl %r14d, %r14d #168.21 + je ..B2.30 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 +..B2.18: # Preds ..B2.15 ..B2.17 + # Execution count [3.12e+00] + movhpd (%r15,%r11,8), %xmm10 #168.21 + subpd %xmm9, %xmm10 #168.21 + testl %esi, %esi #168.21 + je ..B2.20 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r11 r12 r13 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.19: # Preds ..B2.18 + # Execution count [1.88e+00] + movsd %xmm10, (%r15,%r13,8) #168.21 + psrldq $8, %xmm10 #168.21 + movsd %xmm10, (%r15,%r11,8) #168.21 + movsd (%r15,%r10,8), %xmm10 #169.21 + jmp ..B2.21 # Prob 100% #169.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.20: # Preds ..B2.18 + # Execution count [1.25e+00] + psrldq $8, %xmm10 #168.21 + movsd %xmm10, (%r15,%r11,8) #168.21 + pxor %xmm10, %xmm10 #169.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.21: # Preds ..B2.19 ..B2.20 + # Execution count [1.88e+00] + testl %r14d, %r14d #169.21 + je ..B2.72 # Prob 40% #169.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.22: # Preds ..B2.21 + # Execution count [3.12e+00] + movhpd (%r15,%r8,8), %xmm10 #169.21 + subpd %xmm8, %xmm10 #169.21 + testl %esi, %esi #169.21 + je ..B2.24 # Prob 40% #169.21 + # LOE rax rdx rcx rbx rbp rdi r8 r9 r10 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.23: # Preds ..B2.22 + # Execution count [1.88e+00] + movsd %xmm10, (%r15,%r10,8) #169.21 + psrldq $8, %xmm10 #169.21 + movsd %xmm10, (%r15,%r8,8) #169.21 + movsd (%r15,%rcx,8), %xmm9 #170.21 + jmp ..B2.25 # Prob 100% #170.21 + # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 +..B2.24: # Preds ..B2.22 + # Execution count [1.25e+00] + psrldq $8, %xmm10 #169.21 + movsd %xmm10, (%r15,%r8,8) #169.21 + pxor %xmm9, %xmm9 #170.21 + # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 esi r14d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 +..B2.25: # Preds ..B2.23 ..B2.24 + # Execution count [1.88e+00] + testl %r14d, %r14d #170.21 + je ..B2.71 # Prob 40% #170.21 + # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 esi xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 +..B2.26: # Preds ..B2.25 + # Execution count [3.12e+00] + movhpd (%r15,%rax,8), %xmm9 #170.21 + subpd %xmm7, %xmm9 #170.21 + testl %esi, %esi #170.21 + je ..B2.28 # Prob 40% #170.21 + # LOE rax rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.27: # Preds ..B2.26 + # Execution count [1.88e+00] + movsd %xmm9, (%r15,%rcx,8) #170.21 + psrldq $8, %xmm9 #170.21 + jmp ..B2.29 # Prob 100% #170.21 + # LOE rax rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.28: # Preds ..B2.26 + # Execution count [1.25e+00] + psrldq $8, %xmm9 #170.21 + # LOE rax rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.29: # Preds ..B2.27 ..B2.28 + # Execution count [3.12e+00] + movsd %xmm9, (%r15,%rax,8) #170.21 + jmp ..B2.38 # Prob 100% #170.21 + # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.30: # Preds ..B2.17 + # Execution count [1.88e+00] + pxor %xmm10, %xmm10 #168.21 + subpd %xmm9, %xmm10 #168.21 + testl %esi, %esi #168.21 + je ..B2.32 # Prob 40% #168.21 + # LOE rdx rcx rbx rbp rdi r9 r10 r12 r13 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.31: # Preds ..B2.16 ..B2.30 + # Execution count [1.25e+00] + movsd %xmm10, (%r15,%r13,8) #168.21 + movsd (%r15,%r10,8), %xmm10 #169.21 + pxor %xmm9, %xmm9 #169.21 + unpcklpd %xmm9, %xmm10 #169.21 + subpd %xmm8, %xmm10 #169.21 + jmp ..B2.34 # Prob 100% #169.21 + # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.32: # Preds ..B2.30 + # Execution count [0.00e+00] + pxor %xmm10, %xmm10 #169.21 + jmp ..B2.33 # Prob 100% #169.21 + # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.72: # Preds ..B2.21 + # Execution count [7.50e-01] + testl %esi, %esi #168.21 + # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm11 xmm12 +..B2.33: # Preds ..B2.32 ..B2.72 + # Execution count [2.67e+00] + pxor %xmm9, %xmm9 #169.21 + unpcklpd %xmm9, %xmm10 #169.21 + subpd %xmm8, %xmm10 #169.21 + je ..B2.35 # Prob 40% #169.21 + # LOE rdx rcx rbx rbp rdi r9 r10 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm10 xmm11 xmm12 +..B2.34: # Preds ..B2.31 ..B2.33 + # Execution count [1.25e+00] + movsd %xmm10, (%r15,%r10,8) #169.21 + movsd (%r15,%rcx,8), %xmm9 #170.21 + pxor %xmm8, %xmm8 #170.21 + unpcklpd %xmm8, %xmm9 #170.21 + subpd %xmm7, %xmm9 #170.21 + jmp ..B2.37 # Prob 100% #170.21 + # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.35: # Preds ..B2.33 + # Execution count [0.00e+00] + pxor %xmm9, %xmm9 #170.21 + jmp ..B2.36 # Prob 100% #170.21 + # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 +..B2.71: # Preds ..B2.25 + # Execution count [7.50e-01] + testl %esi, %esi #168.21 + # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm9 xmm11 xmm12 +..B2.36: # Preds ..B2.35 ..B2.71 + # Execution count [2.67e+00] + pxor %xmm8, %xmm8 #170.21 + unpcklpd %xmm8, %xmm9 #170.21 + subpd %xmm7, %xmm9 #170.21 + je ..B2.38 # Prob 40% #170.21 + # LOE rdx rcx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.37: # Preds ..B2.34 ..B2.36 + # Execution count [1.25e+00] + movsd %xmm9, (%r15,%rcx,8) #170.21 + # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.38: # Preds ..B2.36 ..B2.29 ..B2.37 ..B2.13 ..B2.12 + # + # Execution count [1.25e+01] + addq $2, %rbp #143.9 + cmpq %r12, %rbp #143.9 + jb ..B2.12 # Prob 82% #143.9 + # LOE rdx rbx rbp rdi r9 r12 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.39: # Preds ..B2.38 + # Execution count [2.25e+00] + movaps %xmm0, %xmm1 #132.22 + movaps %xmm4, %xmm2 #131.22 + movaps %xmm5, %xmm3 #130.22 + unpckhpd %xmm0, %xmm1 #132.22 + unpckhpd %xmm4, %xmm2 #131.22 + addsd %xmm1, %xmm0 #132.22 + addsd %xmm2, %xmm4 #131.22 + unpckhpd %xmm5, %xmm3 #130.22 + movsd 120(%rsp), %xmm11 #[spill] + addsd %xmm3, %xmm5 #130.22 + movsd 128(%rsp), %xmm8 #[spill] + movsd 136(%rsp), %xmm9 #[spill] + movsd 144(%rsp), %xmm13 #[spill] + movsd 40(%rsp), %xmm10 #[spill] + movsd 48(%rsp), %xmm12 #[spill] + movl 24(%rsp), %r14d #[spill] + movq 32(%rsp), %rsi #[spill] + movq 72(%rsp), %rax #[spill] + movq 80(%rsp), %r11 #[spill] + movq 88(%rsp), %r10 #[spill] + movq 96(%rsp), %rcx #[spill] + movq 104(%rsp), %r8 #[spill] + movq 112(%rsp), %r13 #[spill] + movsd .L_2il0floatpacket.1(%rip), %xmm7 # + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.40: # Preds ..B2.39 ..B2.58 + # Execution count [2.50e+00] + movslq %r14d, %r14 #143.9 + cmpq %r14, %r12 #143.9 + jae ..B2.49 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.41: # Preds ..B2.40 + # Execution count [2.25e+00] + imulq %rax, %r9 #125.43 + addq 56(%rsp), %r9 #107.5[spill] + movl 64(%rsp), %ebp #107.5[spill] + movq %r13, 112(%rsp) #107.5[spill] + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 r15 ebp xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.42: # Preds ..B2.45 ..B2.41 + # Execution count [1.25e+01] + movl (%r9,%r12,4), %edi #144.21 + movaps %xmm9, %xmm14 #145.36 + movaps %xmm8, %xmm3 #146.36 + movaps %xmm11, %xmm2 #147.36 + lea (%rdi,%rdi,2), %r13d #145.36 + movslq %r13d, %r13 #145.36 + subsd (%rdx,%r13,8), %xmm14 #145.36 + subsd 8(%rdx,%r13,8), %xmm3 #146.36 + subsd 16(%rdx,%r13,8), %xmm2 #147.36 + movaps %xmm14, %xmm15 #148.35 + movaps %xmm3, %xmm1 #148.49 + mulsd %xmm14, %xmm15 #148.35 + mulsd %xmm3, %xmm1 #148.49 + addsd %xmm1, %xmm15 #148.49 + movaps %xmm2, %xmm1 #148.63 + mulsd %xmm2, %xmm1 #148.63 + addsd %xmm1, %xmm15 #148.63 + comisd %xmm15, %xmm12 #158.22 + jbe ..B2.45 # Prob 50% #158.22 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 ebp edi xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 +..B2.43: # Preds ..B2.42 + # Execution count [6.25e+00] + movsd .L_2il0floatpacket.3(%rip), %xmm1 #159.38 + divsd %xmm15, %xmm1 #159.38 + movaps %xmm10, %xmm15 #160.38 + mulsd %xmm1, %xmm15 #160.38 + mulsd %xmm1, %xmm15 #160.44 + mulsd %xmm1, %xmm15 #160.50 + mulsd %xmm13, %xmm1 #161.54 + mulsd %xmm15, %xmm1 #161.61 + subsd %xmm7, %xmm15 #161.54 + mulsd %xmm15, %xmm1 #161.67 + mulsd %xmm1, %xmm14 #162.31 + mulsd %xmm1, %xmm3 #163.31 + mulsd %xmm1, %xmm2 #164.31 + addsd %xmm14, %xmm5 #162.17 + addsd %xmm3, %xmm4 #163.17 + addsd %xmm2, %xmm0 #164.17 + cmpl %ebp, %edi #167.24 + jge ..B2.45 # Prob 50% #167.24 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 ebp xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.44: # Preds ..B2.43 + # Execution count [3.12e+00] + movsd (%r15,%r13,8), %xmm1 #168.21 + subsd %xmm14, %xmm1 #168.21 + movsd 8(%r15,%r13,8), %xmm14 #169.21 + subsd %xmm3, %xmm14 #169.21 + movsd 16(%r15,%r13,8), %xmm3 #170.21 + movsd %xmm1, (%r15,%r13,8) #168.21 + subsd %xmm2, %xmm3 #170.21 + movsd %xmm14, 8(%r15,%r13,8) #169.21 + movsd %xmm3, 16(%r15,%r13,8) #170.21 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 r15 ebp xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.45: # Preds ..B2.44 ..B2.43 ..B2.42 + # Execution count [1.25e+01] + incq %r12 #143.9 + cmpq %r14, %r12 #143.9 + jb ..B2.42 # Prob 82% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r14 r15 ebp xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.46: # Preds ..B2.45 + # Execution count [2.25e+00] + movq 112(%rsp), %r13 #[spill] + jmp ..B2.49 # Prob 100% # + # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 r15 xmm0 xmm4 xmm5 xmm6 xmm7 xmm10 xmm12 xmm13 +..B2.48: # Preds ..B2.9 ..B2.8 + # Execution count [2.50e+00] + movslq %r14d, %r14 #179.9 + # LOE rax rdx rcx rbx rsi r8 r10 r11 r13 r14 r15 xmm0 xmm4 xmm5 xmm6 xmm7 xmm10 xmm12 xmm13 +..B2.49: # Preds ..B2.46 ..B2.40 ..B2.48 + # Execution count [5.00e+00] + movslq %r8d, %r9 #124.32 + incq %r8 #124.5 + addq %r14, %r10 #179.9 + addq %r14, %r11 #180.9 + incq %r9 #124.32 + addsd (%rsi,%r15), %xmm5 #175.9 + addsd 8(%rsi,%r15), %xmm4 #176.9 + addsd 16(%rsi,%r15), %xmm0 #177.9 + movsd %xmm5, (%rsi,%r15) #175.9 + movsd %xmm4, 8(%rsi,%r15) #176.9 + movsd %xmm0, 16(%rsi,%r15) #177.9 + addq $24, %rsi #124.5 + cmpq %r13, %r8 #124.5 + jb ..B2.8 # Prob 82% #124.5 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r13 r15 xmm6 xmm7 xmm10 xmm12 xmm13 +..B2.50: # Preds ..B2.49 + # Execution count [9.00e-01] + movq 8(%rsp), %r12 #[spill] + movq (%rsp), %r14 #[spill] + movq %r10, (%r12) #179.9 + movq %r11, 8(%r12) #180.9 + jmp ..B2.54 # Prob 100% #180.9 + # LOE rbx r14 +..B2.51: # Preds ..B2.1 + # Execution count [5.00e-01] + xorl %ebx, %ebx #120.22 + xorl %eax, %eax #121.16 +..___tag_value_computeForceLJHalfNeigh.139: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.140: + # LOE rbx r14 xmm0 +..B2.68: # Preds ..B2.51 + # Execution count [5.00e-01] + movsd %xmm0, 16(%rsp) #121.16[spill] + # LOE rbx r14 +..B2.52: # Preds ..B2.68 + # Execution count [5.00e-01] + movl $.L_2__STRING.2, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.142: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.143: + # LOE rbx r14 +..B2.54: # Preds ..B2.52 ..B2.50 + # Execution count [1.00e+00] + movl $.L_2__STRING.2, %edi #183.5 +..___tag_value_computeForceLJHalfNeigh.144: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #183.5 +..___tag_value_computeForceLJHalfNeigh.145: + # LOE rbx r14 +..B2.55: # Preds ..B2.54 + # Execution count [1.00e+00] + xorl %eax, %eax #184.16 +..___tag_value_computeForceLJHalfNeigh.146: +# getTimeStamp() + call getTimeStamp #184.16 +..___tag_value_computeForceLJHalfNeigh.147: + # LOE rbx r14 xmm0 +..B2.69: # Preds ..B2.55 + # Execution count [1.00e+00] + movaps %xmm0, %xmm1 #184.16 + # LOE rbx r14 xmm1 +..B2.56: # Preds ..B2.69 + # Execution count [1.00e+00] + pxor %xmm3, %xmm3 #185.5 + cvtsi2sdq %rbx, %xmm3 #185.5 + subsd 16(%rsp), %xmm1 #185.94[spill] + movsd .L_2il0floatpacket.2(%rip), %xmm2 #185.5 + movl $.L_2__STRING.3, %edi #185.5 + divsd %xmm3, %xmm2 #185.5 + mulsd %xmm1, %xmm2 #185.5 + movl %ebx, %esi #185.5 + movsd 264(%r14), %xmm0 #185.74 + movl $3, %eax #185.5 + mulsd %xmm0, %xmm2 #185.5 + movsd %xmm1, (%rsp) #185.5[spill] +..___tag_value_computeForceLJHalfNeigh.149: +# printf(const char *__restrict__, ...) + call printf #185.5 +..___tag_value_computeForceLJHalfNeigh.150: + # LOE +..B2.57: # Preds ..B2.56 + # Execution count [1.00e+00] + movsd (%rsp), %xmm1 #[spill] + movaps %xmm1, %xmm0 #186.14 + addq $216, %rsp #186.14 + .cfi_def_cfa_offset 56 + .cfi_restore 6 + popq %rbp #186.14 + .cfi_def_cfa_offset 48 + .cfi_restore 3 + popq %rbx #186.14 + .cfi_def_cfa_offset 40 + .cfi_restore 15 + popq %r15 #186.14 + .cfi_def_cfa_offset 32 + .cfi_restore 14 + popq %r14 #186.14 + .cfi_def_cfa_offset 24 + .cfi_restore 13 + popq %r13 #186.14 + .cfi_def_cfa_offset 16 + .cfi_restore 12 + popq %r12 #186.14 + .cfi_def_cfa_offset 8 + ret #186.14 + .cfi_def_cfa_offset 272 + .cfi_offset 3, -48 + .cfi_offset 6, -56 + .cfi_offset 12, -16 + .cfi_offset 13, -24 + .cfi_offset 14, -32 + .cfi_offset 15, -40 + # LOE +..B2.58: # Preds ..B2.10 + # Execution count [2.25e-01]: Infreq + xorl %r12d, %r12d #143.9 + jmp ..B2.40 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rsi r8 r9 r10 r11 r12 r13 r15 r14d xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.59: # Preds ..B2.2 + # Execution count [1.11e+00]: Infreq + movl %ebx, %eax #114.5 + xorl %ecx, %ecx #114.5 + movl $1, %esi #114.5 + xorl %edx, %edx #114.5 + shrl $1, %eax #114.5 + je ..B2.63 # Prob 10% #114.5 + # LOE rax rdx rcx rdi r12 r14 r15 ebx ebp esi +..B2.60: # Preds ..B2.59 + # Execution count [1.00e+00]: Infreq + xorl %esi, %esi #114.5 + # LOE rax rdx rcx rsi rdi r12 r14 r15 ebx ebp +..B2.61: # Preds ..B2.61 ..B2.60 + # Execution count [2.78e+00]: Infreq + incq %rcx #114.5 + movq %rsi, (%rdx,%rdi) #115.9 + movq %rsi, 8(%rdx,%rdi) #115.9 + addq $16, %rdx #114.5 + cmpq %rax, %rcx #114.5 + jb ..B2.61 # Prob 64% #114.5 + # LOE rax rdx rcx rsi rdi r12 r14 r15 ebx ebp +..B2.62: # Preds ..B2.61 + # Execution count [1.00e+00]: Infreq + lea 1(%rcx,%rcx), %esi #115.9 + # LOE rdi r12 r14 r15 ebx ebp esi +..B2.63: # Preds ..B2.59 ..B2.62 + # Execution count [1.11e+00]: Infreq + lea -1(%rsi), %eax #114.5 + cmpl %ebx, %eax #114.5 + jae ..B2.70 # Prob 10% #114.5 + # LOE rdi r12 r14 r15 ebp esi +..B2.64: # Preds ..B2.63 + # Execution count [1.00e+00]: Infreq + movslq %esi, %rsi #114.5 + movslq %ebp, %r13 #114.5 + movq $0, -8(%rdi,%rsi,8) #115.9 + jmp ..B2.5 # Prob 100% #115.9 + # LOE r12 r13 r14 r15 ebp +..B2.70: # Preds ..B2.63 + # Execution count [1.11e-01]: Infreq + movslq %ebp, %r13 #114.5 + jmp ..B2.5 # Prob 100% #114.5 + .align 16,0x90 + # LOE r12 r13 r14 r15 ebp + .cfi_endproc +# mark_end; + .type computeForceLJHalfNeigh,@function + .size computeForceLJHalfNeigh,.-computeForceLJHalfNeigh +..LNcomputeForceLJHalfNeigh.1: + .data +# -- End computeForceLJHalfNeigh + .text +.L_2__routine_start_computeForceLJFullNeigh_simd_2: +# -- Begin computeForceLJFullNeigh_simd + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_simd +# --- computeForceLJFullNeigh_simd(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_simd: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B3.1: # Preds ..B3.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_simd.174: +..L175: + #189.101 + pushq %rsi #189.101 + .cfi_def_cfa_offset 16 + movl 4(%rsi), %edx #190.18 + testl %edx, %edx #196.24 + jle ..B3.4 # Prob 50% #196.24 + # LOE rbx rbp rsi r12 r13 r14 r15 edx +..B3.2: # Preds ..B3.1 + # Execution count [5.00e-03] + movq 64(%rsi), %rdi #197.9 + lea (%rdx,%rdx,2), %eax #190.18 + cmpl $12, %eax #196.5 + jle ..B3.8 # Prob 0% #196.5 + # LOE rbx rbp rdi r12 r13 r14 r15 eax edx +..B3.3: # Preds ..B3.2 + # Execution count [1.00e+00] + movslq %edx, %rdx #196.5 + xorl %esi, %esi #196.5 + lea (%rdx,%rdx,2), %rdx #196.5 + shlq $3, %rdx #196.5 + call _intel_fast_memset #196.5 + # LOE rbx rbp r12 r13 r14 r15 +..B3.4: # Preds ..B3.1 ..B3.12 ..B3.3 ..B3.13 + # Execution count [1.00e+00] + xorl %eax, %eax #203.16 +..___tag_value_computeForceLJFullNeigh_simd.177: +# getTimeStamp() + call getTimeStamp #203.16 +..___tag_value_computeForceLJFullNeigh_simd.178: + # LOE rbx rbp r12 r13 r14 r15 +..B3.5: # Preds ..B3.4 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #204.5 +..___tag_value_computeForceLJFullNeigh_simd.179: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #204.5 +..___tag_value_computeForceLJFullNeigh_simd.180: + # LOE +..B3.6: # Preds ..B3.5 + # Execution count [1.00e+00] + movl $il0_peep_printf_format_0, %edi #207.5 + movq stderr(%rip), %rsi #207.5 + call fputs #207.5 + # LOE +..B3.7: # Preds ..B3.6 + # Execution count [1.00e+00] + movl $-1, %edi #208.5 +# exit(int) + call exit #208.5 + # LOE +..B3.8: # Preds ..B3.2 + # Execution count [1.11e+00]: Infreq + movl %eax, %edx #196.5 + xorl %r8d, %r8d #196.5 + movl $1, %r9d #196.5 + xorl %esi, %esi #196.5 + xorl %ecx, %ecx #196.5 + shrl $1, %edx #196.5 + je ..B3.12 # Prob 10% #196.5 + # LOE rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 eax r9d +..B3.10: # Preds ..B3.8 ..B3.10 + # Execution count [2.78e+00]: Infreq + incq %r8 #196.5 + movq %rsi, (%rcx,%rdi) #197.9 + movq %rsi, 8(%rcx,%rdi) #197.9 + addq $16, %rcx #196.5 + cmpq %rdx, %r8 #196.5 + jb ..B3.10 # Prob 64% #196.5 + # LOE rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 eax +..B3.11: # Preds ..B3.10 + # Execution count [1.00e+00]: Infreq + lea 1(%r8,%r8), %r9d #197.9 + # LOE rbx rbp rdi r12 r13 r14 r15 eax r9d +..B3.12: # Preds ..B3.11 ..B3.8 + # Execution count [1.11e+00]: Infreq + lea -1(%r9), %edx #196.5 + cmpl %eax, %edx #196.5 + jae ..B3.4 # Prob 10% #196.5 + # LOE rbx rbp rdi r12 r13 r14 r15 r9d +..B3.13: # Preds ..B3.12 + # Execution count [1.00e+00]: Infreq + movslq %r9d, %r9 #196.5 + movq $0, -8(%rdi,%r9,8) #197.9 + jmp ..B3.4 # Prob 100% #197.9 + .align 16,0x90 + # LOE rbx rbp r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_simd,@function + .size computeForceLJFullNeigh_simd,.-computeForceLJFullNeigh_simd +..LNcomputeForceLJFullNeigh_simd.2: + .section .rodata.str1.32, "aMS",@progbits,1 + .align 32 + .align 32 +il0_peep_printf_format_0: + .long 1869771333 + .long 1394621042 + .long 541347145 + .long 1852990827 + .long 1847618661 + .long 1763734639 + .long 1701605485 + .long 1953391981 + .long 1713398885 + .long 1931506287 + .long 1768121712 + .long 1684367718 + .long 1936615712 + .long 1668641396 + .long 1852795252 + .long 1952805664 + .word 33 + .data +# -- End computeForceLJFullNeigh_simd + .section .rodata, "a" + .align 16 + .align 16 +.L_2il0floatpacket.4: + .long 0x00000001,0x00000001,0x00000001,0x00000001 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,16 + .align 16 +.L_2il0floatpacket.5: + .long 0x00000002,0x00000002,0x00000002,0x00000002 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,16 + .align 16 +.L_2il0floatpacket.6: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,16 + .align 16 +.L_2il0floatpacket.7: + .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000 + .type .L_2il0floatpacket.7,@object + .size .L_2il0floatpacket.7,16 + .align 8 +.L_2il0floatpacket.0: + .long 0x00000000,0x40480000 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,8 + .align 8 +.L_2il0floatpacket.1: + .long 0x00000000,0x3fe00000 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,8 + .align 8 +.L_2il0floatpacket.2: + .long 0x00000000,0x41cdcd65 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,8 + .align 8 +.L_2il0floatpacket.3: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,8 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +.L_2__STRING.0: + .long 1668444006 + .word 101 + .type .L_2__STRING.0,@object + .size .L_2__STRING.0,6 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.1: + .long 980644937 + .long 544548128 + .long 1701987872 + .long 622869105 + .long 1411391590 + .long 979725673 + .long 174466336 + .long 1764718915 + .long 622869108 + .long 1881677926 + .long 1852399980 + .long 170484575 + .byte 0 + .type .L_2__STRING.1,@object + .size .L_2__STRING.1,49 + .space 3, 0x00 # pad + .align 4 +.L_2__STRING.2: + .long 1668444006 + .long 759843941 + .long 1718378856 + .long 1734960494 + .word 104 + .type .L_2__STRING.2,@object + .size .L_2__STRING.2,18 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.3: + .long 980644937 + .long 544548128 + .long 1701987872 + .long 622869105 + .long 1411391590 + .long 979725673 + .long 174466336 + .long 1764718915 + .long 622869108 + .long 1747460198 + .long 761687137 + .long 1734960494 + .long 665960 + .type .L_2__STRING.3,@object + .size .L_2__STRING.3,52 + .data + .section .note.GNU-stack, "" +# End diff --git a/static_analysis/jan/icx-icc-lammps-sse.s b/static_analysis/jan/icx-icc-lammps-sse.s new file mode 100644 index 0000000..22783ab --- /dev/null +++ b/static_analysis/jan/icx-icc-lammps-sse.s @@ -0,0 +1,1522 @@ +# mark_description "Intel(R) C Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.6.0 Build 2022"; +# mark_description "0226_000000"; +# mark_description "-I/apps/likwid/5.2.2/include -I././lammps/includes -I././common/includes -S -std=c11 -pedantic-errors -D_GNU"; +# mark_description "_SOURCE -DLIKWID_PERFMON -DAOS -DPRECISION=2 -DCOMPUTE_STATS -DVECTOR_WIDTH=2 -DENABLE_OMP_SIMD -DALIGNMENT="; +# mark_description "64 -restrict -Ofast -xSSE4.2 -o build-lammps-ICC-SSE-DP/force_lj.s"; + .file "force_lj.c" + .text +..TXTST0: +.L_2__routine_start_computeForceLJFullNeigh_plain_c_0: +# -- Begin computeForceLJFullNeigh_plain_c + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_plain_c +# --- computeForceLJFullNeigh_plain_c(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_plain_c: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B1.1: # Preds ..B1.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_plain_c.1: +..L2: + #23.104 + pushq %r14 #23.104 + .cfi_def_cfa_offset 16 + .cfi_offset 14, -16 + pushq %r15 #23.104 + .cfi_def_cfa_offset 24 + .cfi_offset 15, -24 + pushq %rbx #23.104 + .cfi_def_cfa_offset 32 + .cfi_offset 3, -32 + pushq %rbp #23.104 + .cfi_def_cfa_offset 40 + .cfi_offset 6, -40 + subq $136, %rsp #23.104 + .cfi_def_cfa_offset 176 + movq %rsi, %r14 #23.104 + movsd 144(%rdi), %xmm0 #27.27 + movq %rcx, %rbp #23.104 + mulsd %xmm0, %xmm0 #27.45 + movq %rdx, %r15 #23.104 + movsd 56(%rdi), %xmm1 #28.23 + movsd 40(%rdi), %xmm2 #29.24 + movl 4(%r14), %eax #24.18 + movsd %xmm0, 64(%rsp) #27.45[spill] + movsd %xmm1, 40(%rsp) #28.23[spill] + movsd %xmm2, 32(%rsp) #29.24[spill] + testl %eax, %eax #32.24 + jle ..B1.23 # Prob 50% #32.24 + # LOE rbp r12 r13 r14 r15 eax +..B1.2: # Preds ..B1.1 + # Execution count [5.00e-03] + movslq %eax, %rbx #24.18 + lea (%rax,%rax,2), %eax #24.18 + movq 64(%r14), %rdi #33.9 + cmpl $12, %eax #32.5 + jle ..B1.29 # Prob 0% #32.5 + # LOE rbx rbp rdi r12 r13 r14 r15 +..B1.3: # Preds ..B1.2 + # Execution count [1.00e+00] + xorl %esi, %esi #32.5 + lea (%rbx,%rbx,2), %rdx #32.5 + shlq $3, %rdx #32.5 + call _intel_fast_memset #32.5 + # LOE rbx rbp r12 r13 r14 r15 +..B1.5: # Preds ..B1.43 ..B1.3 ..B1.41 + # Execution count [1.00e+00] + xorl %eax, %eax #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.15: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.16: + # LOE rbx rbp r12 r13 r14 r15 xmm0 +..B1.50: # Preds ..B1.5 + # Execution count [1.00e+00] + movsd %xmm0, 24(%rsp) #38.16[spill] + # LOE rbx rbp r12 r13 r14 r15 +..B1.6: # Preds ..B1.50 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.18: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.19: + # LOE rbx rbp r12 r13 r14 r15 +..B1.7: # Preds ..B1.6 + # Execution count [9.00e-01] + movsd .L_2il0floatpacket.3(%rip), %xmm13 #77.41 + xorl %eax, %eax #41.15 + mulsd 32(%rsp), %xmm13 #77.41[spill] + xorl %ecx, %ecx #41.5 + movddup 64(%rsp), %xmm3 #27.25[spill] + xorl %edi, %edi #41.5 + movddup %xmm13, %xmm1 #77.41 + movq 16(%r15), %rdx #42.19 + movslq 8(%r15), %rsi #42.43 + movq 24(%r15), %r15 #43.25 + movups .L_2il0floatpacket.2(%rip), %xmm2 #75.32 + movddup 40(%rsp), %xmm5 #28.21[spill] + movsd 40(%rsp), %xmm10 #41.5[spill] + movsd 64(%rsp), %xmm12 #41.5[spill] + movsd .L_2il0floatpacket.5(%rip), %xmm7 #77.54 + shlq $2, %rsi #25.5 + movq 16(%r14), %r11 #44.25 + movq 64(%r14), %r8 #89.9 + movq (%rbp), %r9 #93.9 + movq 8(%rbp), %r10 #94.9 + movups %xmm1, 112(%rsp) #41.5[spill] + movups %xmm3, 48(%rsp) #41.5[spill] + movq %rbx, 104(%rsp) #41.5[spill] + movq %rbp, (%rsp) #41.5[spill] + movq %r12, 8(%rsp) #41.5[spill] + movq %r13, 16(%rsp) #41.5[spill] + .cfi_offset 12, -168 + .cfi_offset 13, -160 + # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r15 xmm5 xmm7 xmm10 xmm12 xmm13 +..B1.8: # Preds ..B1.21 ..B1.7 + # Execution count [5.00e+00] + movl (%r15,%rcx,4), %ebx #43.25 + xorps %xmm6, %xmm6 #47.22 + movaps %xmm6, %xmm4 #48.22 + movsd (%rdi,%r11), %xmm9 #44.25 + movaps %xmm4, %xmm0 #49.22 + movsd 8(%rdi,%r11), %xmm8 #45.25 + movsd 16(%rdi,%r11), %xmm11 #46.25 + movslq %ebx, %r13 #56.9 + testl %ebx, %ebx #56.28 + jle ..B1.21 # Prob 50% #56.28 + # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.9: # Preds ..B1.8 + # Execution count [4.50e+00] + cmpq $2, %r13 #56.9 + jl ..B1.28 # Prob 10% #56.9 + # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.10: # Preds ..B1.9 + # Execution count [4.50e+00] + movq %rsi, %r14 #42.43 + movl %ebx, %ebp #56.9 + imulq %rax, %r14 #42.43 + xorps %xmm6, %xmm6 #47.22 + andl $-2, %ebp #56.9 + movaps %xmm6, %xmm4 #48.22 + movsd %xmm8, 80(%rsp) #71.22[spill] + movaps %xmm4, %xmm0 #49.22 + movsd %xmm9, 88(%rsp) #71.22[spill] + xorl %r12d, %r12d #56.9 + movslq %ebp, %rbp #56.9 + addq %rdx, %r14 #25.5 + movddup %xmm9, %xmm1 #44.23 + movddup %xmm8, %xmm2 #45.23 + movddup %xmm11, %xmm3 #46.23 + movsd %xmm11, 72(%rsp) #71.22[spill] + movsd %xmm13, 96(%rsp) #71.22[spill] + movq %rcx, 32(%rsp) #71.22[spill] + movups 48(%rsp), %xmm8 #71.22[spill] + movdqu .L_2il0floatpacket.1(%rip), %xmm9 #71.22 + movdqu .L_2il0floatpacket.0(%rip), %xmm10 #71.22 + # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 +# LLVM-MCA-BEGIN +# OSACA-BEGIN +..B1.11: # Preds ..B1.13 ..B1.10 + # Execution count [2.50e+01] + movq (%r14,%r12,4), %xmm15 #57.21 + movdqa %xmm10, %xmm13 #59.36 + movdqa %xmm15, %xmm7 #58.36 + paddd %xmm15, %xmm7 #58.36 + paddd %xmm7, %xmm15 #58.36 + movaps %xmm1, %xmm7 #58.36 + movd %xmm15, %ecx #58.36 + paddd %xmm15, %xmm13 #59.36 + pshufd $57, %xmm15, %xmm11 #58.36 + paddd %xmm9, %xmm15 #60.36 + pshufd $57, %xmm13, %xmm12 #59.36 + movslq %ecx, %rcx #58.36 + movsd (%r11,%rcx,8), %xmm14 #58.36 + movd %xmm11, %ecx #58.36 + movaps %xmm2, %xmm11 #59.36 + movslq %ecx, %rcx #58.36 + movhpd (%r11,%rcx,8), %xmm14 #58.36 + movd %xmm13, %ecx #59.36 + subpd %xmm14, %xmm7 #58.36 + movslq %ecx, %rcx #59.36 + movsd (%r11,%rcx,8), %xmm14 #59.36 + movd %xmm12, %ecx #59.36 + movslq %ecx, %rcx #59.36 + movhpd (%r11,%rcx,8), %xmm14 #59.36 + movd %xmm15, %ecx #60.36 + pshufd $57, %xmm15, %xmm15 #60.36 + subpd %xmm14, %xmm11 #59.36 + movslq %ecx, %rcx #60.36 + movaps %xmm7, %xmm14 #61.35 + movaps %xmm11, %xmm12 #61.49 + mulpd %xmm7, %xmm14 #61.35 + mulpd %xmm11, %xmm12 #61.49 + movsd (%r11,%rcx,8), %xmm13 #60.36 + movd %xmm15, %ecx #60.36 + movaps %xmm3, %xmm15 #60.36 + addpd %xmm12, %xmm14 #61.49 + movslq %ecx, %rcx #60.36 + pcmpeqd %xmm12, %xmm12 #71.22 + movhpd (%r11,%rcx,8), %xmm13 #60.36 + subpd %xmm13, %xmm15 #60.36 + movaps %xmm15, %xmm13 #61.63 + mulpd %xmm15, %xmm13 #61.63 + addpd %xmm13, %xmm14 #61.63 + movaps %xmm14, %xmm13 #71.22 + cmpltpd %xmm8, %xmm13 #71.22 + ptest %xmm12, %xmm13 #71.22 + je ..B1.13 # Prob 50% #71.22 + # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 xmm15 +..B1.12: # Preds ..B1.11 + # Execution count [1.25e+01] + movups .L_2il0floatpacket.2(%rip), %xmm12 #75.38 + divpd %xmm14, %xmm12 #75.38 + movaps %xmm5, %xmm14 #76.38 + mulpd %xmm12, %xmm14 #76.38 + mulpd %xmm12, %xmm14 #76.44 + mulpd %xmm12, %xmm14 #76.50 + mulpd 112(%rsp), %xmm12 #77.54[spill] + mulpd %xmm14, %xmm12 #77.61 + subpd .L_2il0floatpacket.4(%rip), %xmm14 #77.54 + mulpd %xmm14, %xmm12 #77.67 + mulpd %xmm12, %xmm7 #78.31 + mulpd %xmm12, %xmm11 #79.31 + mulpd %xmm12, %xmm15 #80.31 + andps %xmm13, %xmm7 #78.31 + andps %xmm13, %xmm11 #79.31 + andps %xmm15, %xmm13 #80.31 + addpd %xmm7, %xmm6 #78.17 + addpd %xmm11, %xmm4 #79.17 + addpd %xmm13, %xmm0 #80.17 + # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 +..B1.13: # Preds ..B1.12 ..B1.11 + # Execution count [2.50e+01] + addq $2, %r12 #56.9 + cmpq %rbp, %r12 #56.9 + jb ..B1.11 # Prob 82% #56.9 +# OSACA-END +# LLVM-MCA-END + # LOE rax rdx rbp rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 ebx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 +..B1.14: # Preds ..B1.13 + # Execution count [4.50e+00] + movaps %xmm0, %xmm1 #49.22 + movaps %xmm4, %xmm2 #48.22 + movaps %xmm6, %xmm3 #47.22 + unpckhpd %xmm0, %xmm1 #49.22 + unpckhpd %xmm4, %xmm2 #48.22 + addsd %xmm1, %xmm0 #49.22 + addsd %xmm2, %xmm4 #48.22 + unpckhpd %xmm6, %xmm3 #47.22 + movsd 72(%rsp), %xmm11 #[spill] + addsd %xmm3, %xmm6 #47.22 + movsd 80(%rsp), %xmm8 #[spill] + movsd 88(%rsp), %xmm9 #[spill] + movsd 96(%rsp), %xmm13 #[spill] + movsd 40(%rsp), %xmm10 #[spill] + movsd 64(%rsp), %xmm12 #[spill] + movq 32(%rsp), %rcx #[spill] + movsd .L_2il0floatpacket.5(%rip), %xmm7 # + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.15: # Preds ..B1.14 ..B1.28 + # Execution count [5.00e+00] + cmpq %r13, %rbp #56.9 + jae ..B1.21 # Prob 10% #56.9 + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.16: # Preds ..B1.15 + # Execution count [4.50e+00] + imulq %rsi, %rax #42.43 + addq %rdx, %rax #25.5 + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.17: # Preds ..B1.19 ..B1.16 + # Execution count [2.50e+01] + movl (%rax,%rbp,4), %r12d #57.21 + movaps %xmm9, %xmm14 #58.36 + movaps %xmm8, %xmm3 #59.36 + movaps %xmm11, %xmm2 #60.36 + lea (%r12,%r12,2), %r14d #58.36 + movslq %r14d, %r14 #58.36 + subsd (%r11,%r14,8), %xmm14 #58.36 + subsd 8(%r11,%r14,8), %xmm3 #59.36 + subsd 16(%r11,%r14,8), %xmm2 #60.36 + movaps %xmm14, %xmm15 #61.35 + movaps %xmm3, %xmm1 #61.49 + mulsd %xmm14, %xmm15 #61.35 + mulsd %xmm3, %xmm1 #61.49 + addsd %xmm1, %xmm15 #61.49 + movaps %xmm2, %xmm1 #61.63 + mulsd %xmm2, %xmm1 #61.63 + addsd %xmm1, %xmm15 #61.63 + comisd %xmm15, %xmm12 #71.22 + jbe ..B1.19 # Prob 50% #71.22 + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 +..B1.18: # Preds ..B1.17 + # Execution count [1.25e+01] + movsd .L_2il0floatpacket.6(%rip), %xmm1 #75.38 + divsd %xmm15, %xmm1 #75.38 + movaps %xmm10, %xmm15 #76.38 + mulsd %xmm1, %xmm15 #76.38 + mulsd %xmm1, %xmm15 #76.44 + mulsd %xmm1, %xmm15 #76.50 + mulsd %xmm13, %xmm1 #77.54 + mulsd %xmm15, %xmm1 #77.61 + subsd %xmm7, %xmm15 #77.54 + mulsd %xmm15, %xmm1 #77.67 + mulsd %xmm1, %xmm14 #78.31 + mulsd %xmm1, %xmm3 #79.31 + mulsd %xmm1, %xmm2 #80.31 + addsd %xmm14, %xmm6 #78.17 + addsd %xmm3, %xmm4 #79.17 + addsd %xmm2, %xmm0 #80.17 + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.19: # Preds ..B1.18 ..B1.17 + # Execution count [2.50e+01] + incq %rbp #56.9 + cmpq %r13, %rbp #56.9 + jb ..B1.17 # Prob 82% #56.9 + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.21: # Preds ..B1.19 ..B1.8 ..B1.15 + # Execution count [5.00e+00] + addq %r13, %r9 #93.9 + lea 1(%rbx), %eax #94.9 + shrl $31, %eax #94.9 + addsd (%rdi,%r8), %xmm6 #89.9 + addsd 8(%rdi,%r8), %xmm4 #90.9 + addsd 16(%rdi,%r8), %xmm0 #91.9 + movsd %xmm6, (%rdi,%r8) #89.9 + lea 1(%rbx,%rax), %ebx #94.9 + sarl $1, %ebx #94.9 + movslq %ebx, %rbx #94.9 + movslq %ecx, %rax #41.32 + incq %rcx #41.5 + movsd %xmm4, 8(%rdi,%r8) #90.9 + addq %rbx, %r10 #94.9 + movsd %xmm0, 16(%rdi,%r8) #91.9 + addq $24, %rdi #41.5 + incq %rax #41.32 + cmpq 104(%rsp), %rcx #41.5[spill] + jb ..B1.8 # Prob 82% #41.5 + # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r15 xmm5 xmm7 xmm10 xmm12 xmm13 +..B1.22: # Preds ..B1.21 + # Execution count [9.00e-01] + movq (%rsp), %rbp #[spill] + movq 8(%rsp), %r12 #[spill] + .cfi_restore 12 + movq 16(%rsp), %r13 #[spill] + .cfi_restore 13 + movq %r9, (%rbp) #93.9 + movq %r10, 8(%rbp) #94.9 + jmp ..B1.25 # Prob 100% #94.9 + # LOE r12 r13 +..B1.23: # Preds ..B1.1 + # Execution count [5.00e-01] + xorl %eax, %eax #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.53: +# getTimeStamp() + call getTimeStamp #38.16 +..___tag_value_computeForceLJFullNeigh_plain_c.54: + # LOE r12 r13 xmm0 +..B1.51: # Preds ..B1.23 + # Execution count [5.00e-01] + movsd %xmm0, 24(%rsp) #38.16[spill] + # LOE r12 r13 +..B1.24: # Preds ..B1.51 + # Execution count [5.00e-01] + movl $.L_2__STRING.0, %edi #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.56: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #39.5 +..___tag_value_computeForceLJFullNeigh_plain_c.57: + # LOE r12 r13 +..B1.25: # Preds ..B1.22 ..B1.24 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.58: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #97.5 +..___tag_value_computeForceLJFullNeigh_plain_c.59: + # LOE r12 r13 +..B1.26: # Preds ..B1.25 + # Execution count [1.00e+00] + xorl %eax, %eax #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.60: +# getTimeStamp() + call getTimeStamp #98.16 +..___tag_value_computeForceLJFullNeigh_plain_c.61: + # LOE r12 r13 xmm0 +..B1.27: # Preds ..B1.26 + # Execution count [1.00e+00] + subsd 24(%rsp), %xmm0 #102.14[spill] + addq $136, %rsp #102.14 + .cfi_def_cfa_offset 40 + .cfi_restore 6 + popq %rbp #102.14 + .cfi_def_cfa_offset 32 + .cfi_restore 3 + popq %rbx #102.14 + .cfi_def_cfa_offset 24 + .cfi_restore 15 + popq %r15 #102.14 + .cfi_def_cfa_offset 16 + .cfi_restore 14 + popq %r14 #102.14 + .cfi_def_cfa_offset 8 + ret #102.14 + .cfi_def_cfa_offset 176 + .cfi_offset 3, -32 + .cfi_offset 6, -40 + .cfi_offset 12, -168 + .cfi_offset 13, -160 + .cfi_offset 14, -16 + .cfi_offset 15, -24 + # LOE +..B1.28: # Preds ..B1.9 + # Execution count [4.50e-01]: Infreq + xorl %ebp, %ebp #56.9 + jmp ..B1.15 # Prob 100% #56.9 + .cfi_restore 12 + .cfi_restore 13 + # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11 r13 r15 ebx xmm0 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B1.29: # Preds ..B1.2 + # Execution count [1.00e+00]: Infreq + lea (%rbx,%rbx,2), %rdx #24.18 + cmpq $4, %rdx #32.5 + jl ..B1.45 # Prob 10% #32.5 + # LOE rdx rbx rbp rdi r12 r13 r14 r15 +..B1.30: # Preds ..B1.29 + # Execution count [1.00e+00]: Infreq + movq %rdi, %rcx #32.5 + andq $15, %rcx #32.5 + testl %ecx, %ecx #32.5 + je ..B1.33 # Prob 50% #32.5 + # LOE rdx rbx rbp rdi r12 r13 r14 r15 ecx +..B1.31: # Preds ..B1.30 + # Execution count [1.00e+00]: Infreq + testb $7, %cl #32.5 + jne ..B1.45 # Prob 10% #32.5 + # LOE rdx rbx rbp rdi r12 r13 r14 r15 +..B1.32: # Preds ..B1.31 + # Execution count [5.00e-01]: Infreq + movl $1, %ecx #32.5 + # LOE rdx rbx rbp rdi r12 r13 r14 r15 ecx +..B1.33: # Preds ..B1.32 ..B1.30 + # Execution count [1.00e+00]: Infreq + movl %ecx, %eax #32.5 + lea 4(%rax), %rsi #32.5 + cmpq %rsi, %rdx #32.5 + jl ..B1.45 # Prob 10% #32.5 + # LOE rax rdx rbx rbp rdi r12 r13 r14 r15 ecx +..B1.34: # Preds ..B1.33 + # Execution count [1.11e+00]: Infreq + movl %edx, %r9d #32.5 + movl %r9d, %esi #32.5 + subl %ecx, %esi #32.5 + andl $3, %esi #32.5 + subl %esi, %r9d #32.5 + xorl %esi, %esi #32.5 + xorl %r8d, %r8d #33.22 + testl %ecx, %ecx #32.5 + movslq %r9d, %rcx #32.5 + jbe ..B1.38 # Prob 10% #32.5 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 +..B1.36: # Preds ..B1.34 ..B1.36 + # Execution count [5.56e+00]: Infreq + movq %r8, (%rdi,%rsi,8) #33.9 + incq %rsi #32.5 + cmpq %rax, %rsi #32.5 + jb ..B1.36 # Prob 82% #32.5 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 +..B1.38: # Preds ..B1.36 ..B1.34 + # Execution count [1.00e+00]: Infreq + xorps %xmm0, %xmm0 #33.22 + # LOE rax rdx rcx rbx rbp rdi r12 r13 r14 r15 xmm0 +..B1.39: # Preds ..B1.39 ..B1.38 + # Execution count [5.56e+00]: Infreq + movups %xmm0, (%rdi,%rax,8) #33.9 + movups %xmm0, 16(%rdi,%rax,8) #33.9 + addq $4, %rax #32.5 + cmpq %rcx, %rax #32.5 + jb ..B1.39 # Prob 82% #32.5 + # LOE rax rdx rcx rbx rbp rdi r12 r13 r14 r15 xmm0 +..B1.41: # Preds ..B1.39 ..B1.45 + # Execution count [1.11e+00]: Infreq + cmpq %rdx, %rcx #32.5 + jae ..B1.5 # Prob 10% #32.5 + # LOE rdx rcx rbx rbp rdi r12 r13 r14 r15 +..B1.43: # Preds ..B1.41 ..B1.43 + # Execution count [5.56e+00]: Infreq + movq $0, (%rdi,%rcx,8) #33.9 + incq %rcx #32.5 + cmpq %rdx, %rcx #32.5 + jb ..B1.43 # Prob 82% #32.5 + jmp ..B1.5 # Prob 100% #32.5 + # LOE rdx rcx rbx rbp rdi r12 r13 r14 r15 +..B1.45: # Preds ..B1.29 ..B1.31 ..B1.33 + # Execution count [1.00e-01]: Infreq + xorl %ecx, %ecx #32.5 + jmp ..B1.41 # Prob 100% #32.5 + .align 16,0x90 + # LOE rdx rcx rbx rbp rdi r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_plain_c,@function + .size computeForceLJFullNeigh_plain_c,.-computeForceLJFullNeigh_plain_c +..LNcomputeForceLJFullNeigh_plain_c.0: + .data +# -- End computeForceLJFullNeigh_plain_c + .text +.L_2__routine_start_computeForceLJHalfNeigh_1: +# -- Begin computeForceLJHalfNeigh + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJHalfNeigh +# --- computeForceLJHalfNeigh(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJHalfNeigh: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B2.1: # Preds ..B2.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJHalfNeigh.82: +..L83: + #105.96 + pushq %r12 #105.96 + .cfi_def_cfa_offset 16 + .cfi_offset 12, -16 + pushq %r13 #105.96 + .cfi_def_cfa_offset 24 + .cfi_offset 13, -24 + pushq %r14 #105.96 + .cfi_def_cfa_offset 32 + .cfi_offset 14, -32 + pushq %r15 #105.96 + .cfi_def_cfa_offset 40 + .cfi_offset 15, -40 + pushq %rbx #105.96 + .cfi_def_cfa_offset 48 + .cfi_offset 3, -48 + pushq %rbp #105.96 + .cfi_def_cfa_offset 56 + .cfi_offset 6, -56 + subq $216, %rsp #105.96 + .cfi_def_cfa_offset 272 + movq %rdi, %r15 #105.96 + movq %rsi, %r12 #105.96 + movq %rcx, %r14 #105.96 + movq %rdx, %r13 #105.96 + movsd 144(%r15), %xmm0 #109.27 + mulsd %xmm0, %xmm0 #109.45 + movsd 56(%r15), %xmm1 #110.23 + movsd 40(%r15), %xmm2 #111.24 + movl 4(%r12), %ebp #106.18 + movsd %xmm0, 48(%rsp) #109.45[spill] + movsd %xmm1, 40(%rsp) #110.23[spill] + movsd %xmm2, 24(%rsp) #111.24[spill] + testl %ebp, %ebp #114.24 + jle ..B2.51 # Prob 50% #114.24 + # LOE r12 r13 r14 r15 ebp +..B2.2: # Preds ..B2.1 + # Execution count [5.00e-03] + movslq %ebp, %rbp #106.18 + movq 64(%r12), %rdi #115.9 + lea (%rbp,%rbp,2), %eax #106.18 + movq %rbp, 32(%rsp) #106.18[spill] + cmpl $12, %eax #114.5 + jle ..B2.59 # Prob 0% #114.5 + # LOE rbp rdi r12 r13 r14 r15 ebp +..B2.3: # Preds ..B2.2 + # Execution count [1.00e+00] + movq %rbp, %rax #114.5 + xorl %esi, %esi #114.5 + lea (%rax,%rax,2), %rdx #114.5 + shlq $3, %rdx #114.5 + call _intel_fast_memset #114.5 + # LOE r12 r13 r14 r15 ebp +..B2.5: # Preds ..B2.73 ..B2.3 ..B2.71 + # Execution count [1.00e+00] + xorl %ebx, %ebx #120.22 + xorl %eax, %eax #121.16 +..___tag_value_computeForceLJHalfNeigh.101: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.102: + # LOE r12 r13 r14 r15 ebx ebp xmm0 +..B2.80: # Preds ..B2.5 + # Execution count [1.00e+00] + movsd %xmm0, 16(%rsp) #121.16[spill] + # LOE r12 r13 r14 r15 ebx ebp +..B2.6: # Preds ..B2.80 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.104: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.105: + # LOE r12 r13 r14 r15 ebx ebp +..B2.7: # Preds ..B2.6 + # Execution count [9.00e-01] + movsd .L_2il0floatpacket.3(%rip), %xmm10 #161.41 + movd %ebp, %xmm0 #106.18 + mulsd 24(%rsp), %xmm10 #161.41[spill] + xorl %eax, %eax #124.15 + movslq 8(%r13), %rdx #125.43 + xorl %edi, %edi #124.5 + shlq $2, %rdx #107.5 + xorl %r11d, %r11d #124.5 + movddup 40(%rsp), %xmm3 #110.21[spill] + movddup %xmm10, %xmm2 #161.41 + pshufd $0, %xmm0, %xmm0 #106.18 + movq 16(%r13), %rcx #125.19 + movq %rdx, 56(%rsp) #124.5[spill] + movddup 48(%rsp), %xmm6 #109.25[spill] + movsd 40(%rsp), %xmm8 #124.5[spill] + movsd 48(%rsp), %xmm13 #124.5[spill] + movq 32(%rsp), %rdx #124.5[spill] + movups .L_2il0floatpacket.4(%rip), %xmm1 #161.54 + movsd .L_2il0floatpacket.5(%rip), %xmm7 #161.54 + movq 24(%r13), %r13 #126.25 + movq 16(%r12), %rsi #127.25 + movq 64(%r12), %r8 #168.21 + movq (%r14), %r9 #179.9 + movq 8(%r14), %r10 #180.9 + movdqu %xmm0, 160(%rsp) #124.5[spill] + movups %xmm2, 192(%rsp) #124.5[spill] + movups %xmm3, 176(%rsp) #124.5[spill] + movl %ebp, 64(%rsp) #124.5[spill] + movq %r15, (%rsp) #124.5[spill] + movq %r14, 8(%rsp) #124.5[spill] + # LOE rax rdx rcx rsi rdi r8 r9 r10 r11 r13 ebx xmm6 xmm7 xmm8 xmm10 xmm13 +..B2.8: # Preds ..B2.49 ..B2.7 + # Execution count [5.00e+00] + movl (%r13,%rdi,4), %ebp #126.25 + addl %ebp, %ebx #138.9 + xorps %xmm5, %xmm5 #130.22 + movaps %xmm5, %xmm4 #131.22 + movsd (%r11,%rsi), %xmm12 #127.25 + movaps %xmm4, %xmm2 #132.22 + movsd 8(%r11,%rsi), %xmm11 #128.25 + movsd 16(%r11,%rsi), %xmm9 #129.25 + testl %ebp, %ebp #143.9 + jle ..B2.48 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.9: # Preds ..B2.8 + # Execution count [2.50e+00] + jbe ..B2.48 # Prob 50% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.10: # Preds ..B2.9 + # Execution count [2.25e+00] + cmpl $2, %ebp #143.9 + jb ..B2.58 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.11: # Preds ..B2.10 + # Execution count [2.25e+00] + movq 56(%rsp), %r14 #125.43[spill] + movl %ebp, %r12d #143.9 + imulq %rax, %r14 #125.43 + xorps %xmm5, %xmm5 #130.22 + andl $-2, %r12d #143.9 + movaps %xmm5, %xmm4 #131.22 + movsd %xmm11, 128(%rsp) #143.9[spill] + movaps %xmm4, %xmm2 #132.22 + movsd %xmm12, 136(%rsp) #143.9[spill] + movddup %xmm12, %xmm1 #127.23 + xorl %r15d, %r15d #143.9 + movddup %xmm11, %xmm0 #128.23 + addq %rcx, %r14 #107.5 + movddup %xmm9, %xmm3 #129.23 + movslq %r12d, %r12 #143.9 + movsd %xmm9, 120(%rsp) #143.9[spill] + movsd %xmm10, 144(%rsp) #143.9[spill] + movl %ebp, 24(%rsp) #143.9[spill] + movq %r11, 72(%rsp) #143.9[spill] + movq %r10, 80(%rsp) #143.9[spill] + movq %r9, 88(%rsp) #143.9[spill] + movq %r13, 96(%rsp) #143.9[spill] + movq %rcx, 104(%rsp) #143.9[spill] + movq %rdi, 112(%rsp) #143.9[spill] + movdqu .L_2il0floatpacket.1(%rip), %xmm11 #143.9 + movdqu .L_2il0floatpacket.0(%rip), %xmm12 #143.9 + # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.12: # Preds ..B2.38 ..B2.11 + # Execution count [1.25e+01] + movq (%r14,%r15,4), %xmm7 #144.21 + movdqa %xmm12, %xmm15 #146.36 + movdqa %xmm7, %xmm8 #145.36 + paddd %xmm7, %xmm8 #145.36 + paddd %xmm7, %xmm8 #145.36 + movd %xmm8, %r9d #145.36 + paddd %xmm8, %xmm15 #146.36 + pshufd $57, %xmm8, %xmm10 #145.36 + paddd %xmm11, %xmm8 #147.36 + pshufd $57, %xmm15, %xmm13 #146.36 + movd %xmm10, %edi #145.36 + movaps %xmm1, %xmm10 #145.36 + movd %xmm15, %ebp #146.36 + movd %xmm13, %ecx #146.36 + movd %xmm8, %edx #147.36 + pshufd $57, %xmm8, %xmm8 #147.36 + movd %xmm8, %r10d #147.36 + movaps %xmm3, %xmm8 #147.36 + movslq %r9d, %r9 #145.36 + movslq %edi, %rdi #145.36 + movslq %ebp, %rbp #146.36 + movslq %ecx, %rcx #146.36 + movsd (%rsi,%r9,8), %xmm9 #145.36 + movhpd (%rsi,%rdi,8), %xmm9 #145.36 + movsd (%rsi,%rbp,8), %xmm14 #146.36 + subpd %xmm9, %xmm10 #145.36 + movhpd (%rsi,%rcx,8), %xmm14 #146.36 + movaps %xmm0, %xmm9 #146.36 + movslq %edx, %rdx #147.36 + subpd %xmm14, %xmm9 #146.36 + movslq %r10d, %r10 #147.36 + movaps %xmm9, %xmm13 #148.49 + movsd (%rsi,%rdx,8), %xmm15 #147.36 + mulpd %xmm9, %xmm13 #148.49 + movhpd (%rsi,%r10,8), %xmm15 #147.36 + subpd %xmm15, %xmm8 #147.36 + movaps %xmm10, %xmm15 #148.35 + movaps %xmm8, %xmm14 #148.63 + mulpd %xmm10, %xmm15 #148.35 + mulpd %xmm8, %xmm14 #148.63 + addpd %xmm13, %xmm15 #148.49 + addpd %xmm14, %xmm15 #148.63 + movaps %xmm15, %xmm13 #158.22 + pcmpeqd %xmm14, %xmm14 #158.22 + cmpltpd %xmm6, %xmm13 #158.22 + ptest %xmm14, %xmm13 #158.22 + je ..B2.38 # Prob 50% #158.22 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm15 +..B2.13: # Preds ..B2.12 + # Execution count [6.25e+00] + movups .L_2il0floatpacket.2(%rip), %xmm14 #159.38 + divpd %xmm15, %xmm14 #159.38 + movdqu 160(%rsp), %xmm15 #167.24[spill] + pcmpgtd %xmm7, %xmm15 #167.24 + pmovsxdq %xmm15, %xmm15 #167.24 + pcmpeqd %xmm7, %xmm7 #167.24 + andps %xmm13, %xmm15 #167.24 + ptest %xmm7, %xmm15 #167.24 + movups 176(%rsp), %xmm7 #160.38[spill] + mulpd %xmm14, %xmm7 #160.38 + mulpd %xmm14, %xmm7 #160.44 + mulpd %xmm14, %xmm7 #160.50 + mulpd 192(%rsp), %xmm14 #161.54[spill] + mulpd %xmm7, %xmm14 #161.61 + subpd .L_2il0floatpacket.4(%rip), %xmm7 #161.54 + mulpd %xmm7, %xmm14 #161.67 + mulpd %xmm14, %xmm10 #162.31 + mulpd %xmm14, %xmm9 #163.31 + mulpd %xmm14, %xmm8 #164.31 + movaps %xmm13, %xmm14 #162.31 + movaps %xmm13, %xmm7 #163.31 + andps %xmm10, %xmm14 #162.31 + andps %xmm9, %xmm7 #163.31 + andps %xmm8, %xmm13 #164.31 + addpd %xmm14, %xmm5 #162.17 + addpd %xmm7, %xmm4 #163.17 + addpd %xmm13, %xmm2 #164.17 + je ..B2.38 # Prob 50% #167.24 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 xmm11 xmm12 xmm15 +..B2.14: # Preds ..B2.13 + # Execution count [3.12e+00] + movmskpd %xmm15, %r13d #168.21 + movl %r13d, %r11d #168.21 + andl $2, %r11d #168.21 + andl $1, %r13d #168.21 + je ..B2.17 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm10 xmm11 xmm12 +..B2.15: # Preds ..B2.14 + # Execution count [3.12e+00] + movsd (%r8,%r9,8), %xmm7 #168.21 + testl %r11d, %r11d #168.21 + jne ..B2.18 # Prob 60% #168.21 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 +..B2.16: # Preds ..B2.15 + # Execution count [1.25e+00] + xorps %xmm13, %xmm13 #168.21 + unpcklpd %xmm13, %xmm7 #168.21 + subpd %xmm10, %xmm7 #168.21 + jmp ..B2.31 # Prob 100% #168.21 + # LOE rax rdx rbx rbp rsi r8 r9 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 +..B2.17: # Preds ..B2.14 + # Execution count [3.12e+00] + testl %r11d, %r11d #168.21 + xorps %xmm7, %xmm7 #168.21 + je ..B2.30 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 +..B2.18: # Preds ..B2.15 ..B2.17 + # Execution count [3.12e+00] + testl %r13d, %r13d #168.21 + movhpd (%r8,%rdi,8), %xmm7 #168.21 + subpd %xmm10, %xmm7 #168.21 + je ..B2.20 # Prob 40% #168.21 + # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 +..B2.19: # Preds ..B2.18 + # Execution count [1.88e+00] + pshufd $14, %xmm7, %xmm10 #168.21 + movsd %xmm7, (%r8,%r9,8) #168.21 + movsd %xmm10, (%r8,%rdi,8) #168.21 + movsd (%r8,%rbp,8), %xmm13 #169.21 + jmp ..B2.21 # Prob 100% #169.21 + # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 +..B2.20: # Preds ..B2.18 + # Execution count [1.25e+00] + pshufd $14, %xmm7, %xmm7 #168.21 + movsd %xmm7, (%r8,%rdi,8) #168.21 + xorps %xmm13, %xmm13 #169.21 + # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 +..B2.21: # Preds ..B2.19 ..B2.20 + # Execution count [1.88e+00] + testl %r11d, %r11d #169.21 + je ..B2.84 # Prob 40% #169.21 + # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 +..B2.22: # Preds ..B2.21 + # Execution count [3.12e+00] + testl %r13d, %r13d #169.21 + movhpd (%r8,%rcx,8), %xmm13 #169.21 + subpd %xmm9, %xmm13 #169.21 + je ..B2.24 # Prob 40% #169.21 + # LOE rax rdx rcx rbx rbp rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm11 xmm12 xmm13 +..B2.23: # Preds ..B2.22 + # Execution count [1.88e+00] + pshufd $14, %xmm13, %xmm7 #169.21 + movsd %xmm13, (%r8,%rbp,8) #169.21 + movsd %xmm7, (%r8,%rcx,8) #169.21 + movsd (%r8,%rdx,8), %xmm9 #170.21 + jmp ..B2.25 # Prob 100% #170.21 + # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 +..B2.24: # Preds ..B2.22 + # Execution count [1.25e+00] + pshufd $14, %xmm13, %xmm7 #169.21 + movsd %xmm7, (%r8,%rcx,8) #169.21 + xorps %xmm9, %xmm9 #170.21 + # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 r11d r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 +..B2.25: # Preds ..B2.23 ..B2.24 + # Execution count [1.88e+00] + testl %r11d, %r11d #170.21 + je ..B2.83 # Prob 40% #170.21 + # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 r13d xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 +..B2.26: # Preds ..B2.25 + # Execution count [3.12e+00] + testl %r13d, %r13d #170.21 + movhpd (%r8,%r10,8), %xmm9 #170.21 + subpd %xmm8, %xmm9 #170.21 + je ..B2.28 # Prob 40% #170.21 + # LOE rax rdx rbx rsi r8 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.27: # Preds ..B2.26 + # Execution count [1.88e+00] + movsd %xmm9, (%r8,%rdx,8) #170.21 + pshufd $14, %xmm9, %xmm7 #170.21 + jmp ..B2.29 # Prob 100% #170.21 + # LOE rax rbx rsi r8 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm11 xmm12 +..B2.28: # Preds ..B2.26 + # Execution count [1.25e+00] + pshufd $14, %xmm9, %xmm7 #170.21 + # LOE rax rbx rsi r8 r10 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm11 xmm12 +..B2.29: # Preds ..B2.27 ..B2.28 + # Execution count [3.12e+00] + movsd %xmm7, (%r8,%r10,8) #170.21 + jmp ..B2.38 # Prob 100% #170.21 + # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.30: # Preds ..B2.17 + # Execution count [1.88e+00] + testl %r13d, %r13d #168.21 + xorps %xmm7, %xmm7 #168.21 + subpd %xmm10, %xmm7 #168.21 + je ..B2.32 # Prob 40% #168.21 + # LOE rax rdx rbx rbp rsi r8 r9 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11 xmm12 +..B2.31: # Preds ..B2.16 ..B2.30 + # Execution count [1.25e+00] + movsd %xmm7, (%r8,%r9,8) #168.21 + movsd (%r8,%rbp,8), %xmm13 #169.21 + xorps %xmm10, %xmm10 #169.21 + unpcklpd %xmm10, %xmm13 #169.21 + subpd %xmm9, %xmm13 #169.21 + jmp ..B2.34 # Prob 100% #169.21 + # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm11 xmm12 xmm13 +..B2.32: # Preds ..B2.30 + # Execution count [0.00e+00] + xorps %xmm13, %xmm13 #169.21 + jmp ..B2.33 # Prob 100% #169.21 + # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 +..B2.84: # Preds ..B2.21 + # Execution count [7.50e-01] + testl %r13d, %r13d #168.21 + # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 xmm13 +..B2.33: # Preds ..B2.32 ..B2.84 + # Execution count [2.67e+00] + xorps %xmm7, %xmm7 #169.21 + unpcklpd %xmm7, %xmm13 #169.21 + subpd %xmm9, %xmm13 #169.21 + je ..B2.35 # Prob 40% #169.21 + # LOE rax rdx rbx rbp rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm11 xmm12 xmm13 +..B2.34: # Preds ..B2.31 ..B2.33 + # Execution count [1.25e+00] + movsd %xmm13, (%r8,%rbp,8) #169.21 + movsd (%r8,%rdx,8), %xmm9 #170.21 + xorps %xmm7, %xmm7 #170.21 + unpcklpd %xmm7, %xmm9 #170.21 + subpd %xmm8, %xmm9 #170.21 + jmp ..B2.37 # Prob 100% #170.21 + # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.35: # Preds ..B2.33 + # Execution count [0.00e+00] + xorps %xmm9, %xmm9 #170.21 + jmp ..B2.36 # Prob 100% #170.21 + # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 +..B2.83: # Preds ..B2.25 + # Execution count [7.50e-01] + testl %r13d, %r13d #168.21 + # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm8 xmm9 xmm11 xmm12 +..B2.36: # Preds ..B2.35 ..B2.83 + # Execution count [2.67e+00] + xorps %xmm7, %xmm7 #170.21 + unpcklpd %xmm7, %xmm9 #170.21 + subpd %xmm8, %xmm9 #170.21 + je ..B2.38 # Prob 40% #170.21 + # LOE rax rdx rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm9 xmm11 xmm12 +..B2.37: # Preds ..B2.34 ..B2.36 + # Execution count [1.25e+00] + movsd %xmm9, (%r8,%rdx,8) #170.21 + # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.38: # Preds ..B2.36 ..B2.29 ..B2.37 ..B2.13 ..B2.12 + # + # Execution count [1.25e+01] + addq $2, %r15 #143.9 + cmpq %r12, %r15 #143.9 + jb ..B2.12 # Prob 82% #143.9 + # LOE rax rbx rsi r8 r12 r14 r15 xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm11 xmm12 +..B2.39: # Preds ..B2.38 + # Execution count [2.25e+00] + movaps %xmm2, %xmm0 #132.22 + movaps %xmm4, %xmm1 #131.22 + movaps %xmm5, %xmm3 #130.22 + unpckhpd %xmm2, %xmm0 #132.22 + unpckhpd %xmm4, %xmm1 #131.22 + addsd %xmm0, %xmm2 #132.22 + addsd %xmm1, %xmm4 #131.22 + unpckhpd %xmm5, %xmm3 #130.22 + movsd 120(%rsp), %xmm9 #[spill] + addsd %xmm3, %xmm5 #130.22 + movsd 128(%rsp), %xmm11 #[spill] + movsd 136(%rsp), %xmm12 #[spill] + movsd 144(%rsp), %xmm10 #[spill] + movsd 40(%rsp), %xmm8 #[spill] + movsd 48(%rsp), %xmm13 #[spill] + movl 24(%rsp), %ebp #[spill] + movq 72(%rsp), %r11 #[spill] + movq 80(%rsp), %r10 #[spill] + movq 88(%rsp), %r9 #[spill] + movq 96(%rsp), %r13 #[spill] + movq 104(%rsp), %rcx #[spill] + movq 112(%rsp), %rdi #[spill] + movq 32(%rsp), %rdx #[spill] + movsd .L_2il0floatpacket.5(%rip), %xmm7 # + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.40: # Preds ..B2.39 ..B2.58 + # Execution count [2.50e+00] + movslq %ebp, %r14 #143.9 + cmpq %r14, %r12 #143.9 + jae ..B2.49 # Prob 10% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 r14 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.41: # Preds ..B2.40 + # Execution count [2.25e+00] + imulq 56(%rsp), %rax #125.43[spill] + movl 64(%rsp), %edx #107.5[spill] + addq %rcx, %rax #107.5 + movq %rdi, 112(%rsp) #107.5[spill] + # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edx ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.42: # Preds ..B2.45 ..B2.41 + # Execution count [1.25e+01] + movl (%rax,%r12,4), %edi #144.21 + movaps %xmm12, %xmm14 #145.36 + movaps %xmm11, %xmm3 #146.36 + movaps %xmm9, %xmm1 #147.36 + lea (%rdi,%rdi,2), %r15d #145.36 + movslq %r15d, %r15 #145.36 + subsd (%rsi,%r15,8), %xmm14 #145.36 + subsd 8(%rsi,%r15,8), %xmm3 #146.36 + subsd 16(%rsi,%r15,8), %xmm1 #147.36 + movaps %xmm14, %xmm15 #148.35 + movaps %xmm3, %xmm0 #148.49 + mulsd %xmm14, %xmm15 #148.35 + mulsd %xmm3, %xmm0 #148.49 + addsd %xmm0, %xmm15 #148.49 + movaps %xmm1, %xmm0 #148.63 + mulsd %xmm1, %xmm0 #148.63 + addsd %xmm0, %xmm15 #148.63 + comisd %xmm15, %xmm13 #158.22 + jbe ..B2.45 # Prob 50% #158.22 + # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 edx ebp edi xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15 +..B2.43: # Preds ..B2.42 + # Execution count [6.25e+00] + movsd .L_2il0floatpacket.6(%rip), %xmm0 #159.38 + divsd %xmm15, %xmm0 #159.38 + movaps %xmm8, %xmm15 #160.38 + mulsd %xmm0, %xmm15 #160.38 + mulsd %xmm0, %xmm15 #160.44 + mulsd %xmm0, %xmm15 #160.50 + mulsd %xmm10, %xmm0 #161.54 + mulsd %xmm15, %xmm0 #161.61 + subsd %xmm7, %xmm15 #161.54 + mulsd %xmm15, %xmm0 #161.67 + mulsd %xmm0, %xmm14 #162.31 + mulsd %xmm0, %xmm3 #163.31 + mulsd %xmm0, %xmm1 #164.31 + addsd %xmm14, %xmm5 #162.17 + addsd %xmm3, %xmm4 #163.17 + addsd %xmm1, %xmm2 #164.17 + cmpl %edx, %edi #167.24 + jge ..B2.45 # Prob 50% #167.24 + # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 r15 edx ebp xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 +..B2.44: # Preds ..B2.43 + # Execution count [3.12e+00] + movsd (%r8,%r15,8), %xmm0 #168.21 + subsd %xmm14, %xmm0 #168.21 + movsd 8(%r8,%r15,8), %xmm14 #169.21 + subsd %xmm3, %xmm14 #169.21 + movsd 16(%r8,%r15,8), %xmm3 #170.21 + movsd %xmm0, (%r8,%r15,8) #168.21 + subsd %xmm1, %xmm3 #170.21 + movsd %xmm14, 8(%r8,%r15,8) #169.21 + movsd %xmm3, 16(%r8,%r15,8) #170.21 + # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edx ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.45: # Preds ..B2.44 ..B2.43 ..B2.42 + # Execution count [1.25e+01] + incq %r12 #143.9 + cmpq %r14, %r12 #143.9 + jb ..B2.42 # Prob 82% #143.9 + # LOE rax rcx rbx rsi r8 r9 r10 r11 r12 r13 r14 edx ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.46: # Preds ..B2.45 + # Execution count [2.25e+00] + movq 112(%rsp), %rdi #[spill] + movq 32(%rsp), %rdx #[spill] + jmp ..B2.49 # Prob 100% # + # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 r14 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm13 +..B2.48: # Preds ..B2.9 ..B2.8 + # Execution count [2.50e+00] + movslq %ebp, %r14 #179.9 + # LOE rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 r14 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm10 xmm13 +..B2.49: # Preds ..B2.46 ..B2.40 ..B2.48 + # Execution count [5.00e+00] + addq %r14, %r9 #179.9 + lea 1(%rbp), %eax #180.9 + shrl $31, %eax #180.9 + addsd (%r11,%r8), %xmm5 #175.9 + addsd 8(%r11,%r8), %xmm4 #176.9 + addsd 16(%r11,%r8), %xmm2 #177.9 + movsd %xmm5, (%r11,%r8) #175.9 + lea 1(%rbp,%rax), %ebp #180.9 + sarl $1, %ebp #180.9 + movslq %ebp, %rbp #180.9 + movslq %edi, %rax #124.32 + incq %rdi #124.5 + movsd %xmm4, 8(%r11,%r8) #176.9 + addq %rbp, %r10 #180.9 + movsd %xmm2, 16(%r11,%r8) #177.9 + addq $24, %r11 #124.5 + incq %rax #124.32 + cmpq %rdx, %rdi #124.5 + jb ..B2.8 # Prob 82% #124.5 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r13 xmm6 xmm7 xmm8 xmm10 xmm13 +..B2.50: # Preds ..B2.49 + # Execution count [9.00e-01] + movq 8(%rsp), %r14 #[spill] + movq (%rsp), %r15 #[spill] + movq %r9, (%r14) #179.9 + movq %r10, 8(%r14) #180.9 + jmp ..B2.54 # Prob 100% #180.9 + # LOE rbx r15 +..B2.51: # Preds ..B2.1 + # Execution count [5.00e-01] + xorl %ebx, %ebx #120.22 + xorl %eax, %eax #121.16 +..___tag_value_computeForceLJHalfNeigh.155: +# getTimeStamp() + call getTimeStamp #121.16 +..___tag_value_computeForceLJHalfNeigh.156: + # LOE rbx r15 xmm0 +..B2.81: # Preds ..B2.51 + # Execution count [5.00e-01] + movsd %xmm0, 16(%rsp) #121.16[spill] + # LOE rbx r15 +..B2.52: # Preds ..B2.81 + # Execution count [5.00e-01] + movl $.L_2__STRING.1, %edi #122.5 +..___tag_value_computeForceLJHalfNeigh.158: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #122.5 +..___tag_value_computeForceLJHalfNeigh.159: + # LOE rbx r15 +..B2.54: # Preds ..B2.52 ..B2.50 + # Execution count [1.00e+00] + movl $.L_2__STRING.1, %edi #183.5 +..___tag_value_computeForceLJHalfNeigh.160: +# likwid_markerStopRegion(const char *) + call likwid_markerStopRegion #183.5 +..___tag_value_computeForceLJHalfNeigh.161: + # LOE rbx r15 +..B2.55: # Preds ..B2.54 + # Execution count [1.00e+00] + xorl %eax, %eax #184.16 +..___tag_value_computeForceLJHalfNeigh.162: +# getTimeStamp() + call getTimeStamp #184.16 +..___tag_value_computeForceLJHalfNeigh.163: + # LOE rbx r15 xmm0 +..B2.82: # Preds ..B2.55 + # Execution count [1.00e+00] + movaps %xmm0, %xmm1 #184.16 + # LOE rbx r15 xmm1 +..B2.56: # Preds ..B2.82 + # Execution count [1.00e+00] + xorps %xmm3, %xmm3 #185.5 + cvtsi2sdq %rbx, %xmm3 #185.5 + subsd 16(%rsp), %xmm1 #185.94[spill] + movsd .L_2il0floatpacket.7(%rip), %xmm2 #185.5 + movl $.L_2__STRING.2, %edi #185.5 + divsd %xmm3, %xmm2 #185.5 + mulsd %xmm1, %xmm2 #185.5 + movl %ebx, %esi #185.5 + movsd 264(%r15), %xmm0 #185.74 + movl $3, %eax #185.5 + mulsd %xmm0, %xmm2 #185.5 + movsd %xmm1, (%rsp) #185.5[spill] +..___tag_value_computeForceLJHalfNeigh.165: +# printf(const char *__restrict__, ...) + call printf #185.5 +..___tag_value_computeForceLJHalfNeigh.166: + # LOE +..B2.57: # Preds ..B2.56 + # Execution count [1.00e+00] + movsd (%rsp), %xmm1 #[spill] + movaps %xmm1, %xmm0 #186.14 + addq $216, %rsp #186.14 + .cfi_def_cfa_offset 56 + .cfi_restore 6 + popq %rbp #186.14 + .cfi_def_cfa_offset 48 + .cfi_restore 3 + popq %rbx #186.14 + .cfi_def_cfa_offset 40 + .cfi_restore 15 + popq %r15 #186.14 + .cfi_def_cfa_offset 32 + .cfi_restore 14 + popq %r14 #186.14 + .cfi_def_cfa_offset 24 + .cfi_restore 13 + popq %r13 #186.14 + .cfi_def_cfa_offset 16 + .cfi_restore 12 + popq %r12 #186.14 + .cfi_def_cfa_offset 8 + ret #186.14 + .cfi_def_cfa_offset 272 + .cfi_offset 3, -48 + .cfi_offset 6, -56 + .cfi_offset 12, -16 + .cfi_offset 13, -24 + .cfi_offset 14, -32 + .cfi_offset 15, -40 + # LOE +..B2.58: # Preds ..B2.10 + # Execution count [2.25e-01]: Infreq + xorl %r12d, %r12d #143.9 + jmp ..B2.40 # Prob 100% #143.9 + # LOE rax rdx rcx rbx rsi rdi r8 r9 r10 r11 r12 r13 ebp xmm2 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 +..B2.59: # Preds ..B2.2 + # Execution count [1.00e+00]: Infreq + movq %rbp, %rax #106.18 + lea (%rax,%rax,2), %rax #106.18 + cmpq $4, %rax #114.5 + jl ..B2.75 # Prob 10% #114.5 + # LOE rax rdi r12 r13 r14 r15 ebp +..B2.60: # Preds ..B2.59 + # Execution count [1.00e+00]: Infreq + movq %rdi, %rdx #114.5 + andq $15, %rdx #114.5 + testl %edx, %edx #114.5 + je ..B2.63 # Prob 50% #114.5 + # LOE rax rdi r12 r13 r14 r15 edx ebp +..B2.61: # Preds ..B2.60 + # Execution count [1.00e+00]: Infreq + testb $7, %dl #114.5 + jne ..B2.75 # Prob 10% #114.5 + # LOE rax rdi r12 r13 r14 r15 ebp +..B2.62: # Preds ..B2.61 + # Execution count [5.00e-01]: Infreq + movl $1, %edx #114.5 + # LOE rax rdi r12 r13 r14 r15 edx ebp +..B2.63: # Preds ..B2.62 ..B2.60 + # Execution count [1.00e+00]: Infreq + movl %edx, %esi #114.5 + lea 4(%rsi), %rcx #114.5 + cmpq %rcx, %rax #114.5 + jl ..B2.75 # Prob 10% #114.5 + # LOE rax rsi rdi r12 r13 r14 r15 edx ebp +..B2.64: # Preds ..B2.63 + # Execution count [1.11e+00]: Infreq + movl %eax, %r8d #114.5 + movl %r8d, %ecx #114.5 + subl %edx, %ecx #114.5 + andl $3, %ecx #114.5 + subl %ecx, %r8d #114.5 + xorl %ecx, %ecx #114.5 + xorl %ebx, %ebx #115.22 + testl %edx, %edx #114.5 + movslq %r8d, %rdx #114.5 + jbe ..B2.68 # Prob 10% #114.5 + # LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ebp +..B2.66: # Preds ..B2.64 ..B2.66 + # Execution count [5.56e+00]: Infreq + movq %rbx, (%rdi,%rcx,8) #115.9 + incq %rcx #114.5 + cmpq %rsi, %rcx #114.5 + jb ..B2.66 # Prob 82% #114.5 + # LOE rax rdx rcx rbx rsi rdi r12 r13 r14 r15 ebp +..B2.68: # Preds ..B2.66 ..B2.64 + # Execution count [1.00e+00]: Infreq + xorps %xmm0, %xmm0 #115.22 + # LOE rax rdx rsi rdi r12 r13 r14 r15 ebp xmm0 +..B2.69: # Preds ..B2.69 ..B2.68 + # Execution count [5.56e+00]: Infreq + movups %xmm0, (%rdi,%rsi,8) #115.9 + movups %xmm0, 16(%rdi,%rsi,8) #115.9 + addq $4, %rsi #114.5 + cmpq %rdx, %rsi #114.5 + jb ..B2.69 # Prob 82% #114.5 + # LOE rax rdx rsi rdi r12 r13 r14 r15 ebp xmm0 +..B2.71: # Preds ..B2.69 ..B2.75 + # Execution count [1.11e+00]: Infreq + cmpq %rax, %rdx #114.5 + jae ..B2.5 # Prob 10% #114.5 + # LOE rax rdx rdi r12 r13 r14 r15 ebp +..B2.73: # Preds ..B2.71 ..B2.73 + # Execution count [5.56e+00]: Infreq + movq $0, (%rdi,%rdx,8) #115.9 + incq %rdx #114.5 + cmpq %rax, %rdx #114.5 + jb ..B2.73 # Prob 82% #114.5 + jmp ..B2.5 # Prob 100% #114.5 + # LOE rax rdx rdi r12 r13 r14 r15 ebp +..B2.75: # Preds ..B2.59 ..B2.61 ..B2.63 + # Execution count [1.00e-01]: Infreq + xorl %edx, %edx #114.5 + jmp ..B2.71 # Prob 100% #114.5 + .align 16,0x90 + # LOE rax rdx rdi r12 r13 r14 r15 ebp + .cfi_endproc +# mark_end; + .type computeForceLJHalfNeigh,@function + .size computeForceLJHalfNeigh,.-computeForceLJHalfNeigh +..LNcomputeForceLJHalfNeigh.1: + .data +# -- End computeForceLJHalfNeigh + .text +.L_2__routine_start_computeForceLJFullNeigh_simd_2: +# -- Begin computeForceLJFullNeigh_simd + .text +# mark_begin; + .align 16,0x90 + .globl computeForceLJFullNeigh_simd +# --- computeForceLJFullNeigh_simd(Parameter *, Atom *, Neighbor *, Stats *) +computeForceLJFullNeigh_simd: +# parameter 1: %rdi +# parameter 2: %rsi +# parameter 3: %rdx +# parameter 4: %rcx +..B3.1: # Preds ..B3.0 + # Execution count [1.00e+00] + .cfi_startproc +..___tag_value_computeForceLJFullNeigh_simd.190: +..L191: + #189.101 + pushq %rsi #189.101 + .cfi_def_cfa_offset 16 + movl 4(%rsi), %edx #190.18 + testl %edx, %edx #196.24 + jle ..B3.4 # Prob 50% #196.24 + # LOE rbx rbp rsi r12 r13 r14 r15 edx +..B3.2: # Preds ..B3.1 + # Execution count [5.00e-03] + movq 64(%rsi), %rdi #197.9 + lea (%rdx,%rdx,2), %eax #190.18 + cmpl $12, %eax #196.5 + jle ..B3.8 # Prob 0% #196.5 + # LOE rbx rbp rdi r12 r13 r14 r15 edx +..B3.3: # Preds ..B3.2 + # Execution count [1.00e+00] + movslq %edx, %rdx #196.5 + xorl %esi, %esi #196.5 + lea (%rdx,%rdx,2), %rdx #196.5 + shlq $3, %rdx #196.5 + call _intel_fast_memset #196.5 + # LOE rbx rbp r12 r13 r14 r15 +..B3.4: # Preds ..B3.22 ..B3.1 ..B3.20 ..B3.3 + # Execution count [1.00e+00] + xorl %eax, %eax #203.16 +..___tag_value_computeForceLJFullNeigh_simd.193: +# getTimeStamp() + call getTimeStamp #203.16 +..___tag_value_computeForceLJFullNeigh_simd.194: + # LOE rbx rbp r12 r13 r14 r15 +..B3.5: # Preds ..B3.4 + # Execution count [1.00e+00] + movl $.L_2__STRING.0, %edi #204.5 +..___tag_value_computeForceLJFullNeigh_simd.195: +# likwid_markerStartRegion(const char *) + call likwid_markerStartRegion #204.5 +..___tag_value_computeForceLJFullNeigh_simd.196: + # LOE +..B3.6: # Preds ..B3.5 + # Execution count [1.00e+00] + movl $il0_peep_printf_format_0, %edi #207.5 + movq stderr(%rip), %rsi #207.5 + call fputs #207.5 + # LOE +..B3.7: # Preds ..B3.6 + # Execution count [1.00e+00] + movl $-1, %edi #208.5 +# exit(int) + call exit #208.5 + # LOE +..B3.8: # Preds ..B3.2 + # Execution count [1.00e+00]: Infreq + movslq %edx, %rdx #196.5 + lea (%rdx,%rdx,2), %rcx #190.18 + cmpq $4, %rcx #196.5 + jl ..B3.24 # Prob 10% #196.5 + # LOE rcx rbx rbp rdi r12 r13 r14 r15 +..B3.9: # Preds ..B3.8 + # Execution count [1.00e+00]: Infreq + movq %rdi, %rdx #196.5 + andq $15, %rdx #196.5 + testl %edx, %edx #196.5 + je ..B3.12 # Prob 50% #196.5 + # LOE rcx rbx rbp rdi r12 r13 r14 r15 edx +..B3.10: # Preds ..B3.9 + # Execution count [1.00e+00]: Infreq + testb $7, %dl #196.5 + jne ..B3.24 # Prob 10% #196.5 + # LOE rcx rbx rbp rdi r12 r13 r14 r15 +..B3.11: # Preds ..B3.10 + # Execution count [5.00e-01]: Infreq + movl $1, %edx #196.5 + # LOE rcx rbx rbp rdi r12 r13 r14 r15 edx +..B3.12: # Preds ..B3.11 ..B3.9 + # Execution count [1.00e+00]: Infreq + movl %edx, %eax #196.5 + lea 4(%rax), %rsi #196.5 + cmpq %rsi, %rcx #196.5 + jl ..B3.24 # Prob 10% #196.5 + # LOE rax rcx rbx rbp rdi r12 r13 r14 r15 edx +..B3.13: # Preds ..B3.12 + # Execution count [1.11e+00]: Infreq + movl %ecx, %r8d #196.5 + xorl %r9d, %r9d #196.5 + movl %r8d, %esi #196.5 + subl %edx, %esi #196.5 + andl $3, %esi #196.5 + subl %esi, %r8d #196.5 + xorl %esi, %esi #196.5 + movslq %r8d, %r8 #196.5 + testl %edx, %edx #196.5 + jbe ..B3.17 # Prob 10% #196.5 + # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 +..B3.15: # Preds ..B3.13 ..B3.15 + # Execution count [5.56e+00]: Infreq + movq %rsi, (%rdi,%r9,8) #197.9 + incq %r9 #196.5 + cmpq %rax, %r9 #196.5 + jb ..B3.15 # Prob 82% #196.5 + # LOE rax rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 +..B3.17: # Preds ..B3.15 ..B3.13 + # Execution count [1.00e+00]: Infreq + xorps %xmm0, %xmm0 #197.22 + # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 xmm0 +..B3.18: # Preds ..B3.18 ..B3.17 + # Execution count [5.56e+00]: Infreq + movups %xmm0, (%rdi,%rax,8) #197.9 + movups %xmm0, 16(%rdi,%rax,8) #197.9 + addq $4, %rax #196.5 + cmpq %r8, %rax #196.5 + jb ..B3.18 # Prob 82% #196.5 + # LOE rax rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 xmm0 +..B3.20: # Preds ..B3.18 ..B3.24 + # Execution count [1.11e+00]: Infreq + cmpq %rcx, %r8 #196.5 + jae ..B3.4 # Prob 10% #196.5 + # LOE rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 +..B3.22: # Preds ..B3.20 ..B3.22 + # Execution count [5.56e+00]: Infreq + movq %rsi, (%rdi,%r8,8) #197.9 + incq %r8 #196.5 + cmpq %rcx, %r8 #196.5 + jb ..B3.22 # Prob 82% #196.5 + jmp ..B3.4 # Prob 100% #196.5 + # LOE rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 +..B3.24: # Preds ..B3.8 ..B3.10 ..B3.12 + # Execution count [1.00e-01]: Infreq + xorl %r8d, %r8d #196.5 + xorl %esi, %esi #196.5 + jmp ..B3.20 # Prob 100% #196.5 + .align 16,0x90 + # LOE rcx rbx rbp rsi rdi r8 r12 r13 r14 r15 + .cfi_endproc +# mark_end; + .type computeForceLJFullNeigh_simd,@function + .size computeForceLJFullNeigh_simd,.-computeForceLJFullNeigh_simd +..LNcomputeForceLJFullNeigh_simd.2: + .section .rodata.str1.32, "aMS",@progbits,1 + .align 32 + .align 32 +il0_peep_printf_format_0: + .long 1869771333 + .long 1394621042 + .long 541347145 + .long 1852990827 + .long 1847618661 + .long 1763734639 + .long 1701605485 + .long 1953391981 + .long 1713398885 + .long 1931506287 + .long 1768121712 + .long 1684367718 + .long 1936615712 + .long 1668641396 + .long 1852795252 + .long 1952805664 + .word 33 + .data +# -- End computeForceLJFullNeigh_simd + .section .rodata, "a" + .align 16 + .align 16 +.L_2il0floatpacket.0: + .long 0x00000001,0x00000001,0x00000001,0x00000001 + .type .L_2il0floatpacket.0,@object + .size .L_2il0floatpacket.0,16 + .align 16 +.L_2il0floatpacket.1: + .long 0x00000002,0x00000002,0x00000002,0x00000002 + .type .L_2il0floatpacket.1,@object + .size .L_2il0floatpacket.1,16 + .align 16 +.L_2il0floatpacket.2: + .long 0x00000000,0x3ff00000,0x00000000,0x3ff00000 + .type .L_2il0floatpacket.2,@object + .size .L_2il0floatpacket.2,16 + .align 16 +.L_2il0floatpacket.4: + .long 0x00000000,0x3fe00000,0x00000000,0x3fe00000 + .type .L_2il0floatpacket.4,@object + .size .L_2il0floatpacket.4,16 + .align 8 +.L_2il0floatpacket.3: + .long 0x00000000,0x40480000 + .type .L_2il0floatpacket.3,@object + .size .L_2il0floatpacket.3,8 + .align 8 +.L_2il0floatpacket.5: + .long 0x00000000,0x3fe00000 + .type .L_2il0floatpacket.5,@object + .size .L_2il0floatpacket.5,8 + .align 8 +.L_2il0floatpacket.6: + .long 0x00000000,0x3ff00000 + .type .L_2il0floatpacket.6,@object + .size .L_2il0floatpacket.6,8 + .align 8 +.L_2il0floatpacket.7: + .long 0x00000000,0x41cdcd65 + .type .L_2il0floatpacket.7,@object + .size .L_2il0floatpacket.7,8 + .section .rodata.str1.4, "aMS",@progbits,1 + .align 4 + .align 4 +.L_2__STRING.0: + .long 1668444006 + .word 101 + .type .L_2__STRING.0,@object + .size .L_2__STRING.0,6 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.1: + .long 1668444006 + .long 759843941 + .long 1718378856 + .long 1734960494 + .word 104 + .type .L_2__STRING.1,@object + .size .L_2__STRING.1,18 + .space 2, 0x00 # pad + .align 4 +.L_2__STRING.2: + .long 980644937 + .long 544548128 + .long 1701987872 + .long 622869105 + .long 1411391590 + .long 979725673 + .long 174466336 + .long 1764718915 + .long 622869108 + .long 1747460198 + .long 761687137 + .long 1734960494 + .long 665960 + .type .L_2__STRING.2,@object + .size .L_2__STRING.2,52 + .data + .section .note.GNU-stack, "" +# End diff --git a/static_analysis/jan/icx-icx-gromacs-avx512-sp.s b/static_analysis/jan/icx-icx-gromacs-avx512-sp.s new file mode 100644 index 0000000..69698ef --- /dev/null +++ b/static_analysis/jan/icx-icx-gromacs-avx512-sp.s @@ -0,0 +1,2103 @@ + .text + .file "force_lj.c" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_ref +.LCPI0_0: + .quad 0x4048000000000000 # 48 +.LCPI0_2: + .quad 0xbfe0000000000000 # -0.5 +.LCPI0_3: + .quad 0x3fe0000000000000 # 0.5 + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 +.LCPI0_1: + .long 0x3f800000 # 1 + .text + .globl computeForceLJ_ref + .p2align 4, 0x90 + .type computeForceLJ_ref,@function +computeForceLJ_ref: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $152, %rsp + .cfi_def_cfa_offset 208 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 16(%rsp) # 8-byte Spill + movq %rdx, 32(%rsp) # 8-byte Spill + movq %rsi, %r14 + movq %rdi, %r12 + movl $.L.str, %edi + xorl %eax, %eax + callq debug_printf + vmovss 108(%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss %xmm0, 8(%rsp) # 4-byte Spill + vmovss 40(%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss %xmm0, (%rsp) # 4-byte Spill + vmovss 48(%r12), %xmm2 # xmm2 = mem[0],zero,zero,zero + movl 20(%r14), %r11d + testl %r11d, %r11d + jle .LBB0_5 +# %bb.1: + movq 176(%r14), %r9 + movq 192(%r14), %r10 + decq %r11 + leaq 64(%r9), %r8 + xorl %edi, %edi + vxorps %xmm0, %xmm0, %xmm0 + vxorps %xmm1, %xmm1, %xmm1 + jmp .LBB0_2 + .p2align 5, 0x90 +.LBB0_79: # in Loop: Header=BB0_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB0_5 +.LBB0_2: # =>This Loop Header: Depth=1 + # Child Loop BB0_74 Depth 2 + # Child Loop BB0_78 Depth 2 + leaq (%rdi,%rdi,8), %rax + leaq (%rax,%rax,2), %rax + addq %rdi, %rax + movl (%r10,%rax), %esi + testl %esi, %esi + jle .LBB0_79 +# %bb.3: # in Loop: Header=BB0_2 Depth=1 + leal (,%rdi,4), %ebx + movl %ebx, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %ebx + orl %eax, %ebx + cmpl $7, %esi + ja .LBB0_73 +# %bb.4: # in Loop: Header=BB0_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + cmpq %rsi, %rbp + jae .LBB0_79 + jmp .LBB0_77 + .p2align 5, 0x90 +.LBB0_73: # in Loop: Header=BB0_2 Depth=1 + leaq (,%rsi,4), %rbp + andq $-32, %rbp + movl %ebx, %ecx + leaq (%r9,%rcx,4), %rdx + xorl %eax, %eax + .p2align 4, 0x90 +.LBB0_74: # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, (%rdx,%rax) + addq $32, %rax + cmpq %rax, %rbp + jne .LBB0_74 +# %bb.75: # in Loop: Header=BB0_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + addq %rbp, %rcx + vmovups %zmm1, (%r9,%rcx,4) + cmpq %rsi, %rbp + jae .LBB0_79 +.LBB0_77: # in Loop: Header=BB0_2 Depth=1 + movl %ebx, %eax + leaq (%r8,%rax,4), %rcx + .p2align 4, 0x90 +.LBB0_78: # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movl $0, -64(%rcx,%rbp,4) + movl $0, -32(%rcx,%rbp,4) + movl $0, (%rcx,%rbp,4) + incq %rbp + cmpq %rbp, %rsi + jne .LBB0_78 + jmp .LBB0_79 + .p2align 5, 0x90 +.LBB0_5: + vmovss %xmm2, 28(%rsp) # 4-byte Spill + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 56(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + movl 20(%r14), %esi + testl %esi, %esi + jle .LBB0_17 +# %bb.6: + vmovss 8(%rsp), %xmm0 # 4-byte Reload + # xmm0 = mem[0],zero,zero,zero + vmulss %xmm0, %xmm0, %xmm13 + movq 16(%rsp), %rax # 8-byte Reload + leaq 32(%rax), %r15 + vmovss (%rsp), %xmm0 # 4-byte Reload + # xmm0 = mem[0],zero,zero,zero + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd .LCPI0_0(%rip), %xmm0, %xmm12 + leaq 24(%rax), %rdx + movq 160(%r14), %rdi + movq 176(%r14), %rbp + movq 32(%rsp), %rcx # 8-byte Reload + movq 8(%rcx), %rbx + movq %rbx, 72(%rsp) # 8-byte Spill + movq 24(%rcx), %rbx + movq %rbx, 96(%rsp) # 8-byte Spill + movslq 16(%rcx), %rcx + movq %rcx, 64(%rsp) # 8-byte Spill + vmovdqu 8(%rax), %xmm9 + leal -1(%rsi), %ecx + addq (%rax), %rcx + movq %rcx, 48(%rsp) # 8-byte Spill + movq %rbp, 80(%rsp) # 8-byte Spill + leaq 64(%rbp), %rax + movq %rax, 128(%rsp) # 8-byte Spill + movq %rdi, 40(%rsp) # 8-byte Spill + leaq 64(%rdi), %rax + movq %rax, 120(%rsp) # 8-byte Spill + xorl %edi, %edi + vmovss .LCPI0_1(%rip), %xmm10 # xmm10 = mem[0],zero,zero,zero + vmovsd .LCPI0_2(%rip), %xmm11 # xmm11 = mem[0],zero + vmovsd .LCPI0_3(%rip), %xmm8 # xmm8 = mem[0],zero + vmovss 28(%rsp), %xmm20 # 4-byte Reload + # xmm20 = mem[0],zero,zero,zero + movq %rsi, 88(%rsp) # 8-byte Spill + jmp .LBB0_7 + .p2align 5, 0x90 +.LBB0_19: # in Loop: Header=BB0_7 Depth=1 + movq 88(%rsp), %rsi # 8-byte Reload + movq 112(%rsp), %rdi # 8-byte Reload + movq 104(%rsp), %rbp # 8-byte Reload +.LBB0_20: # in Loop: Header=BB0_7 Depth=1 + vcvtsi2sd %ebp, %xmm21, %xmm0 + vmulsd %xmm0, %xmm8, %xmm0 + vcvttsd2si %xmm0, %rax + vmovq %rax, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] + vpaddq %xmm0, %xmm9, %xmm9 + incq %rdi + cmpq %rsi, %rdi + je .LBB0_16 +.LBB0_7: # =>This Loop Header: Depth=1 + # Child Loop BB0_9 Depth 2 + # Child Loop BB0_10 Depth 3 + # Child Loop BB0_12 Depth 4 + movq 96(%rsp), %rax # 8-byte Reload + movslq (%rax,%rdi,4), %rbp + movq %rbp, %rcx + testq %rbp, %rbp + jle .LBB0_20 +# %bb.8: # in Loop: Header=BB0_7 Depth=1 + movl %edi, %r13d + shrl %r13d + leal (,%r13,8), %eax + leal (%rax,%rax,2), %eax + leal (,%rdi,4), %ecx + andl $4, %ecx + orl %ecx, %eax + movq 40(%rsp), %rsi # 8-byte Reload + leaq (%rsi,%rax,4), %r8 + movq 80(%rsp), %rsi # 8-byte Reload + leaq (%rsi,%rax,4), %r11 + movq %rdi, 112(%rsp) # 8-byte Spill + movq %rdi, %rax + imulq 64(%rsp), %rax # 8-byte Folded Reload + movq 72(%rsp), %rsi # 8-byte Reload + leaq (%rsi,%rax,4), %rax + movq %rax, 136(%rsp) # 8-byte Spill + movq 32(%rsp), %rax # 8-byte Reload + movl 32(%rax), %eax + movl %eax, (%rsp) # 4-byte Spill + movl %ecx, %r12d + movq %rbp, 104(%rsp) # 8-byte Spill + movl %ebp, %ecx + xorl %esi, %esi + movq %rcx, 144(%rsp) # 8-byte Spill + jmp .LBB0_9 + .p2align 5, 0x90 +.LBB0_18: # in Loop: Header=BB0_9 Depth=2 + movq 8(%rsp), %rsi # 8-byte Reload + incq %rsi + movq 144(%rsp), %rcx # 8-byte Reload + cmpq %rcx, %rsi + je .LBB0_19 +.LBB0_9: # Parent Loop BB0_7 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_10 Depth 3 + # Child Loop BB0_12 Depth 4 + movq 136(%rsp), %rax # 8-byte Reload + movq %rsi, 8(%rsp) # 8-byte Spill + movslq (%rax,%rsi,4), %r10 + movq %r10, %rax + shlq $5, %rax + leaq (%rax,%rax,2), %rdi + movq 40(%rsp), %rax # 8-byte Reload + addq %rdi, %rax + movq 128(%rsp), %rcx # 8-byte Reload + leaq (%rcx,%rdi), %rsi + addq 120(%rsp), %rdi # 8-byte Folded Reload + xorl %r9d, %r9d + xorl %r14d, %r14d + jmp .LBB0_10 + .p2align 5, 0x90 +.LBB0_67: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm3, %xmm10, %xmm3 + vmulss %xmm20, %xmm3, %xmm14 + vmulss %xmm3, %xmm3, %xmm4 + vmulss %xmm4, %xmm14, %xmm4 + vcvtss2sd %xmm4, %xmm4, %xmm4 + vaddsd %xmm4, %xmm11, %xmm14 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm14, %xmm4 + vmulsd %xmm3, %xmm4, %xmm3 + vcvtsd2ss %xmm3, %xmm3, %xmm3 + vfmadd231ss %xmm2, %xmm3, %xmm7 # xmm7 = (xmm3 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm3, %xmm5 # xmm5 = (xmm3 * xmm1) + xmm5 + vfmadd231ss %xmm0, %xmm3, %xmm19 # xmm19 = (xmm3 * xmm0) + xmm19 + movl $1, %r14d + movq %rdx, %rbx +.LBB0_68: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) + .p2align 4, 0x90 +.LBB0_69: # in Loop: Header=BB0_10 Depth=3 + xorl %ecx, %ecx + testl %r14d, %r14d + sete %cl + movq 16(%rsp), %rbx # 8-byte Reload + incq 40(%rbx,%rcx,8) + vaddss (%r11,%r9,4), %xmm7, %xmm0 + vmovss %xmm0, (%r11,%r9,4) + vaddss 32(%r11,%r9,4), %xmm5, %xmm0 + vmovss %xmm0, 32(%r11,%r9,4) + vaddss 64(%r11,%r9,4), %xmm19, %xmm0 + vmovss %xmm0, 64(%r11,%r9,4) + incq %r9 + cmpq $4, %r9 + je .LBB0_18 +.LBB0_10: # Parent Loop BB0_7 Depth=1 + # Parent Loop BB0_9 Depth=2 + # => This Loop Header: Depth=3 + # Child Loop BB0_12 Depth 4 + vmovss (%r8,%r9,4), %xmm14 # xmm14 = mem[0],zero,zero,zero + leaq (%r9,%r12), %rbp + vmovss 32(%r8,%r9,4), %xmm16 # xmm16 = mem[0],zero,zero,zero + vmovss 64(%r8,%r9,4), %xmm18 # xmm18 = mem[0],zero,zero,zero + cmpl $0, (%rsp) # 4-byte Folded Reload + je .LBB0_21 +# %bb.11: # in Loop: Header=BB0_10 Depth=3 + vxorps %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vxorps %xmm5, %xmm5, %xmm5 + vxorps %xmm19, %xmm19, %xmm19 + jmp .LBB0_12 + .p2align 5, 0x90 +.LBB0_70: # in Loop: Header=BB0_12 Depth=4 + vdivss %xmm3, %xmm10, %xmm3 + vmulss %xmm20, %xmm3, %xmm4 + vmulss %xmm3, %xmm3, %xmm6 + vmulss %xmm4, %xmm6, %xmm4 + vcvtss2sd %xmm4, %xmm4, %xmm4 + vaddsd %xmm4, %xmm11, %xmm6 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm6, %xmm4 + vmulsd %xmm3, %xmm4, %xmm3 + vcvtsd2ss %xmm3, %xmm3, %xmm3 + vmovss -32(%rsi,%rbx,4), %xmm4 # xmm4 = mem[0],zero,zero,zero + vmovss -64(%rsi,%rbx,4), %xmm6 # xmm6 = mem[0],zero,zero,zero + vfnmadd231ss %xmm2, %xmm3, %xmm6 # xmm6 = -(xmm3 * xmm2) + xmm6 + vmovss %xmm6, -64(%rsi,%rbx,4) + vfnmadd231ss %xmm0, %xmm3, %xmm4 # xmm4 = -(xmm3 * xmm0) + xmm4 + vmovss %xmm4, -32(%rsi,%rbx,4) + vmovss (%rsi,%rbx,4), %xmm4 # xmm4 = mem[0],zero,zero,zero + vfnmadd231ss %xmm1, %xmm3, %xmm4 # xmm4 = -(xmm3 * xmm1) + xmm4 + vmovss %xmm4, (%rsi,%rbx,4) + vfmadd231ss %xmm2, %xmm3, %xmm7 # xmm7 = (xmm3 * xmm2) + xmm7 + vfmadd231ss %xmm0, %xmm3, %xmm5 # xmm5 = (xmm3 * xmm0) + xmm5 + vfmadd231ss %xmm1, %xmm3, %xmm19 # xmm19 = (xmm3 * xmm1) + xmm19 + movl $1, %r14d + movq %rdx, %rcx +.LBB0_71: # in Loop: Header=BB0_12 Depth=4 + incq (%rcx) +.LBB0_72: # in Loop: Header=BB0_12 Depth=4 + incq %rbx + cmpq $8, %rbx + je .LBB0_69 +.LBB0_12: # Parent Loop BB0_7 Depth=1 + # Parent Loop BB0_9 Depth=2 + # Parent Loop BB0_10 Depth=3 + # => This Inner Loop Header: Depth=4 + cmpl %r10d, %r13d + jne .LBB0_14 +# %bb.13: # in Loop: Header=BB0_12 Depth=4 + cmpq %rbx, %rbp + jae .LBB0_72 +.LBB0_14: # in Loop: Header=BB0_12 Depth=4 + vsubss -64(%rdi,%rbx,4), %xmm14, %xmm2 + vsubss -32(%rdi,%rbx,4), %xmm16, %xmm0 + vsubss (%rdi,%rbx,4), %xmm18, %xmm1 + vmulss %xmm2, %xmm2, %xmm3 + vfmadd231ss %xmm0, %xmm0, %xmm3 # xmm3 = (xmm0 * xmm0) + xmm3 + vfmadd231ss %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vucomiss %xmm13, %xmm3 + jb .LBB0_70 +# %bb.15: # in Loop: Header=BB0_12 Depth=4 + movq %r15, %rcx + jmp .LBB0_71 + .p2align 5, 0x90 +.LBB0_21: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_24 +# %bb.22: # in Loop: Header=BB0_10 Depth=3 + vxorps %xmm19, %xmm19, %xmm19 + testq %rbp, %rbp + jne .LBB0_24 +# %bb.23: # in Loop: Header=BB0_10 Depth=3 + vxorps %xmm5, %xmm5, %xmm5 + vxorps %xmm7, %xmm7, %xmm7 + cmpl %r10d, %r13d + je .LBB0_28 + jmp .LBB0_29 + .p2align 5, 0x90 +.LBB0_24: # in Loop: Header=BB0_10 Depth=3 + vsubss (%rax), %xmm14, %xmm15 + vsubss 32(%rax), %xmm16, %xmm1 + vsubss 64(%rax), %xmm18, %xmm2 + vmulss %xmm15, %xmm15, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm2, %xmm2, %xmm0 # xmm0 = (xmm2 * xmm2) + xmm0 + vxorps %xmm19, %xmm19, %xmm19 + vucomiss %xmm13, %xmm0 + movq %r15, %rbx + vxorps %xmm5, %xmm5, %xmm5 + vxorps %xmm7, %xmm7, %xmm7 + jae .LBB0_26 +# %bb.25: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm3 + vmulss %xmm0, %xmm0, %xmm5 + vmulss %xmm3, %xmm5, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm3, %xmm11, %xmm5 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm5, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vmulss %xmm0, %xmm15, %xmm7 + vmulss %xmm0, %xmm1, %xmm5 + vmulss %xmm0, %xmm2, %xmm19 + movl $1, %r14d + movq %rdx, %rbx +.LBB0_26: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) + cmpl %r10d, %r13d + jne .LBB0_29 +.LBB0_28: # in Loop: Header=BB0_10 Depth=3 + cmpq $1, %rbp + je .LBB0_33 +.LBB0_29: # in Loop: Header=BB0_10 Depth=3 + vsubss 4(%rax), %xmm14, %xmm2 + vsubss 36(%rax), %xmm16, %xmm1 + vsubss 68(%rax), %xmm18, %xmm15 + vmulss %xmm2, %xmm2, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomiss %xmm13, %xmm0 + jae .LBB0_30 +# %bb.31: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm17 + vmulss %xmm0, %xmm0, %xmm3 + vmulss %xmm17, %xmm3, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_32 + .p2align 5, 0x90 +.LBB0_30: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_32: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_33: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_35 +# %bb.34: # in Loop: Header=BB0_10 Depth=3 + cmpq $2, %rbp + je .LBB0_39 +.LBB0_35: # in Loop: Header=BB0_10 Depth=3 + vsubss 8(%rax), %xmm14, %xmm2 + vsubss 40(%rax), %xmm16, %xmm1 + vsubss 72(%rax), %xmm18, %xmm15 + vmulss %xmm2, %xmm2, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomiss %xmm13, %xmm0 + jae .LBB0_36 +# %bb.37: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm17 + vmulss %xmm0, %xmm0, %xmm3 + vmulss %xmm17, %xmm3, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_38 + .p2align 5, 0x90 +.LBB0_36: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_38: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_39: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_41 +# %bb.40: # in Loop: Header=BB0_10 Depth=3 + cmpq $3, %rbp + je .LBB0_45 +.LBB0_41: # in Loop: Header=BB0_10 Depth=3 + vsubss 12(%rax), %xmm14, %xmm2 + vsubss 44(%rax), %xmm16, %xmm1 + vsubss 76(%rax), %xmm18, %xmm15 + vmulss %xmm2, %xmm2, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomiss %xmm13, %xmm0 + jae .LBB0_42 +# %bb.43: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm17 + vmulss %xmm0, %xmm0, %xmm3 + vmulss %xmm17, %xmm3, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_44 + .p2align 5, 0x90 +.LBB0_42: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_44: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_45: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_47 +# %bb.46: # in Loop: Header=BB0_10 Depth=3 + cmpq $4, %rbp + je .LBB0_51 +.LBB0_47: # in Loop: Header=BB0_10 Depth=3 + vsubss 16(%rax), %xmm14, %xmm2 + vsubss 48(%rax), %xmm16, %xmm1 + vsubss 80(%rax), %xmm18, %xmm15 + vmulss %xmm2, %xmm2, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomiss %xmm13, %xmm0 + jae .LBB0_48 +# %bb.49: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm17 + vmulss %xmm0, %xmm0, %xmm3 + vmulss %xmm17, %xmm3, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_50 + .p2align 5, 0x90 +.LBB0_48: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_50: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_51: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_53 +# %bb.52: # in Loop: Header=BB0_10 Depth=3 + cmpq $5, %rbp + je .LBB0_57 +.LBB0_53: # in Loop: Header=BB0_10 Depth=3 + vsubss 20(%rax), %xmm14, %xmm2 + vsubss 52(%rax), %xmm16, %xmm1 + vsubss 84(%rax), %xmm18, %xmm15 + vmulss %xmm2, %xmm2, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomiss %xmm13, %xmm0 + jae .LBB0_54 +# %bb.55: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm17 + vmulss %xmm0, %xmm0, %xmm3 + vmulss %xmm17, %xmm3, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_56 + .p2align 5, 0x90 +.LBB0_54: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_56: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_57: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_59 +# %bb.58: # in Loop: Header=BB0_10 Depth=3 + cmpq $6, %rbp + je .LBB0_63 +.LBB0_59: # in Loop: Header=BB0_10 Depth=3 + vsubss 24(%rax), %xmm14, %xmm2 + vsubss 56(%rax), %xmm16, %xmm1 + vsubss 88(%rax), %xmm18, %xmm15 + vmulss %xmm2, %xmm2, %xmm0 + vfmadd231ss %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231ss %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomiss %xmm13, %xmm0 + jae .LBB0_60 +# %bb.61: # in Loop: Header=BB0_10 Depth=3 + vdivss %xmm0, %xmm10, %xmm0 + vmulss %xmm20, %xmm0, %xmm17 + vmulss %xmm0, %xmm0, %xmm3 + vmulss %xmm17, %xmm3, %xmm3 + vcvtss2sd %xmm3, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vcvtss2sd %xmm0, %xmm0, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vcvtsd2ss %xmm0, %xmm0, %xmm0 + vfmadd231ss %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231ss %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231ss %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_62 + .p2align 5, 0x90 +.LBB0_60: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_62: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_63: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_65 +# %bb.64: # in Loop: Header=BB0_10 Depth=3 + cmpq $7, %rbp + je .LBB0_69 +.LBB0_65: # in Loop: Header=BB0_10 Depth=3 + vsubss 28(%rax), %xmm14, %xmm2 + vsubss 60(%rax), %xmm16, %xmm1 + vsubss 92(%rax), %xmm18, %xmm0 + vmulss %xmm2, %xmm2, %xmm3 + vfmadd231ss %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231ss %xmm0, %xmm0, %xmm3 # xmm3 = (xmm0 * xmm0) + xmm3 + vucomiss %xmm13, %xmm3 + jb .LBB0_67 +# %bb.66: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx + jmp .LBB0_68 + .p2align 5, 0x90 +.LBB0_16: + movq 48(%rsp), %rcx # 8-byte Reload + incq %rcx + movq 16(%rsp), %rax # 8-byte Reload + movq %rcx, (%rax) + vmovdqu %xmm9, 8(%rax) +.LBB0_17: + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.2, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 56(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $152, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end0: + .size computeForceLJ_ref, .Lfunc_end0-computeForceLJ_ref + .cfi_endproc + # -- End function + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 # -- Begin function computeForceLJ_2xnn_half +.LCPI1_0: + .long 0xbf000000 # -0.5 +.LCPI1_1: + .long 0x42400000 # 48 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_2: + .quad 1 # 0x1 + .zero 8 + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 +.LCPI1_3: + .quad 0x3fe0000000000000 # 0.5 +.LCPI1_4: + .quad 0x41cdcd6500000000 # 1.0E+9 + .text + .globl computeForceLJ_2xnn_half + .p2align 4, 0x90 + .type computeForceLJ_2xnn_half,@function +computeForceLJ_2xnn_half: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $232, %rsp + .cfi_def_cfa_offset 288 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 48(%rsp) # 8-byte Spill + movq %rdx, %r12 + movq %rsi, %r15 + movq %rdi, %rbp + xorl %ebx, %ebx + movl $.L.str.3, %edi + xorl %eax, %eax + callq debug_printf + vmovss 108(%rbp), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss %xmm0, (%rsp) # 4-byte Spill + vbroadcastss 48(%rbp), %zmm2 + movq %rbp, 32(%rsp) # 8-byte Spill + vpbroadcastd 40(%rbp), %zmm3 + movl 20(%r15), %r11d + testl %r11d, %r11d + jle .LBB1_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 64(%r9), %r8 + xorl %edi, %edi + vxorps %xmm0, %xmm0, %xmm0 + vxorps %xmm1, %xmm1, %xmm1 + jmp .LBB1_2 + .p2align 5, 0x90 +.LBB1_21: # in Loop: Header=BB1_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB1_5 +.LBB1_2: # =>This Loop Header: Depth=1 + # Child Loop BB1_16 Depth 2 + # Child Loop BB1_20 Depth 2 + leaq (%rdi,%rdi,8), %rax + leaq (%rax,%rax,2), %rax + addq %rdi, %rax + movl (%r10,%rax), %ecx + testl %ecx, %ecx + jle .LBB1_21 +# %bb.3: # in Loop: Header=BB1_2 Depth=1 + leal (,%rdi,4), %esi + movl %esi, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %esi + orl %eax, %esi + cmpl $7, %ecx + ja .LBB1_15 +# %bb.4: # in Loop: Header=BB1_2 Depth=1 + movl %ecx, %ebp + andl $-8, %ebp + cmpq %rcx, %rbp + jae .LBB1_21 + jmp .LBB1_19 + .p2align 5, 0x90 +.LBB1_15: # in Loop: Header=BB1_2 Depth=1 + leaq (,%rcx,4), %rbp + andq $-32, %rbp + movl %esi, %r14d + leaq (%r9,%r14,4), %rax + xorl %edx, %edx + .p2align 4, 0x90 +.LBB1_16: # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, (%rax,%rdx) + addq $32, %rdx + cmpq %rdx, %rbp + jne .LBB1_16 +# %bb.17: # in Loop: Header=BB1_2 Depth=1 + movl %ecx, %ebp + andl $-8, %ebp + addq %rbp, %r14 + vmovups %zmm1, (%r9,%r14,4) + cmpq %rcx, %rbp + jae .LBB1_21 +.LBB1_19: # in Loop: Header=BB1_2 Depth=1 + movl %esi, %eax + leaq (%r8,%rax,4), %rdx + .p2align 4, 0x90 +.LBB1_20: # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movl $0, -64(%rdx,%rbp,4) + movl $0, -32(%rdx,%rbp,4) + movl $0, (%rdx,%rbp,4) + incq %rbp + cmpq %rbp, %rcx + jne .LBB1_20 + jmp .LBB1_21 + .p2align 5, 0x90 +.LBB1_5: + xorl %eax, %eax + vmovups %zmm2, 160(%rsp) # 64-byte Spill + vmovdqu64 %zmm3, 96(%rsp) # 64-byte Spill + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 24(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + vmovups 96(%rsp), %zmm31 # 64-byte Reload + vmovups 160(%rsp), %zmm30 # 64-byte Reload + cmpl $0, 20(%r15) + jle .LBB1_10 +# %bb.6: + vmovss (%rsp), %xmm0 # 4-byte Reload + # xmm0 = mem[0],zero,zero,zero + vmulss %xmm0, %xmm0, %xmm0 + vbroadcastss %xmm0, %zmm0 + xorl %r11d, %r11d + vbroadcastss .LCPI1_0(%rip), %zmm1 # zmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastss .LCPI1_1(%rip), %zmm2 # zmm2 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + movw $4369, %cx # imm = 0x1111 + kmovw %ecx, %k1 + vmovdqu .LCPI1_2(%rip), %xmm3 # xmm3 = <1,u> + # AlignMOV convert to UnAlignMOV + vmovsd .LCPI1_3(%rip), %xmm4 # xmm4 = mem[0],zero + xorl %ebx, %ebx + movq %r12, 40(%rsp) # 8-byte Spill + movq %r15, 16(%rsp) # 8-byte Spill + jmp .LBB1_7 + .p2align 5, 0x90 +.LBB1_13: # in Loop: Header=BB1_7 Depth=1 + movl 12(%rsp), %ebx # 4-byte Reload + movq 40(%rsp), %r12 # 8-byte Reload + movq 16(%rsp), %r15 # 8-byte Reload + movq 80(%rsp), %rax # 8-byte Reload + movq 72(%rsp), %rsi # 8-byte Reload + movq 64(%rsp), %r10 # 8-byte Reload + movq 56(%rsp), %rcx # 8-byte Reload +.LBB1_9: # in Loop: Header=BB1_7 Depth=1 + vshuff64x2 $136, %zmm14, %zmm12, %zmm7 # zmm7 = zmm12[0,1,4,5],zmm14[0,1,4,5] + vshuff64x2 $221, %zmm14, %zmm12, %zmm10 # zmm10 = zmm12[2,3,6,7],zmm14[2,3,6,7] + vaddps %zmm10, %zmm7, %zmm7 + vpermilpd $85, %zmm7, %zmm10 # zmm10 = zmm7[1,0,3,2,5,4,7,6] + vaddps %zmm10, %zmm7, %zmm7 + vpermilps $177, %zmm7, %zmm10 # zmm10 = zmm7[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] + vaddps %zmm10, %zmm7, %zmm7 + vcompressps %zmm7, %zmm7 {%k1} {z} + vaddps (%r10,%rax,4), %xmm7, %xmm7 + vmovups %xmm7, (%r10,%rax,4) # AlignMOV convert to UnAlignMOV + vshuff64x2 $136, %zmm9, %zmm8, %zmm7 # zmm7 = zmm8[0,1,4,5],zmm9[0,1,4,5] + vshuff64x2 $221, %zmm9, %zmm8, %zmm8 # zmm8 = zmm8[2,3,6,7],zmm9[2,3,6,7] + vaddps %zmm8, %zmm7, %zmm7 + vpermilpd $85, %zmm7, %zmm8 # zmm8 = zmm7[1,0,3,2,5,4,7,6] + vaddps %zmm8, %zmm7, %zmm7 + vpermilps $177, %zmm7, %zmm8 # zmm8 = zmm7[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] + vaddps %zmm8, %zmm7, %zmm7 + vcompressps %zmm7, %zmm7 {%k1} {z} + vaddps 32(%r10,%rax,4), %xmm7, %xmm7 + vmovups %xmm7, 32(%r10,%rax,4) # AlignMOV convert to UnAlignMOV + vshuff64x2 $136, %zmm5, %zmm6, %zmm7 # zmm7 = zmm6[0,1,4,5],zmm5[0,1,4,5] + vshuff64x2 $221, %zmm5, %zmm6, %zmm5 # zmm5 = zmm6[2,3,6,7],zmm5[2,3,6,7] + vaddps %zmm5, %zmm7, %zmm5 + vpermilpd $85, %zmm5, %zmm6 # zmm6 = zmm5[1,0,3,2,5,4,7,6] + vaddps %zmm6, %zmm5, %zmm5 + vpermilps $177, %zmm5, %zmm6 # zmm6 = zmm5[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] + vaddps %zmm6, %zmm5, %zmm5 + vcompressps %zmm5, %zmm5 {%k1} {z} + vaddps 64(%r10,%rax,4), %xmm5, %xmm5 + vmovups %xmm5, 64(%r10,%rax,4) # AlignMOV convert to UnAlignMOV + vpinsrq $1, %rcx, %xmm3, %xmm5 + movq 48(%rsp), %rdx # 8-byte Reload + vpaddq (%rdx), %xmm5, %xmm5 + vmovdqu %xmm5, (%rdx) + addl %esi, %ebx + vcvtsi2sd %esi, %xmm11, %xmm5 + vmulsd %xmm4, %xmm5, %xmm5 + vcvttsd2si %xmm5, %rcx + addq %rcx, 16(%rdx) + incq %r11 + movslq 20(%r15), %rcx + cmpq %rcx, %r11 + jge .LBB1_10 +.LBB1_7: # =>This Loop Header: Depth=1 + # Child Loop BB1_12 Depth 2 + leal (,%r11,4), %eax + movl %eax, %ecx + andl $2147483640, %ecx # imm = 0x7FFFFFF8 + leal (%rcx,%rcx,2), %ecx + andl $4, %eax + orl %ecx, %eax + movq 176(%r15), %r10 + movq 24(%r12), %rcx + movslq (%rcx,%r11,4), %rsi + testq %rsi, %rsi + jle .LBB1_8 +# %bb.11: # in Loop: Header=BB1_7 Depth=1 + movl %ebx, 12(%rsp) # 4-byte Spill + movq 160(%r15), %r15 + vbroadcastss (%r15,%rax,4), %ymm5 + movq 8(%r12), %rcx + vbroadcastss 4(%r15,%rax,4), %ymm6 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm7 + vbroadcastss 8(%r15,%rax,4), %ymm5 + vbroadcastss 12(%r15,%rax,4), %ymm6 + vbroadcastss 32(%r15,%rax,4), %ymm8 + vbroadcastss 36(%r15,%rax,4), %ymm9 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm10 + vinsertf64x4 $1, %ymm9, %zmm8, %zmm11 + vbroadcastss 40(%r15,%rax,4), %ymm5 + vbroadcastss 44(%r15,%rax,4), %ymm6 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm13 + vbroadcastss 64(%r15,%rax,4), %ymm5 + vbroadcastss 68(%r15,%rax,4), %ymm6 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm15 + vbroadcastss 72(%r15,%rax,4), %ymm5 + movq %rax, 80(%rsp) # 8-byte Spill + vbroadcastss 76(%r15,%rax,4), %ymm6 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm16 + movq %rsi, 72(%rsp) # 8-byte Spill + movl %esi, %eax + movl 16(%r12), %edx + imull %r11d, %edx + movslq %edx, %rdx + leaq (%rcx,%rdx,4), %rcx + movq %rcx, (%rsp) # 8-byte Spill + movq %rax, 56(%rsp) # 8-byte Spill + decq %rax + movq %rax, 88(%rsp) # 8-byte Spill + vxorps %xmm12, %xmm12, %xmm12 + movq %r10, 64(%rsp) # 8-byte Spill + xorl %ecx, %ecx + vxorps %xmm8, %xmm8, %xmm8 + vxorps %xmm6, %xmm6, %xmm6 + vxorps %xmm14, %xmm14, %xmm14 + vxorps %xmm9, %xmm9, %xmm9 + vxorps %xmm5, %xmm5, %xmm5 + .p2align 4, 0x90 +.LBB1_12: # Parent Loop BB1_7 Depth=1 + # => This Inner Loop Header: Depth=2 + movq (%rsp), %rax # 8-byte Reload + movslq (%rax,%rcx,4), %rdx + leal (%rdx,%rdx), %esi + xorl %ebx, %ebx + cmpq %rsi, %r11 + leal 1(%rdx,%rdx), %edi + setne %bl + leal (%rbx,%rbx,2), %ebx + movl $255, %ebp + movl $248, %eax + cmovel %eax, %ebp + orl $252, %ebx + leal -127(%rbp), %r8d + cmpq %rdi, %r11 + cmovnel %ebp, %r8d + leal 193(%rbx), %r14d + xorl %r13d, %r13d + cmpq %rdi, %r11 + cmovnel %ebx, %r14d + sete %r13b + movl $0, %r9d + movl $-31, %eax + cmovel %eax, %r9d + leal 240(%r13), %edi + addl $255, %r13d + xorl %ebx, %ebx + cmpq %rsi, %r11 + cmovel %edi, %r13d + sete %bl + shlq $5, %rdx + leaq (%rdx,%rdx,2), %r12 + vmovupd (%r15,%r12), %zmm17 + vbroadcastf64x4 (%r15,%r12), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 64(%r15,%r12), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] + subl %ebx, %r9d + addl $255, %r9d + shll $8, %r14d + orl %r9d, %r14d + kmovd %r14d, %k2 + shll $8, %r13d + orl %r8d, %r13d + kmovd %r13d, %k3 + vshuff64x2 $238, %zmm17, %zmm17, %zmm17 # zmm17 = zmm17[4,5,6,7,4,5,6,7] + vsubps %zmm18, %zmm7, %zmm20 + vsubps %zmm17, %zmm11, %zmm21 + vsubps %zmm19, %zmm15, %zmm22 + vsubps %zmm18, %zmm10, %zmm18 + vsubps %zmm17, %zmm13, %zmm17 + vsubps %zmm19, %zmm16, %zmm19 + vmulps %zmm22, %zmm22, %zmm23 + vfmadd231ps %zmm21, %zmm21, %zmm23 # zmm23 = (zmm21 * zmm21) + zmm23 + vfmadd231ps %zmm20, %zmm20, %zmm23 # zmm23 = (zmm20 * zmm20) + zmm23 + vmulps %zmm19, %zmm19, %zmm24 + vrcp14ps %zmm23, %zmm25 + vfmadd231ps %zmm17, %zmm17, %zmm24 # zmm24 = (zmm17 * zmm17) + zmm24 + vfmadd231ps %zmm18, %zmm18, %zmm24 # zmm24 = (zmm18 * zmm18) + zmm24 + vrcp14ps %zmm24, %zmm26 + vmulps %zmm25, %zmm30, %zmm27 + vmulps %zmm27, %zmm25, %zmm27 + vmulps %zmm27, %zmm25, %zmm27 + vmulps %zmm26, %zmm30, %zmm28 + vmulps %zmm28, %zmm26, %zmm28 + vmulps %zmm28, %zmm26, %zmm28 + vaddps %zmm1, %zmm27, %zmm29 + vmulps %zmm25, %zmm31, %zmm25 + vmulps %zmm29, %zmm25, %zmm25 + vmulps %zmm25, %zmm27, %zmm25 + vmulps %zmm2, %zmm25, %zmm25 + vaddps %zmm1, %zmm28, %zmm27 + vmulps %zmm26, %zmm31, %zmm26 + vmulps %zmm27, %zmm26, %zmm26 + vmulps %zmm26, %zmm28, %zmm26 + vmulps %zmm2, %zmm26, %zmm26 + vcmpltps %zmm0, %zmm23, %k2 {%k2} + vmulps %zmm25, %zmm20, %zmm20 {%k2} {z} + vmulps %zmm25, %zmm21, %zmm21 {%k2} {z} + vmulps %zmm25, %zmm22, %zmm22 {%k2} {z} + vcmpltps %zmm0, %zmm24, %k2 {%k3} + vmulps %zmm26, %zmm18, %zmm18 {%k2} {z} + vmulps %zmm26, %zmm17, %zmm17 {%k2} {z} + vmulps %zmm26, %zmm19, %zmm19 {%k2} {z} + vaddps %zmm18, %zmm20, %zmm23 + vaddps %zmm17, %zmm21, %zmm24 + vextractf64x4 $1, %zmm23, %ymm25 + vaddps %ymm25, %ymm23, %ymm23 + vmovups (%r10,%r12), %ymm25 # AlignMOV convert to UnAlignMOV + vsubps %ymm23, %ymm25, %ymm23 + vmovups 32(%r10,%r12), %ymm25 # AlignMOV convert to UnAlignMOV + vmovups 64(%r10,%r12), %ymm26 # AlignMOV convert to UnAlignMOV + vmovups %ymm23, (%r10,%r12) # AlignMOV convert to UnAlignMOV + vaddps %zmm19, %zmm22, %zmm23 + vextractf64x4 $1, %zmm24, %ymm27 + vaddps %ymm27, %ymm24, %ymm24 + vsubps %ymm24, %ymm25, %ymm24 + vmovups %ymm24, 32(%r10,%r12) # AlignMOV convert to UnAlignMOV + vextractf64x4 $1, %zmm23, %ymm24 + vaddps %ymm24, %ymm23, %ymm23 + vsubps %ymm23, %ymm26, %ymm23 + vmovups %ymm23, 64(%r10,%r12) # AlignMOV convert to UnAlignMOV + vaddps %zmm20, %zmm12, %zmm12 + vaddps %zmm21, %zmm8, %zmm8 + vaddps %zmm22, %zmm6, %zmm6 + vaddps %zmm18, %zmm14, %zmm14 + vaddps %zmm17, %zmm9, %zmm9 + vaddps %zmm19, %zmm5, %zmm5 + cmpq %rcx, 88(%rsp) # 8-byte Folded Reload + je .LBB1_13 +# %bb.14: # in Loop: Header=BB1_12 Depth=2 + movq 16(%rsp), %rdx # 8-byte Reload + movq 160(%rdx), %r15 + movq 176(%rdx), %r10 + incq %rcx + jmp .LBB1_12 + .p2align 5, 0x90 +.LBB1_8: # in Loop: Header=BB1_7 Depth=1 + vxorps %xmm5, %xmm5, %xmm5 + movq %rsi, %rcx + vxorps %xmm9, %xmm9, %xmm9 + vxorps %xmm14, %xmm14, %xmm14 + vxorps %xmm6, %xmm6, %xmm6 + vxorps %xmm8, %xmm8, %xmm8 + vxorps %xmm12, %xmm12, %xmm12 + jmp .LBB1_9 + .p2align 5, 0x90 +.LBB1_10: + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + movq 32(%rsp), %rax # 8-byte Reload + vmovsd 184(%rax), %xmm3 # xmm3 = mem[0],zero + vsubsd 24(%rsp), %xmm0, %xmm1 # 8-byte Folded Reload + vmovsd %xmm1, (%rsp) # 8-byte Spill + vmulsd .LCPI1_4(%rip), %xmm3, %xmm0 + vmulsd %xmm1, %xmm0, %xmm0 + vcvtusi2sd %ebx, %xmm11, %xmm2 + vdivsd %xmm2, %xmm0, %xmm2 + movl $.L.str.4, %edi + movl %ebx, %esi + vmovapd %xmm3, %xmm0 + movb $3, %al + callq printf + movl $.L.str.5, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + addq $232, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end1: + .size computeForceLJ_2xnn_half, .Lfunc_end1-computeForceLJ_2xnn_half + .cfi_endproc + # -- End function + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 # -- Begin function computeForceLJ_2xnn_full +.LCPI2_0: + .long 0xbf000000 # -0.5 +.LCPI2_1: + .long 0x42400000 # 48 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_2: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_2xnn_full + .p2align 4, 0x90 + .type computeForceLJ_2xnn_full,@function +computeForceLJ_2xnn_full: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $168, %rsp + .cfi_def_cfa_offset 224 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, %r13 + movq %rdx, %r14 + movq %rsi, %rbp + movq %rdi, %r12 + movl $.L.str.3, %edi + xorl %eax, %eax + callq debug_printf + vmovss 108(%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss %xmm0, 20(%rsp) # 4-byte Spill + vbroadcastss 48(%r12), %zmm2 + vpbroadcastd 40(%r12), %zmm3 + movl 20(%rbp), %r11d + testl %r11d, %r11d + jle .LBB2_5 +# %bb.1: + movq 176(%rbp), %r9 + movq 192(%rbp), %r10 + decq %r11 + leaq 64(%r9), %r8 + xorl %edi, %edi + vxorps %xmm0, %xmm0, %xmm0 + vxorps %xmm1, %xmm1, %xmm1 + jmp .LBB2_2 + .p2align 5, 0x90 +.LBB2_20: # in Loop: Header=BB2_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB2_5 +.LBB2_2: # =>This Loop Header: Depth=1 + # Child Loop BB2_15 Depth 2 + # Child Loop BB2_19 Depth 2 + leaq (%rdi,%rdi,8), %rax + leaq (%rax,%rax,2), %rax + addq %rdi, %rax + movl (%r10,%rax), %ecx + testl %ecx, %ecx + jle .LBB2_20 +# %bb.3: # in Loop: Header=BB2_2 Depth=1 + leal (,%rdi,4), %esi + movl %esi, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %esi + orl %eax, %esi + cmpl $7, %ecx + ja .LBB2_14 +# %bb.4: # in Loop: Header=BB2_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + cmpq %rcx, %rbx + jae .LBB2_20 + jmp .LBB2_18 + .p2align 5, 0x90 +.LBB2_14: # in Loop: Header=BB2_2 Depth=1 + leaq (,%rcx,4), %rbx + andq $-32, %rbx + movl %esi, %r12d + leaq (%r9,%r12,4), %rax + xorl %edx, %edx + .p2align 4, 0x90 +.LBB2_15: # Parent Loop BB2_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovups %ymm0, (%rax,%rdx) + addq $32, %rdx + cmpq %rdx, %rbx + jne .LBB2_15 +# %bb.16: # in Loop: Header=BB2_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + addq %rbx, %r12 + vmovups %zmm1, (%r9,%r12,4) + cmpq %rcx, %rbx + jae .LBB2_20 +.LBB2_18: # in Loop: Header=BB2_2 Depth=1 + movl %esi, %eax + leaq (%r8,%rax,4), %rdx + .p2align 4, 0x90 +.LBB2_19: # Parent Loop BB2_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movl $0, -64(%rdx,%rbx,4) + movl $0, -32(%rdx,%rbx,4) + movl $0, (%rdx,%rbx,4) + incq %rbx + cmpq %rbx, %rcx + jne .LBB2_19 + jmp .LBB2_20 + .p2align 5, 0x90 +.LBB2_5: + xorl %r12d, %r12d + xorl %eax, %eax + vmovups %zmm2, 96(%rsp) # 64-byte Spill + vmovdqu64 %zmm3, 32(%rsp) # 64-byte Spill + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 24(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + vmovups 32(%rsp), %zmm27 # 64-byte Reload + vmovups 96(%rsp), %zmm26 # 64-byte Reload + cmpl $0, 20(%rbp) + jle .LBB2_10 +# %bb.6: + vmovss 20(%rsp), %xmm0 # 4-byte Reload + # xmm0 = mem[0],zero,zero,zero + vmulss %xmm0, %xmm0, %xmm0 + vbroadcastss %xmm0, %zmm0 + movl $224, %r8d + vbroadcastss .LCPI2_0(%rip), %zmm1 # zmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastss .LCPI2_1(%rip), %zmm2 # zmm2 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + movw $4369, %ax # imm = 0x1111 + kmovw %eax, %k1 + vmovdqu .LCPI2_2(%rip), %xmm3 # xmm3 = <1,u> + # AlignMOV convert to UnAlignMOV + movq %rbp, 8(%rsp) # 8-byte Spill + jmp .LBB2_7 + .p2align 5, 0x90 +.LBB2_8: # in Loop: Header=BB2_7 Depth=1 + vxorps %xmm4, %xmm4, %xmm4 + vxorps %xmm8, %xmm8, %xmm8 + vxorps %xmm13, %xmm13, %xmm13 + vxorps %xmm5, %xmm5, %xmm5 + vxorps %xmm9, %xmm9, %xmm9 + vxorps %xmm12, %xmm12, %xmm12 +.LBB2_9: # in Loop: Header=BB2_7 Depth=1 + movq 176(%rbp), %rax + vshuff64x2 $136, %zmm13, %zmm12, %zmm6 # zmm6 = zmm12[0,1,4,5],zmm13[0,1,4,5] + vshuff64x2 $221, %zmm13, %zmm12, %zmm7 # zmm7 = zmm12[2,3,6,7],zmm13[2,3,6,7] + vaddps %zmm7, %zmm6, %zmm6 + vpermilpd $85, %zmm6, %zmm7 # zmm7 = zmm6[1,0,3,2,5,4,7,6] + vaddps %zmm7, %zmm6, %zmm6 + vpermilps $177, %zmm6, %zmm7 # zmm7 = zmm6[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] + vaddps %zmm7, %zmm6, %zmm6 + vcompressps %zmm6, %zmm6 {%k1} {z} + vaddps (%rax,%r9,4), %xmm6, %xmm6 + vmovups %xmm6, (%rax,%r9,4) # AlignMOV convert to UnAlignMOV + vshuff64x2 $136, %zmm8, %zmm9, %zmm6 # zmm6 = zmm9[0,1,4,5],zmm8[0,1,4,5] + vshuff64x2 $221, %zmm8, %zmm9, %zmm7 # zmm7 = zmm9[2,3,6,7],zmm8[2,3,6,7] + vaddps %zmm7, %zmm6, %zmm6 + vpermilpd $85, %zmm6, %zmm7 # zmm7 = zmm6[1,0,3,2,5,4,7,6] + vaddps %zmm7, %zmm6, %zmm6 + vpermilps $177, %zmm6, %zmm7 # zmm7 = zmm6[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] + vaddps %zmm7, %zmm6, %zmm6 + vcompressps %zmm6, %zmm6 {%k1} {z} + vaddps 32(%rax,%r9,4), %xmm6, %xmm6 + vmovups %xmm6, 32(%rax,%r9,4) # AlignMOV convert to UnAlignMOV + vshuff64x2 $136, %zmm4, %zmm5, %zmm6 # zmm6 = zmm5[0,1,4,5],zmm4[0,1,4,5] + vshuff64x2 $221, %zmm4, %zmm5, %zmm4 # zmm4 = zmm5[2,3,6,7],zmm4[2,3,6,7] + vaddps %zmm4, %zmm6, %zmm4 + vpermilpd $85, %zmm4, %zmm5 # zmm5 = zmm4[1,0,3,2,5,4,7,6] + vaddps %zmm5, %zmm4, %zmm4 + vpermilps $177, %zmm4, %zmm5 # zmm5 = zmm4[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] + vaddps %zmm5, %zmm4, %zmm4 + vcompressps %zmm4, %zmm4 {%k1} {z} + vaddps 64(%rax,%r9,4), %xmm4, %xmm4 + vmovups %xmm4, 64(%rax,%r9,4) # AlignMOV convert to UnAlignMOV + vpinsrq $1, %r10, %xmm3, %xmm4 + vpaddq (%r13), %xmm4, %xmm4 + vmovdqu %xmm4, (%r13) + addq %r10, 16(%r13) + incq %r12 + movslq 20(%rbp), %rax + cmpq %rax, %r12 + jge .LBB2_10 +.LBB2_7: # =>This Loop Header: Depth=1 + # Child Loop BB2_12 Depth 2 + leal (,%r12,4), %r9d + movl %r9d, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %r9d + orl %eax, %r9d + movq 24(%r14), %rax + movslq (%rax,%r12,4), %r10 + testq %r10, %r10 + jle .LBB2_8 +# %bb.11: # in Loop: Header=BB2_7 Depth=1 + movq 160(%rbp), %rsi + movq 8(%r14), %rax + vbroadcastss (%rsi,%r9,4), %ymm4 + vbroadcastss 4(%rsi,%r9,4), %ymm5 + vbroadcastss 8(%rsi,%r9,4), %ymm7 + vinsertf64x4 $1, %ymm5, %zmm4, %zmm6 + vbroadcastss 12(%rsi,%r9,4), %ymm4 + vinsertf64x4 $1, %ymm4, %zmm7, %zmm7 + vbroadcastss 32(%rsi,%r9,4), %ymm4 + vbroadcastss 36(%rsi,%r9,4), %ymm5 + vbroadcastss 40(%rsi,%r9,4), %ymm8 + vbroadcastss 44(%rsi,%r9,4), %ymm9 + vinsertf64x4 $1, %ymm5, %zmm4, %zmm10 + vinsertf64x4 $1, %ymm9, %zmm8, %zmm11 + vbroadcastss 64(%rsi,%r9,4), %ymm4 + vbroadcastss 68(%rsi,%r9,4), %ymm5 + vinsertf64x4 $1, %ymm5, %zmm4, %zmm14 + vbroadcastss 72(%rsi,%r9,4), %ymm4 + vbroadcastss 76(%rsi,%r9,4), %ymm5 + vinsertf64x4 $1, %ymm5, %zmm4, %zmm15 + movl %r10d, %r10d + movq %r14, %r15 + movl 16(%r14), %ecx + imull %r12d, %ecx + movslq %ecx, %rcx + leaq (%rax,%rcx,4), %r11 + vxorps %xmm12, %xmm12, %xmm12 + xorl %eax, %eax + vxorps %xmm9, %xmm9, %xmm9 + vxorps %xmm5, %xmm5, %xmm5 + vxorps %xmm13, %xmm13, %xmm13 + vxorps %xmm8, %xmm8, %xmm8 + vxorps %xmm4, %xmm4, %xmm4 + .p2align 4, 0x90 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# LLVM-MCA-BEGIN +# pointer_increment=256 a23042eac7d8a1e13e9ff886fc02a80e +.LBB2_12: # Parent Loop BB2_7 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%r11,%rax,4), %rcx + leaq (%rcx,%rcx,2), %rdx + shlq $5, %rdx + vmovupd (%rsi,%rdx), %zmm16 + vbroadcastf64x4 64(%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 (%rsi,%rdx), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] + vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7] + vsubps %zmm19, %zmm6, %zmm18 + vsubps %zmm21, %zmm10, %zmm17 + vsubps %zmm20, %zmm14, %zmm16 + vmulps %zmm16, %zmm16, %zmm22 + vfmadd231ps %zmm17, %zmm17, %zmm22 # zmm22 = (zmm17 * zmm17) + zmm22 + vfmadd231ps %zmm18, %zmm18, %zmm22 # zmm22 = (zmm18 * zmm18) + zmm22 + vrcp14ps %zmm22, %zmm23 + vmulps %zmm23, %zmm26, %zmm24 + vmulps %zmm24, %zmm23, %zmm24 + vmulps %zmm24, %zmm23, %zmm24 + vaddps %zmm1, %zmm24, %zmm25 + vmulps %zmm23, %zmm27, %zmm23 + vmulps %zmm25, %zmm23, %zmm23 + vmulps %zmm23, %zmm24, %zmm23 + leal (%rcx,%rcx), %edx + xorl %edi, %edi + xorl %ebp, %ebp + cmpq %rdx, %r12 + setne %dil + leal 1(%rcx,%rcx), %ecx + sete %bpl + xorl %edx, %edx + xorl %ebx, %ebx + cmpq %rcx, %r12 + sete %dl + movl $0, %ecx + setne %bl + cmovel %r8d, %ecx + movl %ebx, %r14d + shll $4, %r14d + subl %ebp, %r14d + leal (%rcx,%rdi,2), %ecx + shll $8, %ecx + addl $239, %r14d + addl $-768, %ecx # imm = 0xFD00 + orl %r14d, %ecx + kmovd %ecx, %k2 + vcmpltps %zmm0, %zmm22, %k2 {%k2} + vsubps %zmm21, %zmm11, %zmm21 + vsubps %zmm20, %zmm15, %zmm20 + vsubps %zmm19, %zmm7, %zmm19 + vmulps %zmm2, %zmm23, %zmm22 + vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12 + vmulps %zmm20, %zmm20, %zmm18 + vfmadd231ps %zmm21, %zmm21, %zmm18 # zmm18 = (zmm21 * zmm21) + zmm18 + vfmadd231ps %zmm19, %zmm19, %zmm18 # zmm18 = (zmm19 * zmm19) + zmm18 + vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9 + vrcp14ps %zmm18, %zmm17 + vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5 + vmulps %zmm17, %zmm26, %zmm16 + vmulps %zmm16, %zmm17, %zmm16 + vmulps %zmm16, %zmm17, %zmm16 + vaddps %zmm1, %zmm16, %zmm22 + vmulps %zmm17, %zmm27, %zmm17 + vmulps %zmm22, %zmm17, %zmm17 + vmulps %zmm17, %zmm16, %zmm16 + shll $6, %ebx + leal (%rbx,%rdi,4), %ecx + shll $7, %edx + leal (%rdx,%rdi,8), %edx + shll $8, %edx + addl %edx, %ecx + addl $-2117, %ecx # imm = 0xF7BB + kmovd %ecx, %k2 + vcmpltps %zmm0, %zmm18, %k2 {%k2} + vmulps %zmm2, %zmm16, %zmm16 + vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13 + vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8 + vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4 + incq %rax + cmpq %rax, %r10 + jne .LBB2_12 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER +# %bb.13: # in Loop: Header=BB2_7 Depth=1 + movq %r15, %r14 + movq 8(%rsp), %rbp # 8-byte Reload + jmp .LBB2_9 + .p2align 5, 0x90 +.LBB2_10: + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movl $.L.str.5, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 8(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 24(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $168, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end2: + .size computeForceLJ_2xnn_full, .Lfunc_end2-computeForceLJ_2xnn_full + .cfi_endproc + # -- End function + .globl computeForceLJ_2xnn # -- Begin function computeForceLJ_2xnn + .p2align 4, 0x90 + .type computeForceLJ_2xnn,@function +computeForceLJ_2xnn: # + .cfi_startproc +# %bb.0: + cmpl $0, 32(%rdx) + je .LBB3_2 +# %bb.1: + jmp computeForceLJ_2xnn_half # TAILCALL + .p2align 5, 0x90 +.LBB3_2: + jmp computeForceLJ_2xnn_full # TAILCALL +.Lfunc_end3: + .size computeForceLJ_2xnn, .Lfunc_end3-computeForceLJ_2xnn + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_4xn_half +.LCPI4_0: + .quad 0x41cdcd6500000000 # 1.0E+9 + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 +.LCPI4_1: + .long 0xbf000000 # -0.5 +.LCPI4_2: + .long 0x42400000 # 48 + .text + .globl computeForceLJ_4xn_half + .p2align 4, 0x90 + .type computeForceLJ_4xn_half,@function +computeForceLJ_4xn_half: # + .cfi_startproc +# %bb.0: + pushq %r15 + .cfi_def_cfa_offset 16 + pushq %r14 + .cfi_def_cfa_offset 24 + pushq %r13 + .cfi_def_cfa_offset 32 + pushq %r12 + .cfi_def_cfa_offset 40 + pushq %rbx + .cfi_def_cfa_offset 48 + subq $576, %rsp # imm = 0x240 + .cfi_def_cfa_offset 624 + .cfi_offset %rbx, -48 + .cfi_offset %r12, -40 + .cfi_offset %r13, -32 + .cfi_offset %r14, -24 + .cfi_offset %r15, -16 + movq %rdx, %r14 + movq %rsi, %r15 + movq %rdi, %r12 + movl $.L.str.6, %edi + xorl %eax, %eax + callq debug_printf + vmovss 108(%r12), %xmm0 # xmm0 = mem[0],zero,zero,zero + vmovss %xmm0, 64(%rsp) # 4-byte Spill + vbroadcastss 48(%r12), %zmm0 + vmovups %zmm0, 512(%rsp) # 64-byte Spill + vbroadcastss 40(%r12), %zmm0 + vmovupd %zmm0, 448(%rsp) # 64-byte Spill + movl 20(%r15), %r11d + testl %r11d, %r11d + jle .LBB4_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 64(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + vxorps %xmm1, %xmm1, %xmm1 + jmp .LBB4_2 + .p2align 5, 0x90 +.LBB4_16: # in Loop: Header=BB4_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB4_5 +.LBB4_2: # =>This Loop Header: Depth=1 + # Child Loop BB4_11 Depth 2 + # Child Loop BB4_15 Depth 2 + leaq (%rdi,%rdi,8), %rax + leaq (%rax,%rax,2), %rax + addq %rdi, %rax + movl (%r10,%rax), %ecx + testl %ecx, %ecx + jle .LBB4_16 +# %bb.3: # in Loop: Header=BB4_2 Depth=1 + leal (,%rdi,4), %esi + movl %esi, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %esi + orl %eax, %esi + cmpl $7, %ecx + ja .LBB4_10 +# %bb.4: # in Loop: Header=BB4_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + cmpq %rcx, %rbx + jae .LBB4_16 + jmp .LBB4_14 + .p2align 5, 0x90 +.LBB4_10: # in Loop: Header=BB4_2 Depth=1 + leaq (,%rcx,4), %rbx + andq $-32, %rbx + movl %esi, %r13d + leaq (%r9,%r13,4), %rax + xorl %edx, %edx + .p2align 4, 0x90 +.LBB4_11: # Parent Loop BB4_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm0, (%rax,%rdx) + addq $32, %rdx + cmpq %rdx, %rbx + jne .LBB4_11 +# %bb.12: # in Loop: Header=BB4_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + addq %rbx, %r13 + vmovups %zmm1, (%r9,%r13,4) + cmpq %rcx, %rbx + jae .LBB4_16 +.LBB4_14: # in Loop: Header=BB4_2 Depth=1 + movl %esi, %eax + leaq (%r8,%rax,4), %rdx + .p2align 4, 0x90 +.LBB4_15: # Parent Loop BB4_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movl $0, -64(%rdx,%rbx,4) + movl $0, -32(%rdx,%rbx,4) + movl $0, (%rdx,%rbx,4) + incq %rbx + cmpq %rbx, %rcx + jne .LBB4_15 + jmp .LBB4_16 + .p2align 5, 0x90 +.LBB4_5: + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%r15) + jg .LBB4_6 +# %bb.17: + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd 184(%r12), %xmm3 # xmm3 = mem[0],zero + vsubsd (%rsp), %xmm0, %xmm1 # 8-byte Folded Reload + vmovsd %xmm1, (%rsp) # 8-byte Spill + vmulsd .LCPI4_0(%rip), %xmm3, %xmm0 + vmulsd %xmm1, %xmm0, %xmm0 + vxorpd %xmm2, %xmm2, %xmm2 + vdivsd %xmm2, %xmm0, %xmm2 + movl $.L.str.4, %edi + xorl %esi, %esi + vmovapd %xmm3, %xmm0 + movb $3, %al + callq printf + movl $.L.str.7, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + addq $576, %rsp # imm = 0x240 + .cfi_def_cfa_offset 48 + popq %rbx + .cfi_def_cfa_offset 40 + popq %r12 + .cfi_def_cfa_offset 32 + popq %r13 + .cfi_def_cfa_offset 24 + popq %r14 + .cfi_def_cfa_offset 16 + popq %r15 + .cfi_def_cfa_offset 8 + retq + .p2align 5, 0x90 +.LBB4_6: + .cfi_def_cfa_offset 624 + movq 24(%r14), %rax + movl (%rax), %r10d + testl %r10d, %r10d + jle .LBB4_18 +# %bb.7: + vmovss 64(%rsp), %xmm0 # 4-byte Reload + # xmm0 = mem[0],zero,zero,zero + vmulss %xmm0, %xmm0, %xmm0 + vbroadcastss %xmm0, %zmm0 + movq 160(%r15), %rdi + movq 8(%r14), %r11 + vbroadcastss (%rdi), %zmm1 + vmovups %zmm1, (%rsp) # 64-byte Spill + vbroadcastss 4(%rdi), %zmm1 + vmovups %zmm1, 64(%rsp) # 64-byte Spill + vbroadcastss 8(%rdi), %zmm1 + vmovups %zmm1, 384(%rsp) # 64-byte Spill + vbroadcastss 12(%rdi), %zmm1 + vmovups %zmm1, 320(%rsp) # 64-byte Spill + vbroadcastss 32(%rdi), %zmm1 + vmovups %zmm1, 256(%rsp) # 64-byte Spill + vbroadcastss 36(%rdi), %zmm1 + vmovups %zmm1, 192(%rsp) # 64-byte Spill + vbroadcastss 40(%rdi), %zmm1 + vmovups %zmm1, 128(%rsp) # 64-byte Spill + vbroadcastss 44(%rdi), %zmm8 + vbroadcastss 64(%rdi), %zmm9 + vbroadcastss 68(%rdi), %zmm10 + vbroadcastss 72(%rdi), %zmm11 + vbroadcastss 76(%rdi), %zmm12 + decq %r10 + xorl %edx, %edx + movl $248, %r8d + movl $240, %r9d + vbroadcastss .LCPI4_1(%rip), %zmm13 # zmm13 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastss .LCPI4_2(%rip), %zmm14 # zmm14 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + vmovups 512(%rsp), %zmm6 # 64-byte Reload + vmovups 448(%rsp), %zmm7 # 64-byte Reload + .p2align 4, 0x90 +.LBB4_8: # =>This Inner Loop Header: Depth=1 + movslq (%r11,%rdx,4), %rax + movq %rax, %rsi + shlq $5, %rsi + leaq (%rsi,%rsi,2), %rbx + vmovups (%rdi,%rbx), %zmm15 # AlignMOV convert to UnAlignMOV + vmovups 32(%rdi,%rbx), %zmm16 # AlignMOV convert to UnAlignMOV + vmovups 64(%rdi,%rbx), %zmm27 # AlignMOV convert to UnAlignMOV + vmovups (%rsp), %zmm1 # 64-byte Reload + vsubps %zmm15, %zmm1, %zmm24 + vmovups 256(%rsp), %zmm1 # 64-byte Reload + vsubps %zmm16, %zmm1, %zmm25 + vsubps %zmm27, %zmm9, %zmm26 + vmovups 64(%rsp), %zmm1 # 64-byte Reload + vsubps %zmm15, %zmm1, %zmm21 + vmovups 192(%rsp), %zmm1 # 64-byte Reload + vsubps %zmm16, %zmm1, %zmm22 + vsubps %zmm27, %zmm10, %zmm23 + vmovups 384(%rsp), %zmm1 # 64-byte Reload + vsubps %zmm15, %zmm1, %zmm17 + vmovups 128(%rsp), %zmm1 # 64-byte Reload + vsubps %zmm16, %zmm1, %zmm19 + vsubps %zmm27, %zmm11, %zmm20 + vmovups 320(%rsp), %zmm1 # 64-byte Reload + vsubps %zmm15, %zmm1, %zmm18 + vsubps %zmm16, %zmm8, %zmm16 + vsubps %zmm27, %zmm12, %zmm15 + vmulps %zmm26, %zmm26, %zmm27 + vfmadd231ps %zmm25, %zmm25, %zmm27 # zmm27 = (zmm25 * zmm25) + zmm27 + vfmadd231ps %zmm24, %zmm24, %zmm27 # zmm27 = (zmm24 * zmm24) + zmm27 + vmulps %zmm23, %zmm23, %zmm28 + vfmadd231ps %zmm22, %zmm22, %zmm28 # zmm28 = (zmm22 * zmm22) + zmm28 + vfmadd231ps %zmm21, %zmm21, %zmm28 # zmm28 = (zmm21 * zmm21) + zmm28 + vmulps %zmm20, %zmm20, %zmm29 + vfmadd231ps %zmm19, %zmm19, %zmm29 # zmm29 = (zmm19 * zmm19) + zmm29 + vfmadd231ps %zmm17, %zmm17, %zmm29 # zmm29 = (zmm17 * zmm17) + zmm29 + vmulps %zmm15, %zmm15, %zmm30 + vfmadd231ps %zmm16, %zmm16, %zmm30 # zmm30 = (zmm16 * zmm16) + zmm30 + vrcp14ps %zmm27, %zmm31 + vrcp14ps %zmm28, %zmm1 + vrcp14ps %zmm29, %zmm2 + vfmadd231ps %zmm18, %zmm18, %zmm30 # zmm30 = (zmm18 * zmm18) + zmm30 + vrcp14ps %zmm30, %zmm3 + vmulps %zmm31, %zmm6, %zmm4 + vmulps %zmm4, %zmm31, %zmm4 + vmulps %zmm4, %zmm31, %zmm4 + vaddps %zmm13, %zmm4, %zmm5 + vmulps %zmm31, %zmm7, %zmm31 + vmulps %zmm5, %zmm31, %zmm5 + vmulps %zmm1, %zmm6, %zmm31 + vmulps %zmm31, %zmm1, %zmm31 + vmulps %zmm31, %zmm1, %zmm31 + vmulps %zmm5, %zmm4, %zmm4 + vaddps %zmm13, %zmm31, %zmm5 + vmulps %zmm1, %zmm7, %zmm1 + vmulps %zmm5, %zmm1, %zmm1 + vmulps %zmm2, %zmm6, %zmm5 + vmulps %zmm5, %zmm2, %zmm5 + vmulps %zmm5, %zmm2, %zmm5 + vmulps %zmm1, %zmm31, %zmm1 + vaddps %zmm13, %zmm5, %zmm31 + vmulps %zmm2, %zmm7, %zmm2 + vmulps %zmm31, %zmm2, %zmm2 + vmulps %zmm3, %zmm6, %zmm31 + vmulps %zmm31, %zmm3, %zmm31 + vmulps %zmm31, %zmm3, %zmm31 + vmulps %zmm2, %zmm5, %zmm2 + vaddps %zmm13, %zmm31, %zmm5 + vmulps %zmm3, %zmm7, %zmm3 + vmulps %zmm5, %zmm3, %zmm3 + vmulps %zmm3, %zmm31, %zmm3 + xorl %esi, %esi + xorl %edi, %edi + testl $2147483647, %eax # imm = 0x7FFFFFFF + sete %sil + setne %dil + movl $255, %eax + cmovel %r8d, %eax + movl $255, %ecx + cmovel %r9d, %ecx + xorl $255, %esi + kmovd %esi, %k1 + vcmpltps %zmm0, %zmm27, %k1 {%k1} + vmulps %zmm14, %zmm4, %zmm4 + vmulps %zmm4, %zmm24, %zmm5 {%k1} {z} + vmulps %zmm4, %zmm25, %zmm24 {%k1} {z} + vmulps %zmm4, %zmm26, %zmm4 {%k1} {z} + leal (%rdi,%rdi,2), %esi + orl $252, %esi + kmovd %esi, %k1 + vcmpltps %zmm0, %zmm28, %k1 {%k1} + vmulps %zmm14, %zmm1, %zmm1 + vmulps %zmm1, %zmm21, %zmm21 {%k1} {z} + vaddps %zmm21, %zmm5, %zmm5 + vmulps %zmm1, %zmm22, %zmm21 {%k1} {z} + vaddps %zmm21, %zmm24, %zmm21 + vmulps %zmm1, %zmm23, %zmm1 {%k1} {z} + vaddps %zmm1, %zmm4, %zmm1 + kmovd %eax, %k1 + vcmpltps %zmm0, %zmm29, %k1 {%k1} + vmulps %zmm14, %zmm2, %zmm2 + vmulps %zmm2, %zmm17, %zmm4 {%k1} {z} + vmulps %zmm2, %zmm19, %zmm17 {%k1} {z} + vmulps %zmm2, %zmm20, %zmm2 {%k1} {z} + kmovd %ecx, %k1 + vcmpltps %zmm0, %zmm30, %k1 {%k1} + vmulps %zmm14, %zmm3, %zmm3 + vmulps %zmm3, %zmm18, %zmm18 {%k1} {z} + vaddps %zmm18, %zmm4, %zmm4 + vaddps %zmm4, %zmm5, %zmm4 + vmulps %zmm3, %zmm16, %zmm5 {%k1} {z} + vaddps %zmm5, %zmm17, %zmm5 + vaddps %zmm5, %zmm21, %zmm5 + vmulps %zmm3, %zmm15, %zmm3 {%k1} {z} + movq 176(%r15), %rax + vaddps %zmm3, %zmm2, %zmm2 + vmovups (%rax,%rbx), %zmm3 # AlignMOV convert to UnAlignMOV + vsubps %zmm4, %zmm3, %zmm3 + vmovups %zmm3, (%rax,%rbx) # AlignMOV convert to UnAlignMOV + vaddps %zmm2, %zmm1, %zmm1 + vmovups 32(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV + vsubps %zmm5, %zmm2, %zmm2 + vmovups %zmm2, 32(%rax,%rbx) # AlignMOV convert to UnAlignMOV + vmovups 64(%rax,%rbx), %zmm2 # AlignMOV convert to UnAlignMOV + vsubps %zmm1, %zmm2, %zmm1 + vmovups %zmm1, 64(%rax,%rbx) # AlignMOV convert to UnAlignMOV + cmpq %rdx, %r10 + je .LBB4_18 +# %bb.9: # in Loop: Header=BB4_8 Depth=1 + movq 160(%r15), %rdi + incq %rdx + jmp .LBB4_8 + .p2align 5, 0x90 +.LBB4_18: + vzeroupper + callq simd_incr_reduced_sum +.Lfunc_end4: + .size computeForceLJ_4xn_half, .Lfunc_end4-computeForceLJ_4xn_half + .cfi_endproc + # -- End function + .p2align 4, 0x90 # -- Begin function simd_incr_reduced_sum + .type simd_incr_reduced_sum,@function +simd_incr_reduced_sum: # + .cfi_startproc +# %bb.0: + pushq %rax + .cfi_def_cfa_offset 16 + movq stderr(%rip), %rcx + movl $.L.str.8, %edi + movl $92, %esi + movl $1, %edx + callq fwrite@PLT + movl $-1, %edi + callq exit +.Lfunc_end5: + .size simd_incr_reduced_sum, .Lfunc_end5-simd_incr_reduced_sum + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_4xn_full +.LCPI6_0: + .quad 0x41cdcd6500000000 # 1.0E+9 + .text + .globl computeForceLJ_4xn_full + .p2align 4, 0x90 + .type computeForceLJ_4xn_full,@function +computeForceLJ_4xn_full: # + .cfi_startproc +# %bb.0: + pushq %r15 + .cfi_def_cfa_offset 16 + pushq %r14 + .cfi_def_cfa_offset 24 + pushq %r12 + .cfi_def_cfa_offset 32 + pushq %rbx + .cfi_def_cfa_offset 40 + pushq %rax + .cfi_def_cfa_offset 48 + .cfi_offset %rbx, -40 + .cfi_offset %r12, -32 + .cfi_offset %r14, -24 + .cfi_offset %r15, -16 + movq %rsi, %r15 + movq %rdi, %r14 + movl $.L.str.6, %edi + xorl %eax, %eax + callq debug_printf + movl 20(%r15), %r11d + testl %r11d, %r11d + jle .LBB6_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 64(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + vxorps %xmm1, %xmm1, %xmm1 + jmp .LBB6_2 + .p2align 5, 0x90 +.LBB6_13: # in Loop: Header=BB6_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB6_5 +.LBB6_2: # =>This Loop Header: Depth=1 + # Child Loop BB6_8 Depth 2 + # Child Loop BB6_12 Depth 2 + leaq (%rdi,%rdi,8), %rax + leaq (%rax,%rax,2), %rax + addq %rdi, %rax + movl (%r10,%rax), %ecx + testl %ecx, %ecx + jle .LBB6_13 +# %bb.3: # in Loop: Header=BB6_2 Depth=1 + leal (,%rdi,4), %esi + movl %esi, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %esi + orl %eax, %esi + cmpl $7, %ecx + ja .LBB6_7 +# %bb.4: # in Loop: Header=BB6_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + cmpq %rcx, %rbx + jae .LBB6_13 + jmp .LBB6_11 + .p2align 5, 0x90 +.LBB6_7: # in Loop: Header=BB6_2 Depth=1 + leaq (,%rcx,4), %rbx + andq $-32, %rbx + movl %esi, %r12d + leaq (%r9,%r12,4), %rax + xorl %edx, %edx + .p2align 4, 0x90 +.LBB6_8: # Parent Loop BB6_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %ymm0, (%rax,%rdx) + addq $32, %rdx + cmpq %rdx, %rbx + jne .LBB6_8 +# %bb.9: # in Loop: Header=BB6_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + addq %rbx, %r12 + vmovups %zmm1, (%r9,%r12,4) + cmpq %rcx, %rbx + jae .LBB6_13 +.LBB6_11: # in Loop: Header=BB6_2 Depth=1 + movl %esi, %eax + leaq (%r8,%rax,4), %rdx + .p2align 4, 0x90 +.LBB6_12: # Parent Loop BB6_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movl $0, -64(%rdx,%rbx,4) + movl $0, -32(%rdx,%rbx,4) + movl $0, (%rdx,%rbx,4) + incq %rbx + cmpq %rbx, %rcx + jne .LBB6_12 + jmp .LBB6_13 + .p2align 5, 0x90 +.LBB6_5: + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%r15) + jg .LBB6_6 +# %bb.14: + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd 184(%r14), %xmm3 # xmm3 = mem[0],zero + vsubsd (%rsp), %xmm0, %xmm1 # 8-byte Folded Reload + vmovsd %xmm1, (%rsp) # 8-byte Spill + vmulsd .LCPI6_0(%rip), %xmm3, %xmm0 + vmulsd %xmm1, %xmm0, %xmm0 + vxorpd %xmm2, %xmm2, %xmm2 + vdivsd %xmm2, %xmm0, %xmm2 + movl $.L.str.4, %edi + xorl %esi, %esi + vmovapd %xmm3, %xmm0 + movb $3, %al + callq printf + movl $.L.str.7, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + addq $8, %rsp + .cfi_def_cfa_offset 40 + popq %rbx + .cfi_def_cfa_offset 32 + popq %r12 + .cfi_def_cfa_offset 24 + popq %r14 + .cfi_def_cfa_offset 16 + popq %r15 + .cfi_def_cfa_offset 8 + retq + .p2align 5, 0x90 +.LBB6_6: + .cfi_def_cfa_offset 48 + callq simd_incr_reduced_sum +.Lfunc_end6: + .size computeForceLJ_4xn_full, .Lfunc_end6-computeForceLJ_4xn_full + .cfi_endproc + # -- End function + .globl computeForceLJ_4xn # -- Begin function computeForceLJ_4xn + .p2align 4, 0x90 + .type computeForceLJ_4xn,@function +computeForceLJ_4xn: # + .cfi_startproc +# %bb.0: + cmpl $0, 32(%rdx) + je .LBB7_2 +# %bb.1: + jmp computeForceLJ_4xn_half # TAILCALL + .p2align 5, 0x90 +.LBB7_2: + jmp computeForceLJ_4xn_full # TAILCALL +.Lfunc_end7: + .size computeForceLJ_4xn, .Lfunc_end7-computeForceLJ_4xn + .cfi_endproc + # -- End function + .type .L.str,@object # + .section .rodata.str1.1,"aMS",@progbits,1 +.L.str: + .asciz "computeForceLJ begin\n" + .size .L.str, 22 + .type .L.str.1,@object # +.L.str.1: + .asciz "force" + .size .L.str.1, 6 + .type .L.str.2,@object # +.L.str.2: + .asciz "computeForceLJ end\n" + .size .L.str.2, 20 + .type .L.str.3,@object # +.L.str.3: + .asciz "computeForceLJ_2xnn begin\n" + .size .L.str.3, 27 + .type .L.str.4,@object # +.L.str.4: + .asciz "Its: %u Freq: %f Time: %f\nCy/it: %f\n" + .size .L.str.4, 39 + .type .L.str.5,@object # +.L.str.5: + .asciz "computeForceLJ_2xnn end\n" + .size .L.str.5, 25 + .type .L.str.6,@object # +.L.str.6: + .asciz "computeForceLJ_4xn begin\n" + .size .L.str.6, 26 + .type .L.str.7,@object # +.L.str.7: + .asciz "computeForceLJ_4xn end\n" + .size .L.str.7, 24 + .type .L.str.8,@object # +.L.str.8: + .asciz "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n" + .size .L.str.8, 93 + .ident "Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.1.0.20220316)" + .section ".note.GNU-stack","",@progbits diff --git a/static_analysis/jan/icx-icx-gromacs-avx512.s b/static_analysis/jan/icx-icx-gromacs-avx512.s new file mode 100644 index 0000000..f9863ad --- /dev/null +++ b/static_analysis/jan/icx-icx-gromacs-avx512.s @@ -0,0 +1,2453 @@ + .text + .file "force_lj.c" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_ref +.LCPI0_0: + .quad 0x4048000000000000 # 48 +.LCPI0_1: + .quad 0x3ff0000000000000 # 1 +.LCPI0_2: + .quad 0xbfe0000000000000 # -0.5 +.LCPI0_3: + .quad 0x3fe0000000000000 # 0.5 + .text + .globl computeForceLJ_ref + .p2align 4, 0x90 + .type computeForceLJ_ref,@function +computeForceLJ_ref: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $152, %rsp + .cfi_def_cfa_offset 208 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 16(%rsp) # 8-byte Spill + movq %rdx, 24(%rsp) # 8-byte Spill + movq %rsi, %r14 + movq %rdi, %r12 + movl $.L.str, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + vmovsd 40(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, (%rsp) # 8-byte Spill + vmovsd 56(%r12), %xmm1 # xmm1 = mem[0],zero + movl 20(%r14), %r11d + testl %r11d, %r11d + jle .LBB0_5 +# %bb.1: + movq 176(%r14), %r9 + movq 192(%r14), %r10 + decq %r11 + leaq 128(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + jmp .LBB0_2 + .p2align 5, 0x90 +.LBB0_79: # in Loop: Header=BB0_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB0_5 +.LBB0_2: # =>This Loop Header: Depth=1 + # Child Loop BB0_74 Depth 2 + # Child Loop BB0_78 Depth 2 + imulq $56, %rdi, %rax + movl (%r10,%rax), %esi + testl %esi, %esi + jle .LBB0_79 +# %bb.3: # in Loop: Header=BB0_2 Depth=1 + leal (,%rdi,4), %ebx + movl %ebx, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %ebx + orl %eax, %ebx + cmpl $7, %esi + ja .LBB0_73 +# %bb.4: # in Loop: Header=BB0_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + cmpq %rsi, %rbp + jae .LBB0_79 + jmp .LBB0_77 + .p2align 5, 0x90 +.LBB0_73: # in Loop: Header=BB0_2 Depth=1 + leaq (,%rsi,8), %rbp + andq $-64, %rbp + movl %ebx, %ecx + leaq (%r9,%rcx,8), %rdx + xorl %eax, %eax + .p2align 4, 0x90 +.LBB0_74: # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %zmm0, (%rdx,%rax) + addq $64, %rax + cmpq %rax, %rbp + jne .LBB0_74 +# %bb.75: # in Loop: Header=BB0_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + addq %rbp, %rcx + vmovupd %zmm0, (%r9,%rcx,8) + vmovupd %zmm0, 64(%r9,%rcx,8) + cmpq %rsi, %rbp + jae .LBB0_79 +.LBB0_77: # in Loop: Header=BB0_2 Depth=1 + movl %ebx, %eax + leaq (%r8,%rax,8), %rcx + .p2align 4, 0x90 +.LBB0_78: # Parent Loop BB0_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, -128(%rcx,%rbp,8) + movq $0, -64(%rcx,%rbp,8) + movq $0, (%rcx,%rbp,8) + incq %rbp + cmpq %rbp, %rsi + jne .LBB0_78 + jmp .LBB0_79 + .p2align 5, 0x90 +.LBB0_5: + vmovsd %xmm1, 48(%rsp) # 8-byte Spill + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 56(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + movl 20(%r14), %esi + testl %esi, %esi + jle .LBB0_17 +# %bb.6: + vmovsd 8(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm13 + movq 16(%rsp), %rax # 8-byte Reload + leaq 32(%rax), %r15 + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd .LCPI0_0(%rip), %xmm0, %xmm12 + leaq 24(%rax), %rdx + movq 160(%r14), %rdi + movq 176(%r14), %rbp + movq 24(%rsp), %rcx # 8-byte Reload + movq 8(%rcx), %rbx + movq %rbx, 72(%rsp) # 8-byte Spill + movq 24(%rcx), %rbx + movq %rbx, 96(%rsp) # 8-byte Spill + movslq 16(%rcx), %rcx + movq %rcx, 64(%rsp) # 8-byte Spill + vmovdqu 8(%rax), %xmm9 + leal -1(%rsi), %ecx + addq (%rax), %rcx + movq %rcx, 40(%rsp) # 8-byte Spill + movq %rbp, 80(%rsp) # 8-byte Spill + leaq 128(%rbp), %rax + movq %rax, 128(%rsp) # 8-byte Spill + movq %rdi, 32(%rsp) # 8-byte Spill + leaq 128(%rdi), %rax + movq %rax, 120(%rsp) # 8-byte Spill + xorl %edi, %edi + vmovsd .LCPI0_1(%rip), %xmm10 # xmm10 = mem[0],zero + vmovsd .LCPI0_2(%rip), %xmm11 # xmm11 = mem[0],zero + vmovsd .LCPI0_3(%rip), %xmm8 # xmm8 = mem[0],zero + vmovsd 48(%rsp), %xmm20 # 8-byte Reload + # xmm20 = mem[0],zero + movq %rsi, 88(%rsp) # 8-byte Spill + jmp .LBB0_7 + .p2align 5, 0x90 +.LBB0_19: # in Loop: Header=BB0_7 Depth=1 + movq 88(%rsp), %rsi # 8-byte Reload + movq 112(%rsp), %rdi # 8-byte Reload + movq 104(%rsp), %rbp # 8-byte Reload +.LBB0_20: # in Loop: Header=BB0_7 Depth=1 + vcvtsi2sd %ebp, %xmm21, %xmm0 + vmulsd %xmm0, %xmm8, %xmm0 + vcvttsd2si %xmm0, %rax + vmovq %rax, %xmm0 + vmovq %rcx, %xmm1 + vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] + vpaddq %xmm0, %xmm9, %xmm9 + incq %rdi + cmpq %rsi, %rdi + je .LBB0_16 +.LBB0_7: # =>This Loop Header: Depth=1 + # Child Loop BB0_9 Depth 2 + # Child Loop BB0_10 Depth 3 + # Child Loop BB0_12 Depth 4 + movq 96(%rsp), %rax # 8-byte Reload + movslq (%rax,%rdi,4), %rbp + movq %rbp, %rcx + testq %rbp, %rbp + jle .LBB0_20 +# %bb.8: # in Loop: Header=BB0_7 Depth=1 + movl %edi, %r13d + shrl %r13d + leal (,%r13,8), %eax + leal (%rax,%rax,2), %eax + leal (,%rdi,4), %ecx + andl $4, %ecx + orl %ecx, %eax + movq 32(%rsp), %rsi # 8-byte Reload + leaq (%rsi,%rax,8), %r8 + movq 80(%rsp), %rsi # 8-byte Reload + leaq (%rsi,%rax,8), %r11 + movq %rdi, 112(%rsp) # 8-byte Spill + movq %rdi, %rax + imulq 64(%rsp), %rax # 8-byte Folded Reload + movq 72(%rsp), %rsi # 8-byte Reload + leaq (%rsi,%rax,4), %rax + movq %rax, 136(%rsp) # 8-byte Spill + movq 24(%rsp), %rax # 8-byte Reload + movl 32(%rax), %eax + movl %eax, (%rsp) # 4-byte Spill + movl %ecx, %r12d + movq %rbp, 104(%rsp) # 8-byte Spill + movl %ebp, %ecx + xorl %esi, %esi + movq %rcx, 144(%rsp) # 8-byte Spill + jmp .LBB0_9 + .p2align 5, 0x90 +.LBB0_18: # in Loop: Header=BB0_9 Depth=2 + movq 8(%rsp), %rsi # 8-byte Reload + incq %rsi + movq 144(%rsp), %rcx # 8-byte Reload + cmpq %rcx, %rsi + je .LBB0_19 +.LBB0_9: # Parent Loop BB0_7 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_10 Depth 3 + # Child Loop BB0_12 Depth 4 + movq 136(%rsp), %rax # 8-byte Reload + movq %rsi, 8(%rsp) # 8-byte Spill + movslq (%rax,%rsi,4), %r10 + movq %r10, %rax + shlq $6, %rax + leaq (%rax,%rax,2), %rdi + movq 32(%rsp), %rax # 8-byte Reload + addq %rdi, %rax + movq 128(%rsp), %rcx # 8-byte Reload + leaq (%rcx,%rdi), %rsi + addq 120(%rsp), %rdi # 8-byte Folded Reload + xorl %r9d, %r9d + xorl %r14d, %r14d + jmp .LBB0_10 + .p2align 5, 0x90 +.LBB0_67: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm3, %xmm10, %xmm3 + vmulsd %xmm20, %xmm3, %xmm14 + vmulsd %xmm3, %xmm3, %xmm4 + vmulsd %xmm4, %xmm14, %xmm4 + vaddsd %xmm4, %xmm11, %xmm14 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm14, %xmm4 + vmulsd %xmm3, %xmm4, %xmm3 + vfmadd231sd %xmm2, %xmm3, %xmm7 # xmm7 = (xmm3 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm3, %xmm5 # xmm5 = (xmm3 * xmm1) + xmm5 + vfmadd231sd %xmm0, %xmm3, %xmm19 # xmm19 = (xmm3 * xmm0) + xmm19 + movl $1, %r14d + movq %rdx, %rbx +.LBB0_68: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) + .p2align 4, 0x90 +.LBB0_69: # in Loop: Header=BB0_10 Depth=3 + xorl %ecx, %ecx + testl %r14d, %r14d + sete %cl + movq 16(%rsp), %rbx # 8-byte Reload + incq 40(%rbx,%rcx,8) + vaddsd (%r11,%r9,8), %xmm7, %xmm0 + vmovsd %xmm0, (%r11,%r9,8) + vaddsd 64(%r11,%r9,8), %xmm5, %xmm0 + vmovsd %xmm0, 64(%r11,%r9,8) + vaddsd 128(%r11,%r9,8), %xmm19, %xmm0 + vmovsd %xmm0, 128(%r11,%r9,8) + incq %r9 + cmpq $4, %r9 + je .LBB0_18 +.LBB0_10: # Parent Loop BB0_7 Depth=1 + # Parent Loop BB0_9 Depth=2 + # => This Loop Header: Depth=3 + # Child Loop BB0_12 Depth 4 + vmovsd (%r8,%r9,8), %xmm14 # xmm14 = mem[0],zero + leaq (%r9,%r12), %rbp + vmovsd 64(%r8,%r9,8), %xmm16 # xmm16 = mem[0],zero + vmovsd 128(%r8,%r9,8), %xmm18 # xmm18 = mem[0],zero + cmpl $0, (%rsp) # 4-byte Folded Reload + je .LBB0_21 +# %bb.11: # in Loop: Header=BB0_10 Depth=3 + vxorpd %xmm7, %xmm7, %xmm7 + xorl %ebx, %ebx + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm19, %xmm19, %xmm19 + jmp .LBB0_12 + .p2align 5, 0x90 +.LBB0_70: # in Loop: Header=BB0_12 Depth=4 + vdivsd %xmm3, %xmm10, %xmm3 + vmulsd %xmm20, %xmm3, %xmm4 + vmulsd %xmm3, %xmm3, %xmm6 + vmulsd %xmm4, %xmm6, %xmm4 + vaddsd %xmm4, %xmm11, %xmm6 + vmulsd %xmm3, %xmm12, %xmm3 + vmulsd %xmm4, %xmm6, %xmm4 + vmovsd -64(%rsi,%rbx,8), %xmm6 # xmm6 = mem[0],zero + vmulsd %xmm3, %xmm4, %xmm3 + vmovsd -128(%rsi,%rbx,8), %xmm4 # xmm4 = mem[0],zero + vfnmadd231sd %xmm3, %xmm2, %xmm4 # xmm4 = -(xmm2 * xmm3) + xmm4 + vmovsd %xmm4, -128(%rsi,%rbx,8) + vfnmadd231sd %xmm3, %xmm0, %xmm6 # xmm6 = -(xmm0 * xmm3) + xmm6 + vmovsd %xmm6, -64(%rsi,%rbx,8) + vmovsd (%rsi,%rbx,8), %xmm4 # xmm4 = mem[0],zero + vfnmadd231sd %xmm3, %xmm1, %xmm4 # xmm4 = -(xmm1 * xmm3) + xmm4 + vmovsd %xmm4, (%rsi,%rbx,8) + vfmadd231sd %xmm2, %xmm3, %xmm7 # xmm7 = (xmm3 * xmm2) + xmm7 + vfmadd231sd %xmm0, %xmm3, %xmm5 # xmm5 = (xmm3 * xmm0) + xmm5 + vfmadd231sd %xmm3, %xmm1, %xmm19 # xmm19 = (xmm1 * xmm3) + xmm19 + movl $1, %r14d + movq %rdx, %rcx +.LBB0_71: # in Loop: Header=BB0_12 Depth=4 + incq (%rcx) +.LBB0_72: # in Loop: Header=BB0_12 Depth=4 + incq %rbx + cmpq $8, %rbx + je .LBB0_69 +.LBB0_12: # Parent Loop BB0_7 Depth=1 + # Parent Loop BB0_9 Depth=2 + # Parent Loop BB0_10 Depth=3 + # => This Inner Loop Header: Depth=4 + cmpl %r10d, %r13d + jne .LBB0_14 +# %bb.13: # in Loop: Header=BB0_12 Depth=4 + cmpq %rbx, %rbp + jae .LBB0_72 +.LBB0_14: # in Loop: Header=BB0_12 Depth=4 + vsubsd -128(%rdi,%rbx,8), %xmm14, %xmm2 + vsubsd -64(%rdi,%rbx,8), %xmm16, %xmm0 + vsubsd (%rdi,%rbx,8), %xmm18, %xmm1 + vmulsd %xmm2, %xmm2, %xmm3 + vfmadd231sd %xmm0, %xmm0, %xmm3 # xmm3 = (xmm0 * xmm0) + xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vucomisd %xmm13, %xmm3 + jb .LBB0_70 +# %bb.15: # in Loop: Header=BB0_12 Depth=4 + movq %r15, %rcx + jmp .LBB0_71 + .p2align 5, 0x90 +.LBB0_21: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_24 +# %bb.22: # in Loop: Header=BB0_10 Depth=3 + vxorpd %xmm19, %xmm19, %xmm19 + testq %rbp, %rbp + jne .LBB0_24 +# %bb.23: # in Loop: Header=BB0_10 Depth=3 + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm7, %xmm7, %xmm7 + cmpl %r10d, %r13d + je .LBB0_28 + jmp .LBB0_29 + .p2align 5, 0x90 +.LBB0_24: # in Loop: Header=BB0_10 Depth=3 + vsubsd (%rax), %xmm14, %xmm15 + vsubsd 64(%rax), %xmm16, %xmm1 + vsubsd 128(%rax), %xmm18, %xmm2 + vmulsd %xmm15, %xmm15, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm2, %xmm2, %xmm0 # xmm0 = (xmm2 * xmm2) + xmm0 + vxorpd %xmm19, %xmm19, %xmm19 + vucomisd %xmm13, %xmm0 + movq %r15, %rbx + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm7, %xmm7, %xmm7 + jae .LBB0_26 +# %bb.25: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm3 + vmulsd %xmm0, %xmm0, %xmm5 + vmulsd %xmm3, %xmm5, %xmm3 + vaddsd %xmm3, %xmm11, %xmm5 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm5, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vmulsd %xmm0, %xmm15, %xmm7 + vmulsd %xmm1, %xmm0, %xmm5 + vmulsd %xmm2, %xmm0, %xmm19 + movl $1, %r14d + movq %rdx, %rbx +.LBB0_26: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) + cmpl %r10d, %r13d + jne .LBB0_29 +.LBB0_28: # in Loop: Header=BB0_10 Depth=3 + cmpq $1, %rbp + je .LBB0_33 +.LBB0_29: # in Loop: Header=BB0_10 Depth=3 + vsubsd 8(%rax), %xmm14, %xmm2 + vsubsd 72(%rax), %xmm16, %xmm1 + vsubsd 136(%rax), %xmm18, %xmm15 + vmulsd %xmm2, %xmm2, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomisd %xmm13, %xmm0 + jae .LBB0_30 +# %bb.31: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm17 + vmulsd %xmm0, %xmm0, %xmm3 + vmulsd %xmm17, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vfmadd231sd %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231sd %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_32 + .p2align 5, 0x90 +.LBB0_30: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_32: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_33: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_35 +# %bb.34: # in Loop: Header=BB0_10 Depth=3 + cmpq $2, %rbp + je .LBB0_39 +.LBB0_35: # in Loop: Header=BB0_10 Depth=3 + vsubsd 16(%rax), %xmm14, %xmm2 + vsubsd 80(%rax), %xmm16, %xmm1 + vsubsd 144(%rax), %xmm18, %xmm15 + vmulsd %xmm2, %xmm2, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomisd %xmm13, %xmm0 + jae .LBB0_36 +# %bb.37: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm17 + vmulsd %xmm0, %xmm0, %xmm3 + vmulsd %xmm17, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vfmadd231sd %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231sd %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_38 + .p2align 5, 0x90 +.LBB0_36: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_38: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_39: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_41 +# %bb.40: # in Loop: Header=BB0_10 Depth=3 + cmpq $3, %rbp + je .LBB0_45 +.LBB0_41: # in Loop: Header=BB0_10 Depth=3 + vsubsd 24(%rax), %xmm14, %xmm2 + vsubsd 88(%rax), %xmm16, %xmm1 + vsubsd 152(%rax), %xmm18, %xmm15 + vmulsd %xmm2, %xmm2, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomisd %xmm13, %xmm0 + jae .LBB0_42 +# %bb.43: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm17 + vmulsd %xmm0, %xmm0, %xmm3 + vmulsd %xmm17, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vfmadd231sd %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231sd %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_44 + .p2align 5, 0x90 +.LBB0_42: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_44: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_45: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_47 +# %bb.46: # in Loop: Header=BB0_10 Depth=3 + cmpq $4, %rbp + je .LBB0_51 +.LBB0_47: # in Loop: Header=BB0_10 Depth=3 + vsubsd 32(%rax), %xmm14, %xmm2 + vsubsd 96(%rax), %xmm16, %xmm1 + vsubsd 160(%rax), %xmm18, %xmm15 + vmulsd %xmm2, %xmm2, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomisd %xmm13, %xmm0 + jae .LBB0_48 +# %bb.49: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm17 + vmulsd %xmm0, %xmm0, %xmm3 + vmulsd %xmm17, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vfmadd231sd %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231sd %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_50 + .p2align 5, 0x90 +.LBB0_48: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_50: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_51: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_53 +# %bb.52: # in Loop: Header=BB0_10 Depth=3 + cmpq $5, %rbp + je .LBB0_57 +.LBB0_53: # in Loop: Header=BB0_10 Depth=3 + vsubsd 40(%rax), %xmm14, %xmm2 + vsubsd 104(%rax), %xmm16, %xmm1 + vsubsd 168(%rax), %xmm18, %xmm15 + vmulsd %xmm2, %xmm2, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomisd %xmm13, %xmm0 + jae .LBB0_54 +# %bb.55: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm17 + vmulsd %xmm0, %xmm0, %xmm3 + vmulsd %xmm17, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vfmadd231sd %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231sd %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_56 + .p2align 5, 0x90 +.LBB0_54: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_56: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_57: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_59 +# %bb.58: # in Loop: Header=BB0_10 Depth=3 + cmpq $6, %rbp + je .LBB0_63 +.LBB0_59: # in Loop: Header=BB0_10 Depth=3 + vsubsd 48(%rax), %xmm14, %xmm2 + vsubsd 112(%rax), %xmm16, %xmm1 + vsubsd 176(%rax), %xmm18, %xmm15 + vmulsd %xmm2, %xmm2, %xmm0 + vfmadd231sd %xmm1, %xmm1, %xmm0 # xmm0 = (xmm1 * xmm1) + xmm0 + vfmadd231sd %xmm15, %xmm15, %xmm0 # xmm0 = (xmm15 * xmm15) + xmm0 + vucomisd %xmm13, %xmm0 + jae .LBB0_60 +# %bb.61: # in Loop: Header=BB0_10 Depth=3 + vdivsd %xmm0, %xmm10, %xmm0 + vmulsd %xmm20, %xmm0, %xmm17 + vmulsd %xmm0, %xmm0, %xmm3 + vmulsd %xmm17, %xmm3, %xmm3 + vaddsd %xmm11, %xmm3, %xmm17 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm3, %xmm17, %xmm3 + vmulsd %xmm0, %xmm3, %xmm0 + vfmadd231sd %xmm2, %xmm0, %xmm7 # xmm7 = (xmm0 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vfmadd231sd %xmm15, %xmm0, %xmm19 # xmm19 = (xmm0 * xmm15) + xmm19 + movl $1, %r14d + movq %rdx, %rbx + jmp .LBB0_62 + .p2align 5, 0x90 +.LBB0_60: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx +.LBB0_62: # in Loop: Header=BB0_10 Depth=3 + incq (%rbx) +.LBB0_63: # in Loop: Header=BB0_10 Depth=3 + cmpl %r10d, %r13d + jne .LBB0_65 +# %bb.64: # in Loop: Header=BB0_10 Depth=3 + cmpq $7, %rbp + je .LBB0_69 +.LBB0_65: # in Loop: Header=BB0_10 Depth=3 + vsubsd 56(%rax), %xmm14, %xmm2 + vsubsd 120(%rax), %xmm16, %xmm1 + vsubsd 184(%rax), %xmm18, %xmm0 + vmulsd %xmm2, %xmm2, %xmm3 + vfmadd231sd %xmm1, %xmm1, %xmm3 # xmm3 = (xmm1 * xmm1) + xmm3 + vfmadd231sd %xmm0, %xmm0, %xmm3 # xmm3 = (xmm0 * xmm0) + xmm3 + vucomisd %xmm13, %xmm3 + jb .LBB0_67 +# %bb.66: # in Loop: Header=BB0_10 Depth=3 + movq %r15, %rbx + jmp .LBB0_68 + .p2align 5, 0x90 +.LBB0_16: + movq 40(%rsp), %rcx # 8-byte Reload + incq %rcx + movq 16(%rsp), %rax # 8-byte Reload + movq %rcx, (%rax) + vmovdqu %xmm9, 8(%rax) +.LBB0_17: + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, (%rsp) # 8-byte Spill + movl $.L.str.2, %edi + xorl %eax, %eax + callq debug_printf + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 56(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $152, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end0: + .size computeForceLJ_ref, .Lfunc_end0-computeForceLJ_ref + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_2xnn_half +.LCPI1_0: + .quad 0xbfe0000000000000 # -0.5 +.LCPI1_1: + .quad 0x4048000000000000 # 48 +.LCPI1_4: + .quad 0x3fe0000000000000 # 0.5 + .section .rodata,"a",@progbits + .p2align 6 +.LCPI1_2: + .quad 2 # 0x2 + .quad 3 # 0x3 + .quad 8 # 0x8 + .quad 9 # 0x9 + .quad 6 # 0x6 + .quad 7 # 0x7 + .quad 12 # 0xc + .quad 13 # 0xd + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_3: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_2xnn_half + .p2align 4, 0x90 + .type computeForceLJ_2xnn_half,@function +computeForceLJ_2xnn_half: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $200, %rsp + .cfi_def_cfa_offset 256 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 32(%rsp) # 8-byte Spill + movq %rdx, %r14 + movq %rsi, %rbx + movq %rdi, %r12 + movl $.L.str.3, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, (%rsp) # 8-byte Spill + vbroadcastsd 56(%r12), %zmm1 + vbroadcastsd 40(%r12), %zmm0 + vmovupd %zmm0, 64(%rsp) # 64-byte Spill + movq %rbx, %r15 + movl 20(%rbx), %r11d + testl %r11d, %r11d + jle .LBB1_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 128(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + jmp .LBB1_2 + .p2align 5, 0x90 +.LBB1_21: # in Loop: Header=BB1_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB1_5 +.LBB1_2: # =>This Loop Header: Depth=1 + # Child Loop BB1_16 Depth 2 + # Child Loop BB1_20 Depth 2 + imulq $56, %rdi, %rax + movl (%r10,%rax), %esi + testl %esi, %esi + jle .LBB1_21 +# %bb.3: # in Loop: Header=BB1_2 Depth=1 + leal (,%rdi,4), %ebx + movl %ebx, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %ebx + orl %eax, %ebx + cmpl $7, %esi + ja .LBB1_15 +# %bb.4: # in Loop: Header=BB1_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + cmpq %rsi, %rbp + jae .LBB1_21 + jmp .LBB1_19 + .p2align 5, 0x90 +.LBB1_15: # in Loop: Header=BB1_2 Depth=1 + leaq (,%rsi,8), %rbp + andq $-64, %rbp + movl %ebx, %ecx + leaq (%r9,%rcx,8), %rdx + xorl %eax, %eax + .p2align 4, 0x90 +.LBB1_16: # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %zmm0, (%rdx,%rax) + addq $64, %rax + cmpq %rax, %rbp + jne .LBB1_16 +# %bb.17: # in Loop: Header=BB1_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + addq %rbp, %rcx + vmovupd %zmm0, (%r9,%rcx,8) + vmovupd %zmm0, 64(%r9,%rcx,8) + cmpq %rsi, %rbp + jae .LBB1_21 +.LBB1_19: # in Loop: Header=BB1_2 Depth=1 + movl %ebx, %eax + leaq (%r8,%rax,8), %rcx + .p2align 4, 0x90 +.LBB1_20: # Parent Loop BB1_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, -128(%rcx,%rbp,8) + movq $0, -64(%rcx,%rbp,8) + movq $0, (%rcx,%rbp,8) + incq %rbp + cmpq %rbp, %rsi + jne .LBB1_20 + jmp .LBB1_21 + .p2align 5, 0x90 +.LBB1_5: + xorl %r13d, %r13d + xorl %eax, %eax + vmovupd %zmm1, 128(%rsp) # 64-byte Spill + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + vmovupd 128(%rsp), %zmm31 # 64-byte Reload + cmpl $0, 20(%r15) + jle .LBB1_10 +# %bb.6: + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm0 + vbroadcastsd %xmm0, %zmm0 + vbroadcastsd .LCPI1_0(%rip), %zmm1 # zmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastsd .LCPI1_1(%rip), %zmm2 # zmm2 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + movb $-52, %al + kmovb %eax, %k1 + vmovupd .LCPI1_2(%rip), %zmm3 # zmm3 = [2,3,8,9,6,7,12,13] + # AlignMOV convert to UnAlignMOV + vmovsd .LCPI1_4(%rip), %xmm5 # xmm5 = mem[0],zero + movq %r14, 24(%rsp) # 8-byte Spill + movq %r15, 8(%rsp) # 8-byte Spill + movl $248, %ebp + jmp .LBB1_7 + .p2align 5, 0x90 +.LBB1_13: # in Loop: Header=BB1_7 Depth=1 + movq 24(%rsp), %r14 # 8-byte Reload + movq 8(%rsp), %r15 # 8-byte Reload + movq (%rsp), %rcx # 8-byte Reload + movq 56(%rsp), %rdx # 8-byte Reload + movq 48(%rsp), %r8 # 8-byte Reload + movq 40(%rsp), %rax # 8-byte Reload +.LBB1_9: # in Loop: Header=BB1_7 Depth=1 + vblendmpd %zmm15, %zmm13, %zmm8 {%k1} + vpermt2pd %zmm15, %zmm3, %zmm13 + vaddpd %zmm13, %zmm8, %zmm8 + vpermilpd $85, %zmm8, %zmm11 # zmm11 = zmm8[1,0,3,2,5,4,7,6] + vaddpd %zmm11, %zmm8, %zmm8 + vextractf64x4 $1, %zmm8, %ymm11 + vblendpd $10, %ymm11, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm11[1],ymm8[2],ymm11[3] + vaddpd (%r8,%rcx,8), %ymm8, %ymm8 + vmovupd %ymm8, (%r8,%rcx,8) # AlignMOV convert to UnAlignMOV + vblendmpd %zmm10, %zmm9, %zmm8 {%k1} + vpermt2pd %zmm10, %zmm3, %zmm9 + vaddpd %zmm9, %zmm8, %zmm8 + vpermilpd $85, %zmm8, %zmm9 # zmm9 = zmm8[1,0,3,2,5,4,7,6] + vaddpd %zmm9, %zmm8, %zmm8 + vextractf64x4 $1, %zmm8, %ymm9 + vblendpd $10, %ymm9, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm9[1],ymm8[2],ymm9[3] + vaddpd 64(%r8,%rcx,8), %ymm8, %ymm8 + vmovupd %ymm8, 64(%r8,%rcx,8) # AlignMOV convert to UnAlignMOV + vblendmpd %zmm6, %zmm7, %zmm8 {%k1} + vpermt2pd %zmm6, %zmm3, %zmm7 + vaddpd %zmm7, %zmm8, %zmm6 + vpermilpd $85, %zmm6, %zmm7 # zmm7 = zmm6[1,0,3,2,5,4,7,6] + vaddpd %zmm7, %zmm6, %zmm6 + vextractf64x4 $1, %zmm6, %ymm7 + vblendpd $10, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3] + vaddpd 128(%r8,%rcx,8), %ymm6, %ymm6 + vmovupd %ymm6, 128(%r8,%rcx,8) # AlignMOV convert to UnAlignMOV + vmovdqu .LCPI1_3(%rip), %xmm4 # xmm4 = <1,u> + # AlignMOV convert to UnAlignMOV + vpinsrq $1, %rax, %xmm4, %xmm6 + movq 32(%rsp), %rcx # 8-byte Reload + vpaddq (%rcx), %xmm6, %xmm6 + vmovdqu %xmm6, (%rcx) + vcvtsi2sd %edx, %xmm12, %xmm6 + vmulsd %xmm5, %xmm6, %xmm6 + vcvttsd2si %xmm6, %rax + addq %rax, 16(%rcx) + incq %r13 + movslq 20(%r15), %rax + cmpq %rax, %r13 + jge .LBB1_10 +.LBB1_7: # =>This Loop Header: Depth=1 + # Child Loop BB1_12 Depth 2 + leal (,%r13,4), %ecx + movl %ecx, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %ecx + orl %eax, %ecx + movq 176(%r15), %r8 + movq 24(%r14), %rax + movslq (%rax,%r13,4), %rdx + testq %rdx, %rdx + jle .LBB1_8 +# %bb.11: # in Loop: Header=BB1_7 Depth=1 + movq 160(%r15), %r15 + vbroadcastsd (%r15,%rcx,8), %ymm6 + movq 8(%r14), %rax + vbroadcastsd 8(%r15,%rcx,8), %ymm7 + vinsertf64x4 $1, %ymm7, %zmm6, %zmm8 + vbroadcastsd 16(%r15,%rcx,8), %ymm6 + vbroadcastsd 24(%r15,%rcx,8), %ymm7 + vbroadcastsd 64(%r15,%rcx,8), %ymm9 + vbroadcastsd 72(%r15,%rcx,8), %ymm10 + vinsertf64x4 $1, %ymm7, %zmm6, %zmm11 + vinsertf64x4 $1, %ymm10, %zmm9, %zmm12 + vbroadcastsd 80(%r15,%rcx,8), %ymm6 + vbroadcastsd 88(%r15,%rcx,8), %ymm7 + vinsertf64x4 $1, %ymm7, %zmm6, %zmm14 + vbroadcastsd 128(%r15,%rcx,8), %ymm6 + vbroadcastsd 136(%r15,%rcx,8), %ymm7 + vinsertf64x4 $1, %ymm7, %zmm6, %zmm16 + vbroadcastsd 144(%r15,%rcx,8), %ymm6 + movq %rcx, (%rsp) # 8-byte Spill + vbroadcastsd 152(%r15,%rcx,8), %ymm7 + vinsertf64x4 $1, %ymm7, %zmm6, %zmm17 + movq %rdx, 56(%rsp) # 8-byte Spill + movl %edx, %edx + movl 16(%r14), %ecx + imull %r13d, %ecx + movslq %ecx, %rcx + leaq (%rax,%rcx,4), %r11 + movq %rdx, 40(%rsp) # 8-byte Spill + leaq -1(%rdx), %r12 + vxorpd %xmm13, %xmm13, %xmm13 + movq %r8, 48(%rsp) # 8-byte Spill + xorl %r9d, %r9d + vxorpd %xmm9, %xmm9, %xmm9 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm6, %xmm6, %xmm6 + vmovupd 64(%rsp), %zmm4 # 64-byte Reload + .p2align 4, 0x90 +.LBB1_12: # Parent Loop BB1_7 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%r11,%r9,4), %rdx + leal (%rdx,%rdx), %r10d + movq %rdx, %rsi + shlq $6, %rsi + leaq (%rsi,%rsi,2), %r14 + vbroadcastf64x4 (%r15,%r14), %zmm18 # zmm18 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 64(%r15,%r14), %zmm19 # zmm19 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 128(%r15,%r14), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3] + addl %edx, %edx + incl %edx + cmpq %rdx, %r13 + sete %dl + movl $0, %edi + movl $225, %eax + cmovel %eax, %edi + movl $0, %esi + movl $129, %eax + cmovel %eax, %esi + xorl %eax, %eax + cmpq %r10, %r13 + setne %al + sete %bl + notb %bl + movl $255, %ecx + cmovel %ebp, %ecx + addb %dil, %bl + addb %cl, %sil + leal (%rax,%rax,2), %ecx + addl $12, %ecx + addb %dl, %cl + subb %al, %dl + shlb $4, %cl + orb %bl, %cl + kmovd %ecx, %k2 + shlb $4, %dl + orb %sil, %dl + kmovd %edx, %k3 + vsubpd %zmm18, %zmm8, %zmm21 + vsubpd %zmm19, %zmm12, %zmm22 + vsubpd %zmm20, %zmm16, %zmm23 + vsubpd %zmm18, %zmm11, %zmm18 + vsubpd %zmm19, %zmm14, %zmm19 + vsubpd %zmm20, %zmm17, %zmm20 + vmulpd %zmm23, %zmm23, %zmm24 + vfmadd231pd %zmm22, %zmm22, %zmm24 # zmm24 = (zmm22 * zmm22) + zmm24 + vfmadd231pd %zmm21, %zmm21, %zmm24 # zmm24 = (zmm21 * zmm21) + zmm24 + vmulpd %zmm20, %zmm20, %zmm25 + vrcp14pd %zmm24, %zmm26 + vfmadd231pd %zmm19, %zmm19, %zmm25 # zmm25 = (zmm19 * zmm19) + zmm25 + vfmadd231pd %zmm18, %zmm18, %zmm25 # zmm25 = (zmm18 * zmm18) + zmm25 + vrcp14pd %zmm25, %zmm27 + vmulpd %zmm26, %zmm31, %zmm28 + vmulpd %zmm28, %zmm26, %zmm28 + vmulpd %zmm28, %zmm26, %zmm28 + vmulpd %zmm27, %zmm31, %zmm29 + vmulpd %zmm29, %zmm27, %zmm29 + vmulpd %zmm29, %zmm27, %zmm29 + vaddpd %zmm1, %zmm28, %zmm30 + vmulpd %zmm26, %zmm4, %zmm26 + vmulpd %zmm30, %zmm26, %zmm26 + vmulpd %zmm26, %zmm28, %zmm26 + vmulpd %zmm2, %zmm26, %zmm26 + vaddpd %zmm1, %zmm29, %zmm28 + vmulpd %zmm27, %zmm4, %zmm27 + vmulpd %zmm28, %zmm27, %zmm27 + vmulpd %zmm27, %zmm29, %zmm27 + vmulpd %zmm2, %zmm27, %zmm27 + vcmpltpd %zmm0, %zmm24, %k2 {%k2} + vmulpd %zmm26, %zmm21, %zmm21 {%k2} {z} + vmulpd %zmm26, %zmm22, %zmm22 {%k2} {z} + vmulpd %zmm26, %zmm23, %zmm23 {%k2} {z} + vcmpltpd %zmm0, %zmm25, %k2 {%k3} + vmulpd %zmm27, %zmm18, %zmm18 {%k2} {z} + vmulpd %zmm27, %zmm19, %zmm19 {%k2} {z} + vmulpd %zmm27, %zmm20, %zmm20 {%k2} {z} + vaddpd %zmm21, %zmm18, %zmm24 + vaddpd %zmm22, %zmm19, %zmm25 + vextractf64x4 $1, %zmm24, %ymm26 + vaddpd %ymm26, %ymm24, %ymm24 + vmovupd (%r8,%r14), %ymm26 # AlignMOV convert to UnAlignMOV + vsubpd %ymm24, %ymm26, %ymm24 + vmovupd 64(%r8,%r14), %ymm26 # AlignMOV convert to UnAlignMOV + vmovupd 128(%r8,%r14), %ymm27 # AlignMOV convert to UnAlignMOV + vmovupd %ymm24, (%r8,%r14) # AlignMOV convert to UnAlignMOV + vaddpd %zmm23, %zmm20, %zmm24 + vextractf64x4 $1, %zmm25, %ymm28 + vaddpd %ymm28, %ymm25, %ymm25 + vsubpd %ymm25, %ymm26, %ymm25 + vmovupd %ymm25, 64(%r8,%r14) # AlignMOV convert to UnAlignMOV + vextractf64x4 $1, %zmm24, %ymm25 + vaddpd %ymm25, %ymm24, %ymm24 + vsubpd %ymm24, %ymm27, %ymm24 + vmovupd %ymm24, 128(%r8,%r14) # AlignMOV convert to UnAlignMOV + vaddpd %zmm21, %zmm13, %zmm13 + vaddpd %zmm22, %zmm9, %zmm9 + vaddpd %zmm23, %zmm7, %zmm7 + vaddpd %zmm18, %zmm15, %zmm15 + vaddpd %zmm19, %zmm10, %zmm10 + vaddpd %zmm20, %zmm6, %zmm6 + cmpq %r9, %r12 + je .LBB1_13 +# %bb.14: # in Loop: Header=BB1_12 Depth=2 + movq 8(%rsp), %rax # 8-byte Reload + movq 160(%rax), %r15 + movq 176(%rax), %r8 + incq %r9 + jmp .LBB1_12 + .p2align 5, 0x90 +.LBB1_8: # in Loop: Header=BB1_7 Depth=1 + vxorpd %xmm6, %xmm6, %xmm6 + movq %rdx, %rax + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm9, %xmm9, %xmm9 + vxorpd %xmm13, %xmm13, %xmm13 + jmp .LBB1_9 + .p2align 5, 0x90 +.LBB1_10: + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 64(%rsp) # 8-byte Spill + movl $.L.str.4, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 64(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 16(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $200, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end1: + .size computeForceLJ_2xnn_half, .Lfunc_end1-computeForceLJ_2xnn_half + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_2xnn_full +.LCPI2_0: + .quad 0xbfe0000000000000 # -0.5 +.LCPI2_1: + .quad 0x4048000000000000 # 48 + .section .rodata,"a",@progbits + .p2align 6 +.LCPI2_2: + .quad 2 # 0x2 + .quad 3 # 0x3 + .quad 8 # 0x8 + .quad 9 # 0x9 + .quad 6 # 0x6 + .quad 7 # 0x7 + .quad 12 # 0xc + .quad 13 # 0xd + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_3: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_2xnn_full + .p2align 4, 0x90 + .type computeForceLJ_2xnn_full,@function +computeForceLJ_2xnn_full: # + .cfi_startproc +# %bb.0: + pushq %r15 + .cfi_def_cfa_offset 16 + pushq %r14 + .cfi_def_cfa_offset 24 + pushq %r13 + .cfi_def_cfa_offset 32 + pushq %r12 + .cfi_def_cfa_offset 40 + pushq %rbx + .cfi_def_cfa_offset 48 + subq $144, %rsp + .cfi_def_cfa_offset 192 + .cfi_offset %rbx, -48 + .cfi_offset %r12, -40 + .cfi_offset %r13, -32 + .cfi_offset %r14, -24 + .cfi_offset %r15, -16 + movq %rcx, %r13 + movq %rdx, %r14 + movq %rsi, %r15 + movq %rdi, %r12 + movl $.L.str.3, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, (%rsp) # 8-byte Spill + vbroadcastsd 56(%r12), %zmm1 + vbroadcastsd 40(%r12), %zmm2 + movl 20(%r15), %r11d + testl %r11d, %r11d + jle .LBB2_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 128(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + jmp .LBB2_2 + .p2align 5, 0x90 +.LBB2_19: # in Loop: Header=BB2_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB2_5 +.LBB2_2: # =>This Loop Header: Depth=1 + # Child Loop BB2_14 Depth 2 + # Child Loop BB2_18 Depth 2 + imulq $56, %rdi, %rax + movl (%r10,%rax), %ecx + testl %ecx, %ecx + jle .LBB2_19 +# %bb.3: # in Loop: Header=BB2_2 Depth=1 + leal (,%rdi,4), %esi + movl %esi, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %esi + orl %eax, %esi + cmpl $7, %ecx + ja .LBB2_13 +# %bb.4: # in Loop: Header=BB2_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + cmpq %rcx, %rbx + jae .LBB2_19 + jmp .LBB2_17 + .p2align 5, 0x90 +.LBB2_13: # in Loop: Header=BB2_2 Depth=1 + leaq (,%rcx,8), %rbx + andq $-64, %rbx + movl %esi, %r12d + leaq (%r9,%r12,8), %rax + xorl %edx, %edx + .p2align 4, 0x90 +.LBB2_14: # Parent Loop BB2_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %zmm0, (%rax,%rdx) + addq $64, %rdx + cmpq %rdx, %rbx + jne .LBB2_14 +# %bb.15: # in Loop: Header=BB2_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + addq %rbx, %r12 + vmovupd %zmm0, (%r9,%r12,8) + vmovupd %zmm0, 64(%r9,%r12,8) + cmpq %rcx, %rbx + jae .LBB2_19 +.LBB2_17: # in Loop: Header=BB2_2 Depth=1 + movl %esi, %eax + leaq (%r8,%rax,8), %rdx + .p2align 4, 0x90 +.LBB2_18: # Parent Loop BB2_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, -128(%rdx,%rbx,8) + movq $0, -64(%rdx,%rbx,8) + movq $0, (%rdx,%rbx,8) + incq %rbx + cmpq %rbx, %rcx + jne .LBB2_18 + jmp .LBB2_19 + .p2align 5, 0x90 +.LBB2_5: + xorl %r12d, %r12d + xorl %eax, %eax + vmovupd %zmm1, 16(%rsp) # 64-byte Spill + vmovupd %zmm2, 80(%rsp) # 64-byte Spill + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + vmovupd 80(%rsp), %zmm28 # 64-byte Reload + vmovupd 16(%rsp), %zmm27 # 64-byte Reload + cmpl $0, 20(%r15) + jle .LBB2_10 +# %bb.6: + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm0 + vbroadcastsd %xmm0, %zmm0 + movl $48, %r8d + vbroadcastsd .LCPI2_0(%rip), %zmm1 # zmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastsd .LCPI2_1(%rip), %zmm2 # zmm2 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + movb $-52, %al + kmovb %eax, %k1 + vmovupd .LCPI2_2(%rip), %zmm3 # zmm3 = [2,3,8,9,6,7,12,13] + # AlignMOV convert to UnAlignMOV + vmovdqu .LCPI2_3(%rip), %xmm4 # xmm4 = <1,u> + # AlignMOV convert to UnAlignMOV + jmp .LBB2_7 + .p2align 5, 0x90 +.LBB2_8: # in Loop: Header=BB2_7 Depth=1 + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm8, %xmm8, %xmm8 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm6, %xmm6, %xmm6 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm11, %xmm11, %xmm11 +.LBB2_9: # in Loop: Header=BB2_7 Depth=1 + movq 176(%r15), %rax + vblendmpd %zmm13, %zmm11, %zmm9 {%k1} + vpermt2pd %zmm13, %zmm3, %zmm11 + vaddpd %zmm11, %zmm9, %zmm9 + vpermilpd $85, %zmm9, %zmm10 # zmm10 = zmm9[1,0,3,2,5,4,7,6] + vaddpd %zmm10, %zmm9, %zmm9 + vextractf64x4 $1, %zmm9, %ymm10 + vblendpd $10, %ymm10, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm10[1],ymm9[2],ymm10[3] + vaddpd (%rax,%r9,8), %ymm9, %ymm9 + vmovupd %ymm9, (%rax,%r9,8) # AlignMOV convert to UnAlignMOV + vblendmpd %zmm8, %zmm7, %zmm9 {%k1} + vpermt2pd %zmm8, %zmm3, %zmm7 + vaddpd %zmm7, %zmm9, %zmm7 + vpermilpd $85, %zmm7, %zmm8 # zmm8 = zmm7[1,0,3,2,5,4,7,6] + vaddpd %zmm8, %zmm7, %zmm7 + vextractf64x4 $1, %zmm7, %ymm8 + vblendpd $10, %ymm8, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm8[1],ymm7[2],ymm8[3] + vaddpd 64(%rax,%r9,8), %ymm7, %ymm7 + vmovupd %ymm7, 64(%rax,%r9,8) # AlignMOV convert to UnAlignMOV + vblendmpd %zmm5, %zmm6, %zmm7 {%k1} + vpermt2pd %zmm5, %zmm3, %zmm6 + vaddpd %zmm6, %zmm7, %zmm5 + vpermilpd $85, %zmm5, %zmm6 # zmm6 = zmm5[1,0,3,2,5,4,7,6] + vaddpd %zmm6, %zmm5, %zmm5 + vextractf64x4 $1, %zmm5, %ymm6 + vblendpd $10, %ymm6, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3] + vaddpd 128(%rax,%r9,8), %ymm5, %ymm5 + vmovupd %ymm5, 128(%rax,%r9,8) # AlignMOV convert to UnAlignMOV + vpinsrq $1, %r10, %xmm4, %xmm5 + vpaddq (%r13), %xmm5, %xmm5 + vmovdqu %xmm5, (%r13) + addq %r10, 16(%r13) + incq %r12 + movslq 20(%r15), %rax + cmpq %rax, %r12 + jge .LBB2_10 +.LBB2_7: # =>This Loop Header: Depth=1 + # Child Loop BB2_12 Depth 2 + leal (,%r12,4), %r9d + movl %r9d, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %r9d + orl %eax, %r9d + movq 24(%r14), %rax + movslq (%rax,%r12,4), %r10 + testq %r10, %r10 + jle .LBB2_8 +# %bb.11: # in Loop: Header=BB2_7 Depth=1 + movq 160(%r15), %rsi + movq 8(%r14), %rax + vbroadcastsd (%rsi,%r9,8), %ymm5 + vbroadcastsd 8(%rsi,%r9,8), %ymm6 + vbroadcastsd 16(%rsi,%r9,8), %ymm7 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm9 + vbroadcastsd 24(%rsi,%r9,8), %ymm5 + vinsertf64x4 $1, %ymm5, %zmm7, %zmm10 + vbroadcastsd 64(%rsi,%r9,8), %ymm5 + vbroadcastsd 72(%rsi,%r9,8), %ymm6 + vbroadcastsd 80(%rsi,%r9,8), %ymm7 + vbroadcastsd 88(%rsi,%r9,8), %ymm8 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm12 + vinsertf64x4 $1, %ymm8, %zmm7, %zmm14 + vbroadcastsd 128(%rsi,%r9,8), %ymm5 + vbroadcastsd 136(%rsi,%r9,8), %ymm6 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm15 + vbroadcastsd 144(%rsi,%r9,8), %ymm5 + vbroadcastsd 152(%rsi,%r9,8), %ymm6 + vinsertf64x4 $1, %ymm6, %zmm5, %zmm16 + movl %r10d, %r10d + movl 16(%r14), %ecx + imull %r12d, %ecx + movslq %ecx, %rcx + leaq (%rax,%rcx,4), %rdi + vxorpd %xmm11, %xmm11, %xmm11 + xorl %eax, %eax + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm6, %xmm6, %xmm6 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm8, %xmm8, %xmm8 + vxorpd %xmm5, %xmm5, %xmm5 + .p2align 4, 0x90 +.LBB2_12: # Parent Loop BB2_7 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%rdi,%rax,4), %rcx + leaq (%rcx,%rcx,2), %rdx + shlq $6, %rdx + vbroadcastf64x4 64(%rsi,%rdx), %zmm21 # zmm21 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 128(%rsi,%rdx), %zmm22 # zmm22 = mem[0,1,2,3,0,1,2,3] + vbroadcastf64x4 (%rsi,%rdx), %zmm20 # zmm20 = mem[0,1,2,3,0,1,2,3] + vsubpd %zmm20, %zmm9, %zmm19 + vsubpd %zmm21, %zmm12, %zmm17 + vsubpd %zmm22, %zmm15, %zmm18 + vmulpd %zmm18, %zmm18, %zmm23 + vfmadd231pd %zmm17, %zmm17, %zmm23 # zmm23 = (zmm17 * zmm17) + zmm23 + vfmadd231pd %zmm19, %zmm19, %zmm23 # zmm23 = (zmm19 * zmm19) + zmm23 + vrcp14pd %zmm23, %zmm24 + vmulpd %zmm24, %zmm27, %zmm25 + vmulpd %zmm25, %zmm24, %zmm25 + vmulpd %zmm25, %zmm24, %zmm25 + vaddpd %zmm1, %zmm25, %zmm26 + vmulpd %zmm24, %zmm28, %zmm24 + vmulpd %zmm26, %zmm24, %zmm24 + vmulpd %zmm24, %zmm25, %zmm24 + vsubpd %zmm21, %zmm14, %zmm21 + vsubpd %zmm22, %zmm16, %zmm22 + leal (%rcx,%rcx), %edx + cmpq %rdx, %r12 + setne %dl + leal 1(%rcx,%rcx), %ecx + sete %bl + cmpq %rcx, %r12 + movl $0, %ecx + cmovel %r8d, %ecx + notb %bl + addb %cl, %bl + movl %edx, %ecx + vsubpd %zmm20, %zmm10, %zmm20 + shlb $5, %cl + orb %bl, %cl + orb $-48, %cl + kmovd %ecx, %k2 + vcmpltpd %zmm0, %zmm23, %k2 {%k2} + vmulpd %zmm22, %zmm22, %zmm23 + vfmadd231pd %zmm21, %zmm21, %zmm23 # zmm23 = (zmm21 * zmm21) + zmm23 + vfmadd231pd %zmm20, %zmm20, %zmm23 # zmm23 = (zmm20 * zmm20) + zmm23 + vmulpd %zmm2, %zmm24, %zmm24 + vfmadd231pd %zmm24, %zmm19, %zmm11 {%k2} # zmm11 {%k2} = (zmm19 * zmm24) + zmm11 + vrcp14pd %zmm23, %zmm19 + vfmadd231pd %zmm24, %zmm17, %zmm7 {%k2} # zmm7 {%k2} = (zmm17 * zmm24) + zmm7 + vfmadd231pd %zmm24, %zmm18, %zmm6 {%k2} # zmm6 {%k2} = (zmm18 * zmm24) + zmm6 + vmulpd %zmm19, %zmm27, %zmm17 + vmulpd %zmm17, %zmm19, %zmm17 + vmulpd %zmm17, %zmm19, %zmm17 + vaddpd %zmm1, %zmm17, %zmm18 + vmulpd %zmm19, %zmm28, %zmm19 + vmulpd %zmm18, %zmm19, %zmm18 + vmulpd %zmm18, %zmm17, %zmm17 + shlb $2, %dl + orb $-5, %dl + kmovd %edx, %k2 + vcmpltpd %zmm0, %zmm23, %k2 {%k2} + vmulpd %zmm2, %zmm17, %zmm17 + vfmadd231pd %zmm17, %zmm20, %zmm13 {%k2} # zmm13 {%k2} = (zmm20 * zmm17) + zmm13 + vfmadd231pd %zmm17, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm17) + zmm8 + vfmadd231pd %zmm17, %zmm22, %zmm5 {%k2} # zmm5 {%k2} = (zmm22 * zmm17) + zmm5 + incq %rax + cmpq %rax, %r10 + jne .LBB2_12 + jmp .LBB2_9 + .p2align 5, 0x90 +.LBB2_10: + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + movl $.L.str.4, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 16(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 8(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $144, %rsp + .cfi_def_cfa_offset 48 + popq %rbx + .cfi_def_cfa_offset 40 + popq %r12 + .cfi_def_cfa_offset 32 + popq %r13 + .cfi_def_cfa_offset 24 + popq %r14 + .cfi_def_cfa_offset 16 + popq %r15 + .cfi_def_cfa_offset 8 + retq +.Lfunc_end2: + .size computeForceLJ_2xnn_full, .Lfunc_end2-computeForceLJ_2xnn_full + .cfi_endproc + # -- End function + .globl computeForceLJ_2xnn # -- Begin function computeForceLJ_2xnn + .p2align 4, 0x90 + .type computeForceLJ_2xnn,@function +computeForceLJ_2xnn: # + .cfi_startproc +# %bb.0: + cmpl $0, 32(%rdx) + je .LBB3_2 +# %bb.1: + jmp computeForceLJ_2xnn_half # TAILCALL + .p2align 5, 0x90 +.LBB3_2: + jmp computeForceLJ_2xnn_full # TAILCALL +.Lfunc_end3: + .size computeForceLJ_2xnn, .Lfunc_end3-computeForceLJ_2xnn + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_4xn_half +.LCPI4_0: + .quad 0xbfe0000000000000 # -0.5 +.LCPI4_1: + .quad 0x4048000000000000 # 48 +.LCPI4_3: + .quad 0x3fe0000000000000 # 0.5 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI4_2: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_4xn_half + .p2align 4, 0x90 + .type computeForceLJ_4xn_half,@function +computeForceLJ_4xn_half: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $1352, %rsp # imm = 0x548 + .cfi_def_cfa_offset 1408 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 24(%rsp) # 8-byte Spill + movq %rdx, %r14 + movq %rsi, %rbx + movq %rdi, %r12 + movl $.L.str.5, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 64(%rsp) # 8-byte Spill + vbroadcastsd 56(%r12), %zmm0 + vmovups %zmm0, 512(%rsp) # 64-byte Spill + vbroadcastsd 40(%r12), %zmm0 + vmovupd %zmm0, 448(%rsp) # 64-byte Spill + movq %rbx, %r15 + movl 20(%rbx), %r11d + testl %r11d, %r11d + jle .LBB4_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 128(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + jmp .LBB4_2 + .p2align 5, 0x90 +.LBB4_21: # in Loop: Header=BB4_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB4_5 +.LBB4_2: # =>This Loop Header: Depth=1 + # Child Loop BB4_16 Depth 2 + # Child Loop BB4_20 Depth 2 + imulq $56, %rdi, %rax + movl (%r10,%rax), %esi + testl %esi, %esi + jle .LBB4_21 +# %bb.3: # in Loop: Header=BB4_2 Depth=1 + leal (,%rdi,4), %ebx + movl %ebx, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %ebx + orl %eax, %ebx + cmpl $7, %esi + ja .LBB4_15 +# %bb.4: # in Loop: Header=BB4_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + cmpq %rsi, %rbp + jae .LBB4_21 + jmp .LBB4_19 + .p2align 5, 0x90 +.LBB4_15: # in Loop: Header=BB4_2 Depth=1 + leaq (,%rsi,8), %rbp + andq $-64, %rbp + movl %ebx, %ecx + leaq (%r9,%rcx,8), %rdx + xorl %eax, %eax + .p2align 4, 0x90 +.LBB4_16: # Parent Loop BB4_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %zmm0, (%rdx,%rax) + addq $64, %rax + cmpq %rax, %rbp + jne .LBB4_16 +# %bb.17: # in Loop: Header=BB4_2 Depth=1 + movl %esi, %ebp + andl $-8, %ebp + addq %rbp, %rcx + vmovupd %zmm0, (%r9,%rcx,8) + vmovupd %zmm0, 64(%r9,%rcx,8) + cmpq %rsi, %rbp + jae .LBB4_21 +.LBB4_19: # in Loop: Header=BB4_2 Depth=1 + movl %ebx, %eax + leaq (%r8,%rax,8), %rcx + .p2align 4, 0x90 +.LBB4_20: # Parent Loop BB4_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, -128(%rcx,%rbp,8) + movq $0, -64(%rcx,%rbp,8) + movq $0, (%rcx,%rbp,8) + incq %rbp + cmpq %rbp, %rsi + jne .LBB4_20 + jmp .LBB4_21 + .p2align 5, 0x90 +.LBB4_5: + xorl %r13d, %r13d + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%r15) + jle .LBB4_10 +# %bb.6: + vmovsd 64(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm0 + vbroadcastsd %xmm0, %zmm0 + vmovupd %zmm0, 384(%rsp) # 64-byte Spill + movq %r14, 16(%rsp) # 8-byte Spill + movq %r15, (%rsp) # 8-byte Spill + vmovupd 512(%rsp), %zmm29 # 64-byte Reload + vbroadcastsd .LCPI4_0(%rip), %zmm31 # zmm31 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vmovupd 448(%rsp), %zmm30 # 64-byte Reload + jmp .LBB4_7 + .p2align 5, 0x90 +.LBB4_13: # in Loop: Header=BB4_7 Depth=1 + movq 16(%rsp), %r14 # 8-byte Reload + movq (%rsp), %r15 # 8-byte Reload + movq 56(%rsp), %rax # 8-byte Reload + movq 48(%rsp), %rdx # 8-byte Reload + movq 40(%rsp), %r10 # 8-byte Reload + movq 32(%rsp), %rcx # 8-byte Reload +.LBB4_9: # in Loop: Header=BB4_7 Depth=1 + vshufpd $85, %zmm16, %zmm13, %zmm3 # zmm3 = zmm13[1],zmm16[0],zmm13[3],zmm16[2],zmm13[5],zmm16[4],zmm13[7],zmm16[6] + vshufpd $170, %zmm16, %zmm13, %zmm4 # zmm4 = zmm13[0],zmm16[1],zmm13[2],zmm16[3],zmm13[4],zmm16[5],zmm13[6],zmm16[7] + vaddpd %zmm3, %zmm4, %zmm3 + vshufpd $85, %zmm15, %zmm14, %zmm4 # zmm4 = zmm14[1],zmm15[0],zmm14[3],zmm15[2],zmm14[5],zmm15[4],zmm14[7],zmm15[6] + vshufpd $170, %zmm15, %zmm14, %zmm13 # zmm13 = zmm14[0],zmm15[1],zmm14[2],zmm15[3],zmm14[4],zmm15[5],zmm14[6],zmm15[7] + vaddpd %zmm4, %zmm13, %zmm4 + vshuff64x2 $78, %zmm4, %zmm3, %zmm13 # zmm13 = zmm3[4,5,6,7],zmm4[0,1,2,3] + vshuff64x2 $228, %zmm4, %zmm3, %zmm3 # zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] + vaddpd %zmm13, %zmm3, %zmm3 + vpermpd $78, %zmm3, %zmm4 # zmm4 = zmm3[2,3,0,1,6,7,4,5] + vaddpd %zmm4, %zmm3, %zmm3 + vextractf64x4 $1, %zmm3, %ymm4 + vblendpd $12, %ymm4, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm4[2,3] + vaddpd (%r10,%rax,8), %ymm3, %ymm3 + vmovupd %ymm3, (%r10,%rax,8) # AlignMOV convert to UnAlignMOV + vshufpd $85, %zmm12, %zmm11, %zmm3 # zmm3 = zmm11[1],zmm12[0],zmm11[3],zmm12[2],zmm11[5],zmm12[4],zmm11[7],zmm12[6] + vshufpd $170, %zmm12, %zmm11, %zmm4 # zmm4 = zmm11[0],zmm12[1],zmm11[2],zmm12[3],zmm11[4],zmm12[5],zmm11[6],zmm12[7] + vaddpd %zmm3, %zmm4, %zmm3 + vshufpd $85, %zmm9, %zmm23, %zmm4 # zmm4 = zmm23[1],zmm9[0],zmm23[3],zmm9[2],zmm23[5],zmm9[4],zmm23[7],zmm9[6] + vshufpd $170, %zmm9, %zmm23, %zmm9 # zmm9 = zmm23[0],zmm9[1],zmm23[2],zmm9[3],zmm23[4],zmm9[5],zmm23[6],zmm9[7] + vaddpd %zmm4, %zmm9, %zmm4 + vshuff64x2 $78, %zmm4, %zmm3, %zmm9 # zmm9 = zmm3[4,5,6,7],zmm4[0,1,2,3] + vshuff64x2 $228, %zmm4, %zmm3, %zmm3 # zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] + vaddpd %zmm9, %zmm3, %zmm3 + vpermpd $78, %zmm3, %zmm4 # zmm4 = zmm3[2,3,0,1,6,7,4,5] + vaddpd %zmm4, %zmm3, %zmm3 + vextractf64x4 $1, %zmm3, %ymm4 + vblendpd $12, %ymm4, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm4[2,3] + vaddpd 64(%r10,%rax,8), %ymm3, %ymm3 + vmovupd %ymm3, 64(%r10,%rax,8) # AlignMOV convert to UnAlignMOV + vshufpd $85, %zmm8, %zmm7, %zmm3 # zmm3 = zmm7[1],zmm8[0],zmm7[3],zmm8[2],zmm7[5],zmm8[4],zmm7[7],zmm8[6] + vshufpd $170, %zmm8, %zmm7, %zmm4 # zmm4 = zmm7[0],zmm8[1],zmm7[2],zmm8[3],zmm7[4],zmm8[5],zmm7[6],zmm8[7] + vaddpd %zmm3, %zmm4, %zmm3 + vshufpd $85, %zmm5, %zmm6, %zmm4 # zmm4 = zmm6[1],zmm5[0],zmm6[3],zmm5[2],zmm6[5],zmm5[4],zmm6[7],zmm5[6] + vshufpd $170, %zmm5, %zmm6, %zmm5 # zmm5 = zmm6[0],zmm5[1],zmm6[2],zmm5[3],zmm6[4],zmm5[5],zmm6[6],zmm5[7] + vaddpd %zmm4, %zmm5, %zmm4 + vshuff64x2 $78, %zmm4, %zmm3, %zmm5 # zmm5 = zmm3[4,5,6,7],zmm4[0,1,2,3] + vshuff64x2 $228, %zmm4, %zmm3, %zmm3 # zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] + vaddpd %zmm5, %zmm3, %zmm3 + vpermpd $78, %zmm3, %zmm4 # zmm4 = zmm3[2,3,0,1,6,7,4,5] + vaddpd %zmm4, %zmm3, %zmm3 + vextractf64x4 $1, %zmm3, %ymm4 + vblendpd $12, %ymm4, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm4[2,3] + vaddpd 128(%r10,%rax,8), %ymm3, %ymm3 + vmovupd %ymm3, 128(%r10,%rax,8) # AlignMOV convert to UnAlignMOV + vmovdqu .LCPI4_2(%rip), %xmm0 # xmm0 = <1,u> + # AlignMOV convert to UnAlignMOV + vpinsrq $1, %rcx, %xmm0, %xmm3 + movq 24(%rsp), %rcx # 8-byte Reload + vpaddq (%rcx), %xmm3, %xmm3 + vmovdqu %xmm3, (%rcx) + vcvtsi2sd %edx, %xmm10, %xmm3 + vmulsd .LCPI4_3(%rip), %xmm3, %xmm3 + vcvttsd2si %xmm3, %rax + addq %rax, 16(%rcx) + incq %r13 + movslq 20(%r15), %rax + cmpq %rax, %r13 + jge .LBB4_10 +.LBB4_7: # =>This Loop Header: Depth=1 + # Child Loop BB4_12 Depth 2 + leal (,%r13,4), %eax + movl %eax, %ecx + andl $2147483640, %ecx # imm = 0x7FFFFFF8 + leal (%rcx,%rcx,2), %ecx + andl $4, %eax + orl %ecx, %eax + movq 176(%r15), %r10 + movq 24(%r14), %rcx + movslq (%rcx,%r13,4), %rdx + testq %rdx, %rdx + jle .LBB4_8 +# %bb.11: # in Loop: Header=BB4_7 Depth=1 + movq 160(%r15), %r15 + vbroadcastsd (%r15,%rax,8), %zmm0 + vmovups %zmm0, 1280(%rsp) # 64-byte Spill + vbroadcastsd 8(%r15,%rax,8), %zmm0 + vmovups %zmm0, 1216(%rsp) # 64-byte Spill + vbroadcastsd 16(%r15,%rax,8), %zmm0 + vmovups %zmm0, 1152(%rsp) # 64-byte Spill + vbroadcastsd 24(%r15,%rax,8), %zmm0 + vmovups %zmm0, 1088(%rsp) # 64-byte Spill + vbroadcastsd 64(%r15,%rax,8), %zmm0 + vmovups %zmm0, 1024(%rsp) # 64-byte Spill + vbroadcastsd 72(%r15,%rax,8), %zmm0 + vmovups %zmm0, 960(%rsp) # 64-byte Spill + vbroadcastsd 80(%r15,%rax,8), %zmm0 + vmovups %zmm0, 896(%rsp) # 64-byte Spill + vbroadcastsd 88(%r15,%rax,8), %zmm0 + vmovups %zmm0, 832(%rsp) # 64-byte Spill + vbroadcastsd 128(%r15,%rax,8), %zmm0 + vmovups %zmm0, 768(%rsp) # 64-byte Spill + vbroadcastsd 136(%r15,%rax,8), %zmm0 + vmovups %zmm0, 704(%rsp) # 64-byte Spill + movq 8(%r14), %rcx + vbroadcastsd 144(%r15,%rax,8), %zmm0 + vmovups %zmm0, 640(%rsp) # 64-byte Spill + movq %rax, 56(%rsp) # 8-byte Spill + vbroadcastsd 152(%r15,%rax,8), %zmm0 + vmovups %zmm0, 576(%rsp) # 64-byte Spill + movq %rdx, 48(%rsp) # 8-byte Spill + movl %edx, %eax + movl 16(%r14), %ebp + imull %r13d, %ebp + movslq %ebp, %rbp + leaq (%rcx,%rbp,4), %r8 + movq %rax, 32(%rsp) # 8-byte Spill + leaq -1(%rax), %r12 + vxorpd %xmm13, %xmm13, %xmm13 + movq %r10, 40(%rsp) # 8-byte Spill + xorl %r14d, %r14d + vxorpd %xmm11, %xmm11, %xmm11 + vxorps %xmm0, %xmm0, %xmm0 + vmovups %zmm0, 256(%rsp) # 64-byte Spill + vxorpd %xmm16, %xmm16, %xmm16 + vxorpd %xmm12, %xmm12, %xmm12 + vxorps %xmm0, %xmm0, %xmm0 + vmovups %zmm0, 192(%rsp) # 64-byte Spill + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm23, %xmm23, %xmm23 + vxorps %xmm0, %xmm0, %xmm0 + vmovups %zmm0, 320(%rsp) # 64-byte Spill + vxorpd %xmm15, %xmm15, %xmm15 + vxorps %xmm0, %xmm0, %xmm0 + vmovups %zmm0, 128(%rsp) # 64-byte Spill + vxorps %xmm0, %xmm0, %xmm0 + vmovups %zmm0, 64(%rsp) # 64-byte Spill + movl $129, %ecx + .p2align 4, 0x90 +.LBB4_12: # Parent Loop BB4_7 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%r8,%r14,4), %r9 + movq %r9, %rdx + shlq $6, %rdx + leaq (%rdx,%rdx,2), %r11 + vmovupd (%r15,%r11), %zmm17 # AlignMOV convert to UnAlignMOV + vmovupd 64(%r15,%r11), %zmm24 # AlignMOV convert to UnAlignMOV + vmovupd 128(%r15,%r11), %zmm25 # AlignMOV convert to UnAlignMOV + vmovupd 1280(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm17, %zmm0, %zmm20 + vmovupd 1024(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm24, %zmm0, %zmm22 + vmovupd 768(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm25, %zmm0, %zmm18 + vmovupd 1216(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm17, %zmm0, %zmm4 + vmovupd 960(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm24, %zmm0, %zmm10 + vmovupd 704(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm25, %zmm0, %zmm21 + vmovupd 1152(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm17, %zmm0, %zmm1 + vmovupd 896(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm24, %zmm0, %zmm8 + vmovupd 640(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm25, %zmm0, %zmm19 + vmovupd 1088(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm17, %zmm0, %zmm7 + vmovupd 832(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm24, %zmm0, %zmm9 + vmovupd 576(%rsp), %zmm0 # 64-byte Reload + vsubpd %zmm25, %zmm0, %zmm17 + vmulpd %zmm18, %zmm18, %zmm24 + vfmadd231pd %zmm22, %zmm22, %zmm24 # zmm24 = (zmm22 * zmm22) + zmm24 + vfmadd231pd %zmm20, %zmm20, %zmm24 # zmm24 = (zmm20 * zmm20) + zmm24 + vmulpd %zmm21, %zmm21, %zmm25 + vfmadd231pd %zmm10, %zmm10, %zmm25 # zmm25 = (zmm10 * zmm10) + zmm25 + vfmadd231pd %zmm4, %zmm4, %zmm25 # zmm25 = (zmm4 * zmm4) + zmm25 + vmovapd %zmm4, %zmm3 + vmulpd %zmm19, %zmm19, %zmm26 + vfmadd231pd %zmm8, %zmm8, %zmm26 # zmm26 = (zmm8 * zmm8) + zmm26 + vfmadd231pd %zmm1, %zmm1, %zmm26 # zmm26 = (zmm1 * zmm1) + zmm26 + vmovapd %zmm1, %zmm4 + vmulpd %zmm17, %zmm17, %zmm27 + vrcp14pd %zmm24, %zmm28 + vrcp14pd %zmm25, %zmm1 + vrcp14pd %zmm26, %zmm2 + vfmadd231pd %zmm9, %zmm9, %zmm27 # zmm27 = (zmm9 * zmm9) + zmm27 + vfmadd231pd %zmm7, %zmm7, %zmm27 # zmm27 = (zmm7 * zmm7) + zmm27 + vrcp14pd %zmm27, %zmm0 + vmulpd %zmm28, %zmm29, %zmm5 + vmulpd %zmm5, %zmm28, %zmm5 + vmulpd %zmm5, %zmm28, %zmm5 + vaddpd %zmm31, %zmm5, %zmm6 + vmulpd %zmm28, %zmm30, %zmm28 + vmulpd %zmm6, %zmm28, %zmm6 + vmulpd %zmm1, %zmm29, %zmm28 + vmulpd %zmm28, %zmm1, %zmm28 + vmulpd %zmm28, %zmm1, %zmm28 + vmulpd %zmm6, %zmm5, %zmm5 + vaddpd %zmm31, %zmm28, %zmm6 + vmulpd %zmm1, %zmm30, %zmm1 + vmulpd %zmm6, %zmm1, %zmm1 + vmulpd %zmm2, %zmm29, %zmm6 + vmulpd %zmm6, %zmm2, %zmm6 + vmulpd %zmm6, %zmm2, %zmm6 + vmulpd %zmm1, %zmm28, %zmm1 + vaddpd %zmm31, %zmm6, %zmm28 + vmulpd %zmm2, %zmm30, %zmm2 + vmulpd %zmm28, %zmm2, %zmm2 + vmulpd %zmm0, %zmm29, %zmm28 + vmulpd %zmm28, %zmm0, %zmm28 + vmulpd %zmm28, %zmm0, %zmm28 + vmulpd %zmm2, %zmm6, %zmm2 + vaddpd %zmm31, %zmm28, %zmm6 + vmulpd %zmm0, %zmm30, %zmm0 + vmulpd %zmm6, %zmm0, %zmm0 + vmulpd %zmm0, %zmm28, %zmm0 + leal (%r9,%r9), %edx + leal (%r9,%r9), %esi + incl %esi + xorl %edi, %edi + cmpq %rdx, %r13 + sete %al + setne %dil + movl $255, %r15d + movl $248, %edx + cmovel %edx, %r15d + movl $255, %ebp + movl $240, %edx + cmovel %edx, %ebp + cmpq %rsi, %r13 + sete %r9b + movl $0, %esi + movl $225, %edx + cmovel %edx, %esi + notb %al + movl $0, %ebx + movl $193, %edx + cmovel %edx, %ebx + movl $0, %edx + cmovel %ecx, %edx + addb %sil, %al + kmovd %eax, %k1 + vmovupd 384(%rsp), %zmm28 # 64-byte Reload + vcmpltpd %zmm28, %zmm24, %k1 {%k1} + vbroadcastsd .LCPI4_1(%rip), %zmm24 # zmm24 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + vmulpd %zmm24, %zmm5, %zmm5 + vmulpd %zmm5, %zmm20, %zmm6 {%k1} {z} + vmulpd %zmm5, %zmm22, %zmm20 {%k1} {z} + vmulpd %zmm5, %zmm18, %zmm5 {%k1} {z} + leal (%rdi,%rdi,2), %eax + addl $252, %eax + addb %al, %bl + kmovd %ebx, %k1 + vcmpltpd %zmm28, %zmm25, %k1 {%k1} + vmulpd %zmm24, %zmm1, %zmm1 + vmulpd %zmm1, %zmm3, %zmm3 {%k1} {z} + vmulpd %zmm1, %zmm10, %zmm18 {%k1} {z} + vmulpd %zmm1, %zmm21, %zmm1 {%k1} {z} + addb %r15b, %dl + kmovd %edx, %k1 + vcmpltpd %zmm28, %zmm26, %k1 {%k1} + vmulpd %zmm24, %zmm2, %zmm2 + vmulpd %zmm2, %zmm4, %zmm21 {%k1} {z} + vmulpd %zmm2, %zmm8, %zmm4 {%k1} {z} + vmulpd %zmm2, %zmm19, %zmm2 {%k1} {z} + addb %r9b, %bpl + kmovd %ebp, %k1 + vcmpltpd %zmm28, %zmm27, %k1 {%k1} + vmulpd %zmm24, %zmm0, %zmm0 + vmulpd %zmm0, %zmm7, %zmm19 {%k1} {z} + vmulpd %zmm0, %zmm9, %zmm22 {%k1} {z} + vmulpd %zmm0, %zmm17, %zmm0 {%k1} {z} + vaddpd %zmm6, %zmm13, %zmm13 + vaddpd %zmm3, %zmm16, %zmm16 + vaddpd %zmm21, %zmm14, %zmm14 + vaddpd %zmm19, %zmm15, %zmm15 + vaddpd %zmm3, %zmm6, %zmm3 + vaddpd %zmm19, %zmm21, %zmm6 + vaddpd %zmm6, %zmm3, %zmm3 + vmovupd (%r10,%r11), %zmm6 # AlignMOV convert to UnAlignMOV + vmovupd 64(%r10,%r11), %zmm17 # AlignMOV convert to UnAlignMOV + vmovupd 128(%r10,%r11), %zmm19 # AlignMOV convert to UnAlignMOV + vsubpd %zmm3, %zmm6, %zmm3 + vmovupd %zmm3, (%r10,%r11) # AlignMOV convert to UnAlignMOV + vaddpd %zmm20, %zmm11, %zmm11 + vaddpd %zmm18, %zmm12, %zmm12 + vaddpd %zmm18, %zmm20, %zmm3 + vaddpd %zmm4, %zmm23, %zmm23 + vaddpd %zmm22, %zmm4, %zmm4 + vaddpd %zmm4, %zmm3, %zmm3 + vsubpd %zmm3, %zmm17, %zmm3 + vmovupd %zmm3, 64(%r10,%r11) # AlignMOV convert to UnAlignMOV + vmovupd 128(%rsp), %zmm9 # 64-byte Reload + vaddpd %zmm22, %zmm9, %zmm9 + vmovupd 256(%rsp), %zmm7 # 64-byte Reload + vaddpd %zmm5, %zmm7, %zmm7 + vmovupd 192(%rsp), %zmm8 # 64-byte Reload + vaddpd %zmm1, %zmm8, %zmm8 + vaddpd %zmm1, %zmm5, %zmm1 + vmovupd 320(%rsp), %zmm6 # 64-byte Reload + vaddpd %zmm2, %zmm6, %zmm6 + vmovupd 64(%rsp), %zmm5 # 64-byte Reload + vaddpd %zmm0, %zmm5, %zmm5 + vaddpd %zmm0, %zmm2, %zmm0 + vaddpd %zmm0, %zmm1, %zmm0 + vsubpd %zmm0, %zmm19, %zmm0 + vmovupd %zmm0, 128(%r10,%r11) # AlignMOV convert to UnAlignMOV + cmpq %r14, %r12 + je .LBB4_13 +# %bb.14: # in Loop: Header=BB4_12 Depth=2 + vmovupd %zmm9, 128(%rsp) # 64-byte Spill + vmovupd %zmm8, 192(%rsp) # 64-byte Spill + vmovupd %zmm7, 256(%rsp) # 64-byte Spill + vmovupd %zmm6, 320(%rsp) # 64-byte Spill + vmovupd %zmm5, 64(%rsp) # 64-byte Spill + movq (%rsp), %rax # 8-byte Reload + movq 160(%rax), %r15 + movq 176(%rax), %r10 + incq %r14 + jmp .LBB4_12 + .p2align 5, 0x90 +.LBB4_8: # in Loop: Header=BB4_7 Depth=1 + vxorpd %xmm5, %xmm5, %xmm5 + movq %rdx, %rcx + vxorpd %xmm9, %xmm9, %xmm9 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm6, %xmm6, %xmm6 + vxorpd %xmm23, %xmm23, %xmm23 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm8, %xmm8, %xmm8 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm16, %xmm16, %xmm16 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm13, %xmm13, %xmm13 + jmp .LBB4_9 + .p2align 5, 0x90 +.LBB4_10: + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 64(%rsp) # 8-byte Spill + movl $.L.str.6, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 64(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 8(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $1352, %rsp # imm = 0x548 + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end4: + .size computeForceLJ_4xn_half, .Lfunc_end4-computeForceLJ_4xn_half + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJ_4xn_full +.LCPI5_0: + .quad 0xbfe0000000000000 # -0.5 +.LCPI5_1: + .quad 0x4048000000000000 # 48 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI5_2: + .quad 1 # 0x1 + .zero 8 + .text + .globl computeForceLJ_4xn_full + .p2align 4, 0x90 + .type computeForceLJ_4xn_full,@function +computeForceLJ_4xn_full: # + .cfi_startproc +# %bb.0: + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $600, %rsp # imm = 0x258 + .cfi_def_cfa_offset 656 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, %r13 + movq %rdx, %r14 + movq %rsi, %r15 + movq %rdi, %r12 + movl $.L.str.5, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 144(%r12), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + vbroadcastsd 56(%r12), %zmm1 + vbroadcastsd 40(%r12), %zmm2 + movl 20(%r15), %r11d + testl %r11d, %r11d + jle .LBB5_5 +# %bb.1: + movq 176(%r15), %r9 + movq 192(%r15), %r10 + decq %r11 + leaq 128(%r9), %r8 + xorl %edi, %edi + vxorpd %xmm0, %xmm0, %xmm0 + jmp .LBB5_2 + .p2align 5, 0x90 +.LBB5_19: # in Loop: Header=BB5_2 Depth=1 + cmpq %r11, %rdi + leaq 1(%rdi), %rdi + je .LBB5_5 +.LBB5_2: # =>This Loop Header: Depth=1 + # Child Loop BB5_14 Depth 2 + # Child Loop BB5_18 Depth 2 + imulq $56, %rdi, %rax + movl (%r10,%rax), %ecx + testl %ecx, %ecx + jle .LBB5_19 +# %bb.3: # in Loop: Header=BB5_2 Depth=1 + leal (,%rdi,4), %esi + movl %esi, %eax + andl $2147483640, %eax # imm = 0x7FFFFFF8 + leal (%rax,%rax,2), %eax + andl $4, %esi + orl %eax, %esi + cmpl $7, %ecx + ja .LBB5_13 +# %bb.4: # in Loop: Header=BB5_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + cmpq %rcx, %rbx + jae .LBB5_19 + jmp .LBB5_17 + .p2align 5, 0x90 +.LBB5_13: # in Loop: Header=BB5_2 Depth=1 + leaq (,%rcx,8), %rbx + andq $-64, %rbx + movl %esi, %r12d + leaq (%r9,%r12,8), %rax + xorl %edx, %edx + .p2align 4, 0x90 +.LBB5_14: # Parent Loop BB5_2 Depth=1 + # => This Inner Loop Header: Depth=2 + vmovupd %zmm0, (%rax,%rdx) + addq $64, %rdx + cmpq %rdx, %rbx + jne .LBB5_14 +# %bb.15: # in Loop: Header=BB5_2 Depth=1 + movl %ecx, %ebx + andl $-8, %ebx + addq %rbx, %r12 + vmovupd %zmm0, (%r9,%r12,8) + vmovupd %zmm0, 64(%r9,%r12,8) + cmpq %rcx, %rbx + jae .LBB5_19 +.LBB5_17: # in Loop: Header=BB5_2 Depth=1 + movl %esi, %eax + leaq (%r8,%rax,8), %rdx + .p2align 4, 0x90 +.LBB5_18: # Parent Loop BB5_2 Depth=1 + # => This Inner Loop Header: Depth=2 + movq $0, -128(%rdx,%rbx,8) + movq $0, -64(%rdx,%rbx,8) + movq $0, (%rdx,%rbx,8) + incq %rbx + cmpq %rbx, %rcx + jne .LBB5_18 + jmp .LBB5_19 + .p2align 5, 0x90 +.LBB5_5: + vmovupd %zmm2, 80(%rsp) # 64-byte Spill + vmovupd %zmm1, 144(%rsp) # 64-byte Spill + xorl %eax, %eax + vzeroupper + callq getTimeStamp + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + cmpl $0, 20(%r15) + jle .LBB5_10 +# %bb.6: + vmovsd 16(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm0 + vbroadcastsd %xmm0, %zmm0 + xorl %r11d, %r11d + vbroadcastsd .LCPI5_0(%rip), %zmm1 # zmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vbroadcastsd .LCPI5_1(%rip), %zmm2 # zmm2 = [4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1,4.8E+1] + vmovupd 144(%rsp), %zmm21 # 64-byte Reload + vmovupd 80(%rsp), %zmm22 # 64-byte Reload + jmp .LBB5_7 + .p2align 5, 0x90 +.LBB5_8: # in Loop: Header=BB5_7 Depth=1 + vxorpd %xmm4, %xmm4, %xmm4 + vxorpd %xmm8, %xmm8, %xmm8 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm9, %xmm9, %xmm9 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm6, %xmm6, %xmm6 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm14, %xmm14, %xmm14 +.LBB5_9: # in Loop: Header=BB5_7 Depth=1 + movq 176(%r15), %rcx + vshufpd $85, %zmm15, %zmm14, %zmm3 # zmm3 = zmm14[1],zmm15[0],zmm14[3],zmm15[2],zmm14[5],zmm15[4],zmm14[7],zmm15[6] + vshufpd $170, %zmm15, %zmm14, %zmm14 # zmm14 = zmm14[0],zmm15[1],zmm14[2],zmm15[3],zmm14[4],zmm15[5],zmm14[6],zmm15[7] + vaddpd %zmm3, %zmm14, %zmm3 + vshufpd $85, %zmm12, %zmm13, %zmm14 # zmm14 = zmm13[1],zmm12[0],zmm13[3],zmm12[2],zmm13[5],zmm12[4],zmm13[7],zmm12[6] + vshufpd $170, %zmm12, %zmm13, %zmm12 # zmm12 = zmm13[0],zmm12[1],zmm13[2],zmm12[3],zmm13[4],zmm12[5],zmm13[6],zmm12[7] + vaddpd %zmm14, %zmm12, %zmm12 + vshuff64x2 $78, %zmm12, %zmm3, %zmm13 # zmm13 = zmm3[4,5,6,7],zmm12[0,1,2,3] + vshuff64x2 $228, %zmm12, %zmm3, %zmm3 # zmm3 = zmm3[0,1,2,3],zmm12[4,5,6,7] + vaddpd %zmm13, %zmm3, %zmm3 + vpermpd $78, %zmm3, %zmm12 # zmm12 = zmm3[2,3,0,1,6,7,4,5] + vaddpd %zmm12, %zmm3, %zmm3 + vextractf64x4 $1, %zmm3, %ymm12 + vblendpd $12, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm12[2,3] + vaddpd (%rcx,%r8,8), %ymm3, %ymm3 + vmovupd %ymm3, (%rcx,%r8,8) # AlignMOV convert to UnAlignMOV + vshufpd $85, %zmm10, %zmm11, %zmm3 # zmm3 = zmm11[1],zmm10[0],zmm11[3],zmm10[2],zmm11[5],zmm10[4],zmm11[7],zmm10[6] + vshufpd $170, %zmm10, %zmm11, %zmm10 # zmm10 = zmm11[0],zmm10[1],zmm11[2],zmm10[3],zmm11[4],zmm10[5],zmm11[6],zmm10[7] + vaddpd %zmm3, %zmm10, %zmm3 + vshufpd $85, %zmm8, %zmm9, %zmm10 # zmm10 = zmm9[1],zmm8[0],zmm9[3],zmm8[2],zmm9[5],zmm8[4],zmm9[7],zmm8[6] + vshufpd $170, %zmm8, %zmm9, %zmm8 # zmm8 = zmm9[0],zmm8[1],zmm9[2],zmm8[3],zmm9[4],zmm8[5],zmm9[6],zmm8[7] + vaddpd %zmm10, %zmm8, %zmm8 + vshuff64x2 $78, %zmm8, %zmm3, %zmm9 # zmm9 = zmm3[4,5,6,7],zmm8[0,1,2,3] + vshuff64x2 $228, %zmm8, %zmm3, %zmm3 # zmm3 = zmm3[0,1,2,3],zmm8[4,5,6,7] + vaddpd %zmm9, %zmm3, %zmm3 + vpermpd $78, %zmm3, %zmm8 # zmm8 = zmm3[2,3,0,1,6,7,4,5] + vaddpd %zmm8, %zmm3, %zmm3 + vextractf64x4 $1, %zmm3, %ymm8 + vblendpd $12, %ymm8, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm8[2,3] + vaddpd 64(%rcx,%r8,8), %ymm3, %ymm3 + vmovupd %ymm3, 64(%rcx,%r8,8) # AlignMOV convert to UnAlignMOV + vshufpd $85, %zmm6, %zmm7, %zmm3 # zmm3 = zmm7[1],zmm6[0],zmm7[3],zmm6[2],zmm7[5],zmm6[4],zmm7[7],zmm6[6] + vshufpd $170, %zmm6, %zmm7, %zmm6 # zmm6 = zmm7[0],zmm6[1],zmm7[2],zmm6[3],zmm7[4],zmm6[5],zmm7[6],zmm6[7] + vaddpd %zmm3, %zmm6, %zmm3 + vshufpd $85, %zmm4, %zmm5, %zmm6 # zmm6 = zmm5[1],zmm4[0],zmm5[3],zmm4[2],zmm5[5],zmm4[4],zmm5[7],zmm4[6] + vshufpd $170, %zmm4, %zmm5, %zmm4 # zmm4 = zmm5[0],zmm4[1],zmm5[2],zmm4[3],zmm5[4],zmm4[5],zmm5[6],zmm4[7] + vaddpd %zmm6, %zmm4, %zmm4 + vshuff64x2 $78, %zmm4, %zmm3, %zmm5 # zmm5 = zmm3[4,5,6,7],zmm4[0,1,2,3] + vshuff64x2 $228, %zmm4, %zmm3, %zmm3 # zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] + vaddpd %zmm5, %zmm3, %zmm3 + vpermpd $78, %zmm3, %zmm4 # zmm4 = zmm3[2,3,0,1,6,7,4,5] + vaddpd %zmm4, %zmm3, %zmm3 + vextractf64x4 $1, %zmm3, %ymm4 + vblendpd $12, %ymm4, %ymm3, %ymm3 # ymm3 = ymm3[0,1],ymm4[2,3] + vaddpd 128(%rcx,%r8,8), %ymm3, %ymm3 + vmovupd %ymm3, 128(%rcx,%r8,8) # AlignMOV convert to UnAlignMOV + vmovdqu .LCPI5_2(%rip), %xmm3 # xmm3 = <1,u> + # AlignMOV convert to UnAlignMOV + vpinsrq $1, %r9, %xmm3, %xmm3 + vpaddq (%r13), %xmm3, %xmm3 + vmovdqu %xmm3, (%r13) + addq %r9, 16(%r13) + incq %r11 + movslq 20(%r15), %rcx + cmpq %rcx, %r11 + jge .LBB5_10 +.LBB5_7: # =>This Loop Header: Depth=1 + # Child Loop BB5_12 Depth 2 + leal (,%r11,4), %r8d + movl %r8d, %ecx + andl $2147483640, %ecx # imm = 0x7FFFFFF8 + leal (%rcx,%rcx,2), %ecx + andl $4, %r8d + orl %ecx, %r8d + movq 24(%r14), %rcx + movslq (%rcx,%r11,4), %r9 + testq %r9, %r9 + jle .LBB5_8 +# %bb.11: # in Loop: Header=BB5_7 Depth=1 + movq 160(%r15), %rsi + movq 8(%r14), %rcx + vbroadcastsd (%rsi,%r8,8), %zmm3 + vmovups %zmm3, 16(%rsp) # 64-byte Spill + vbroadcastsd 8(%rsi,%r8,8), %zmm3 + vmovups %zmm3, 528(%rsp) # 64-byte Spill + vbroadcastsd 16(%rsi,%r8,8), %zmm3 + vmovups %zmm3, 464(%rsp) # 64-byte Spill + vbroadcastsd 24(%rsi,%r8,8), %zmm3 + vmovups %zmm3, 400(%rsp) # 64-byte Spill + vbroadcastsd 64(%rsi,%r8,8), %zmm3 + vmovups %zmm3, 336(%rsp) # 64-byte Spill + vbroadcastsd 72(%rsi,%r8,8), %zmm3 + vmovups %zmm3, 272(%rsp) # 64-byte Spill + vbroadcastsd 80(%rsi,%r8,8), %zmm3 + vmovups %zmm3, 208(%rsp) # 64-byte Spill + vbroadcastsd 88(%rsi,%r8,8), %zmm23 + vbroadcastsd 128(%rsi,%r8,8), %zmm24 + vbroadcastsd 136(%rsi,%r8,8), %zmm25 + vbroadcastsd 144(%rsi,%r8,8), %zmm26 + vbroadcastsd 152(%rsi,%r8,8), %zmm27 + movl %r9d, %r9d + movl 16(%r14), %edx + imull %r11d, %edx + movslq %edx, %rdx + leaq (%rcx,%rdx,4), %r10 + vxorpd %xmm14, %xmm14, %xmm14 + xorl %ebx, %ebx + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm7, %xmm7, %xmm7 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm6, %xmm6, %xmm6 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm9, %xmm9, %xmm9 + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm8, %xmm8, %xmm8 + vxorpd %xmm4, %xmm4, %xmm4 + .p2align 4, 0x90 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# LLVM-MCA-BEGIN +# pointer_increment=256 da67166e5736661e6b03ea29ee7bfd67 +.LBB5_12: # Parent Loop BB5_7 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%r10,%rbx,4), %rcx + leaq (%rcx,%rcx,2), %rdx + shlq $6, %rdx + vmovupd (%rsi,%rdx), %zmm28 # AlignMOV convert to UnAlignMOV + vmovupd 64(%rsi,%rdx), %zmm29 # AlignMOV convert to UnAlignMOV + vmovupd 128(%rsi,%rdx), %zmm30 # AlignMOV convert to UnAlignMOV + vmovupd 16(%rsp), %zmm3 # 64-byte Reload + vsubpd %zmm28, %zmm3, %zmm3 + vsubpd %zmm30, %zmm24, %zmm31 + vmovupd 336(%rsp), %zmm16 # 64-byte Reload + vsubpd %zmm29, %zmm16, %zmm16 + vmulpd %zmm31, %zmm31, %zmm17 + vfmadd231pd %zmm16, %zmm16, %zmm17 # zmm17 = (zmm16 * zmm16) + zmm17 + vfmadd231pd %zmm3, %zmm3, %zmm17 # zmm17 = (zmm3 * zmm3) + zmm17 + vrcp14pd %zmm17, %zmm18 + vmulpd %zmm18, %zmm21, %zmm19 + vmulpd %zmm19, %zmm18, %zmm19 + vmulpd %zmm19, %zmm18, %zmm19 + vaddpd %zmm1, %zmm19, %zmm20 + vmulpd %zmm18, %zmm22, %zmm18 + vmulpd %zmm20, %zmm18, %zmm18 + vsubpd %zmm30, %zmm25, %zmm20 + leal (%rcx,%rcx), %edx + cmpq %rdx, %r11 + setne %dl + sete %al + addl %ecx, %ecx + incl %ecx + cmpq %rcx, %r11 + sete %cl + vmulpd %zmm18, %zmm19, %zmm18 + vmovupd 528(%rsp), %zmm19 # 64-byte Reload + vsubpd %zmm28, %zmm19, %zmm19 + setne %dil + movl %edi, %ebp + shlb $4, %bpl + subb %al, %bpl + addb $-17, %bpl + kmovd %ebp, %k1 + vcmpltpd %zmm0, %zmm17, %k1 {%k1} + vmovupd 272(%rsp), %zmm17 # 64-byte Reload + vsubpd %zmm29, %zmm17, %zmm17 + leal (%rdx,%rdx), %eax + movl %edi, %ebp + vmulpd %zmm2, %zmm18, %zmm18 + vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14 + vmulpd %zmm20, %zmm20, %zmm3 + vfmadd231pd %zmm17, %zmm17, %zmm3 # zmm3 = (zmm17 * zmm17) + zmm3 + vfmadd231pd %zmm19, %zmm19, %zmm3 # zmm3 = (zmm19 * zmm19) + zmm3 + vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11 + vrcp14pd %zmm3, %zmm16 + vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7 + vmulpd %zmm16, %zmm21, %zmm18 + vmulpd %zmm18, %zmm16, %zmm18 + vmulpd %zmm18, %zmm16, %zmm18 + vaddpd %zmm1, %zmm18, %zmm31 + vmulpd %zmm16, %zmm22, %zmm16 + vmulpd %zmm31, %zmm16, %zmm16 + vmovupd 464(%rsp), %zmm31 # 64-byte Reload + vsubpd %zmm28, %zmm31, %zmm31 + shlb $5, %bpl + orb %al, %bpl + orb $-35, %bpl + kmovd %ebp, %k1 + vcmpltpd %zmm0, %zmm3, %k1 {%k1} + vmovupd 208(%rsp), %zmm3 # 64-byte Reload + vsubpd %zmm29, %zmm3, %zmm3 + vmulpd %zmm16, %zmm18, %zmm16 + vsubpd %zmm30, %zmm26, %zmm18 + vmulpd %zmm2, %zmm16, %zmm16 + vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15 + vmulpd %zmm18, %zmm18, %zmm19 + vfmadd231pd %zmm3, %zmm3, %zmm19 # zmm19 = (zmm3 * zmm3) + zmm19 + vfmadd231pd %zmm31, %zmm31, %zmm19 # zmm19 = (zmm31 * zmm31) + zmm19 + vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10 + vrcp14pd %zmm19, %zmm17 + vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6 + vmulpd %zmm17, %zmm21, %zmm16 + vmulpd %zmm16, %zmm17, %zmm16 + vmulpd %zmm16, %zmm17, %zmm16 + vaddpd %zmm1, %zmm16, %zmm20 + vmulpd %zmm17, %zmm22, %zmm17 + vmulpd %zmm20, %zmm17, %zmm17 + vmulpd %zmm17, %zmm16, %zmm16 + leal (,%rdx,4), %eax + shlb $6, %dil + orb %al, %dil + orb $-69, %dil + kmovd %edi, %k1 + vcmpltpd %zmm0, %zmm19, %k1 {%k1} + vmovupd 400(%rsp), %zmm17 # 64-byte Reload + vsubpd %zmm28, %zmm17, %zmm17 + vsubpd %zmm29, %zmm23, %zmm19 + vsubpd %zmm30, %zmm27, %zmm20 + vmulpd %zmm2, %zmm16, %zmm16 + vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13 + vmulpd %zmm20, %zmm20, %zmm28 + vfmadd231pd %zmm19, %zmm19, %zmm28 # zmm28 = (zmm19 * zmm19) + zmm28 + vfmadd231pd %zmm17, %zmm17, %zmm28 # zmm28 = (zmm17 * zmm17) + zmm28 + vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9 + vrcp14pd %zmm28, %zmm3 + vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5 + vmulpd %zmm3, %zmm21, %zmm16 + vmulpd %zmm16, %zmm3, %zmm16 + vmulpd %zmm16, %zmm3, %zmm16 + vaddpd %zmm1, %zmm16, %zmm18 + vmulpd %zmm3, %zmm22, %zmm3 + vmulpd %zmm18, %zmm3, %zmm3 + vmulpd %zmm3, %zmm16, %zmm3 + shlb $3, %dl + shlb $7, %cl + orb %dl, %cl + addb $-9, %cl + kmovd %ecx, %k1 + vcmpltpd %zmm0, %zmm28, %k1 {%k1} + vmulpd %zmm2, %zmm3, %zmm3 + vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12 + vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8 + vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4 + incq %rbx + cmpq %rbx, %r9 + jne .LBB5_12 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER + jmp .LBB5_9 + .p2align 5, 0x90 +.LBB5_10: + movl $.L.str.1, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + movl $.L.str.6, %edi + xorl %eax, %eax + callq debug_printf + vmovsd 16(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vsubsd 8(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $600, %rsp # imm = 0x258 + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end5: + .size computeForceLJ_4xn_full, .Lfunc_end5-computeForceLJ_4xn_full + .cfi_endproc + # -- End function + .globl computeForceLJ_4xn # -- Begin function computeForceLJ_4xn + .p2align 4, 0x90 + .type computeForceLJ_4xn,@function +computeForceLJ_4xn: # + .cfi_startproc +# %bb.0: + cmpl $0, 32(%rdx) + je .LBB6_2 +# %bb.1: + jmp computeForceLJ_4xn_half # TAILCALL + .p2align 5, 0x90 +.LBB6_2: + jmp computeForceLJ_4xn_full # TAILCALL +.Lfunc_end6: + .size computeForceLJ_4xn, .Lfunc_end6-computeForceLJ_4xn + .cfi_endproc + # -- End function + .type .L.str,@object # + .section .rodata.str1.1,"aMS",@progbits,1 +.L.str: + .asciz "computeForceLJ begin\n" + .size .L.str, 22 + .type .L.str.1,@object # +.L.str.1: + .asciz "force" + .size .L.str.1, 6 + .type .L.str.2,@object # +.L.str.2: + .asciz "computeForceLJ end\n" + .size .L.str.2, 20 + .type .L.str.3,@object # +.L.str.3: + .asciz "computeForceLJ_2xnn begin\n" + .size .L.str.3, 27 + .type .L.str.4,@object # +.L.str.4: + .asciz "computeForceLJ_2xnn end\n" + .size .L.str.4, 25 + .type .L.str.5,@object # +.L.str.5: + .asciz "computeForceLJ_4xn begin\n" + .size .L.str.5, 26 + .type .L.str.6,@object # +.L.str.6: + .asciz "computeForceLJ_4xn end\n" + .size .L.str.6, 24 + .ident "Intel(R) oneAPI DPC++/C++ Compiler 2022.1.0 (2022.1.0.20220316)" + .section ".note.GNU-stack","",@progbits diff --git a/static_analysis/jan/zen-icx-lammps-avx2.s b/static_analysis/jan/zen-icx-lammps-avx2.s new file mode 100644 index 0000000..c340cad --- /dev/null +++ b/static_analysis/jan/zen-icx-lammps-avx2.s @@ -0,0 +1,676 @@ + .text + .file "force_lj.c" + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJFullNeigh_plain_c +.LCPI0_0: + .quad 4631952216750555136 # 48 +.LCPI0_3: + .quad 4607182418800017408 # 1 +.LCPI0_4: + .quad -4620693217682128896 # -0.5 + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 +.LCPI0_1: + .long 3 # 0x3 +.LCPI0_2: + .long 2 # 0x2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_5: + .zero 16,255 + .text + .globl computeForceLJFullNeigh_plain_c + .p2align 4, 0x90 + .type computeForceLJFullNeigh_plain_c,@function +computeForceLJFullNeigh_plain_c: # +.LcomputeForceLJFullNeigh_plain_c$local: + .cfi_startproc +# %bb.0: # + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $264, %rsp # imm = 0x108 + .cfi_def_cfa_offset 320 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, %rbx + movq %rdx, %r15 + movq %rsi, %r12 + movl 4(%rsi), %r14d + vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, (%rsp) # 8-byte Spill + vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 128(%rsp) # 8-byte Spill + vmovq 56(%rdi), %xmm0 # xmm0 = mem[0],zero + vmovdqa %xmm0, 80(%rsp) # 16-byte Spill + testl %r14d, %r14d + jle .LBB0_2 +# %bb.1: # + movq 64(%r12), %rdi + leaq (,%r14,8), %rax + leaq (%rax,%rax,2), %rdx + xorl %esi, %esi + callq _intel_fast_memset +.LBB0_2: # + xorl %eax, %eax + callq getTimeStamp + vmovq %xmm0, 32(%rsp) # 8-byte Folded Spill + movl $.L.str, %edi + callq likwid_markerStartRegion + testl %r14d, %r14d + jle .LBB0_19 +# %bb.3: # + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm13 + movq 16(%r15), %r11 + movq 24(%r15), %rsi + movslq 8(%r15), %rdi + movq 16(%r12), %r15 + movq 64(%r12), %r8 + vmovsd 128(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd .LCPI0_0(%rip), %xmm0, %xmm15 + movq %rbx, 24(%rsp) # 8-byte Spill + vmovdqu (%rbx), %xmm14 + decq %r14 + vmovq %r15, %xmm0 + vpbroadcastq %xmm0, %ymm3 + vbroadcastsd %xmm13, %ymm2 + vmovapd 80(%rsp), %xmm12 # 16-byte Reload + vbroadcastsd %xmm12, %ymm8 + vbroadcastsd %xmm15, %ymm9 + shlq $2, %rdi + xorl %r10d, %r10d + movq %r14, 56(%rsp) # 8-byte Spill + vmovapd %xmm13, 192(%rsp) # 16-byte Spill + movq %rsi, 48(%rsp) # 8-byte Spill + movq %rdi, 40(%rsp) # 8-byte Spill + vmovapd %xmm15, 176(%rsp) # 16-byte Spill + vmovupd %ymm2, 224(%rsp) # 32-byte Spill + vmovupd %ymm9, 128(%rsp) # 32-byte Spill + jmp .LBB0_6 + .p2align 4, 0x90 +.LBB0_17: # + # in Loop: Header=BB0_6 Depth=1 + movq %r13, %rdx +.LBB0_5: # + # in Loop: Header=BB0_6 Depth=1 + vaddsd (%r8,%r12,8), %xmm10, %xmm0 + vmovsd %xmm0, (%r8,%r12,8) + vaddsd (%r8,%rbx,8), %xmm11, %xmm0 + vmovsd %xmm0, (%r8,%rbx,8) + vaddsd (%r8,%rbp,8), %xmm5, %xmm0 + vmovsd %xmm0, (%r8,%rbp,8) + leal 3(%r13), %eax + addl $6, %r13d + testl %eax, %eax + cmovnsl %eax, %r13d + sarl $2, %r13d + movslq %r13d, %rax + vmovq %rax, %xmm0 + vmovq %rdx, %xmm1 + vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] + vpaddq %xmm0, %xmm14, %xmm14 + addq %rdi, %r11 + cmpq %r14, %r10 + leaq 1(%r10), %r10 + je .LBB0_18 +.LBB0_6: # + # =>This Loop Header: Depth=1 + # Child Loop BB0_9 Depth 2 + # Child Loop BB0_13 Depth 2 + movl (%rsi,%r10,4), %r13d + leal (%r10,%r10,2), %r12d + leal (%r10,%r10,2), %ebx + incl %ebx + leal (%r10,%r10,2), %ebp + addl $2, %ebp + testl %r13d, %r13d + jle .LBB0_4 +# %bb.7: # + # in Loop: Header=BB0_6 Depth=1 + vmovsd (%r15,%r12,8), %xmm0 # xmm0 = mem[0],zero + vmovsd (%r15,%rbx,8), %xmm1 # xmm1 = mem[0],zero + vmovsd (%r15,%rbp,8), %xmm2 # xmm2 = mem[0],zero + movq %r13, %rdx + movl $4294967292, %eax # imm = 0xFFFFFFFC + andq %rax, %rdx + vmovapd %xmm0, 112(%rsp) # 16-byte Spill + vmovapd %xmm1, 96(%rsp) # 16-byte Spill + vmovapd %xmm2, (%rsp) # 16-byte Spill + je .LBB0_16 +# %bb.8: # + # in Loop: Header=BB0_6 Depth=1 + movq %rbp, 64(%rsp) # 8-byte Spill + movq %rbx, 72(%rsp) # 8-byte Spill + vmovdqa %xmm14, 208(%rsp) # 16-byte Spill + vbroadcastsd %xmm0, %ymm14 + vbroadcastsd %xmm1, %ymm5 + vbroadcastsd %xmm2, %ymm10 + vxorpd %xmm0, %xmm0, %xmm0 + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm13, %xmm13, %xmm13 + xorl %ebp, %ebp + vmovapd %ymm8, %ymm9 + vmovupd 224(%rsp), %ymm8 # 32-byte Reload + .p2align 4, 0x90 +movl $111, %ebx # OSACA START MARKER +.byte 100 # OSACA START MARKER +.byte 103 # OSACA START MARKER +.byte 144 # OSACA START MARKER +# LLVM-MCA-BEGIN +# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc +.LBB0_9: # + # Parent Loop BB0_6 Depth=1 + # => This Inner Loop Header: Depth=2 + vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3] + vpmulld (%r11,%rbp,4), %xmm1, %xmm11 + vpmovsxdq %xmm11, %ymm1 + vpsllq $3, %ymm1, %ymm1 + vpaddq %ymm1, %ymm3, %ymm1 + vmovq %xmm1, %r14 + vpextrq $1, %xmm1, %r9 + vextracti128 $1, %ymm1, %xmm1 + vmovsd (%r14), %xmm2 # xmm2 = mem[0],zero + vpsubd .LCPI0_5, %xmm11, %xmm6 + vpmovsxdq %xmm6, %ymm6 + vpsllq $3, %ymm6, %ymm6 + vmovq %xmm1, %rdi + vpaddq %ymm6, %ymm3, %ymm6 + vmovq %xmm6, %rcx + vpextrq $1, %xmm1, %rbx + vpextrq $1, %xmm6, %rax + vextracti128 $1, %ymm6, %xmm1 + vmovsd (%rdi), %xmm6 # xmm6 = mem[0],zero + vmovq %xmm1, %rdi + vpextrq $1, %xmm1, %rsi + vmovsd (%rdi), %xmm1 # xmm1 = mem[0],zero + vmovsd (%rcx), %xmm7 # xmm7 = mem[0],zero + vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2] + vmovhpd (%r9), %xmm2, %xmm2 # xmm2 = xmm2[0],mem[0] + vpaddd %xmm12, %xmm11, %xmm4 + vpmovsxdq %xmm4, %ymm4 + vmovhpd (%rax), %xmm7, %xmm7 # xmm7 = xmm7[0],mem[0] + vpsllq $3, %ymm4, %ymm4 + vpaddq %ymm4, %ymm3, %ymm4 + vmovhpd (%rbx), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0] + vpextrq $1, %xmm4, %rax + vmovhpd (%rsi), %xmm1, %xmm1 # xmm1 = xmm1[0],mem[0] + vmovq %xmm4, %rcx + vextracti128 $1, %ymm4, %xmm4 + vmovq %xmm4, %rsi + vinsertf128 $1, %xmm6, %ymm2, %ymm2 + vpextrq $1, %xmm4, %rdi + vmovsd (%rsi), %xmm4 # xmm4 = mem[0],zero + vsubpd %ymm2, %ymm14, %ymm2 + vmovhpd (%rdi), %xmm4, %xmm4 # xmm4 = xmm4[0],mem[0] + vmovsd (%rcx), %xmm6 # xmm6 = mem[0],zero + vinsertf128 $1, %xmm1, %ymm7, %ymm1 + vmovhpd (%rax), %xmm6, %xmm6 # xmm6 = xmm6[0],mem[0] + vinsertf128 $1, %xmm4, %ymm6, %ymm4 + vsubpd %ymm1, %ymm5, %ymm1 + vsubpd %ymm4, %ymm10, %ymm4 + vmulpd %ymm2, %ymm2, %ymm6 + vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6 + vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6 + vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] + vdivpd %ymm6, %ymm7, %ymm7 + vmulpd %ymm7, %ymm7, %ymm11 + vmulpd %ymm9, %ymm11, %ymm11 + vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] + vmulpd %ymm7, %ymm11, %ymm11 + vaddpd %ymm12, %ymm11, %ymm12 + vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload + vmulpd %ymm7, %ymm11, %ymm7 + vmulpd %ymm7, %ymm12, %ymm7 + vcmpltpd %ymm8, %ymm6, %ymm6 + vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0 + vblendvpd %ymm6, %ymm2, %ymm0, %ymm0 + vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15 + vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13 + vblendvpd %ymm6, %ymm1, %ymm15, %ymm15 + vblendvpd %ymm6, %ymm4, %ymm13, %ymm13 + addq $4, %rbp + cmpq %rdx, %rbp + jb .LBB0_9 +# LLVM-MCA-END +movl $222, %ebx # OSACA END MARKER +.byte 100 # OSACA END MARKER +.byte 103 # OSACA END MARKER +.byte 144 # OSACA END MARKER +# %bb.10: # + # in Loop: Header=BB0_6 Depth=1 + vpermilpd $1, %xmm0, %xmm1 # xmm1 = xmm0[1,0] + vaddsd %xmm1, %xmm0, %xmm1 + vextractf128 $1, %ymm0, %xmm0 + vaddsd %xmm0, %xmm1, %xmm1 + vpermilpd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0] + vaddsd %xmm0, %xmm1, %xmm10 + vpermilpd $1, %xmm15, %xmm1 # xmm1 = xmm15[1,0] + vaddsd %xmm1, %xmm15, %xmm1 + vextractf128 $1, %ymm15, %xmm2 + vaddsd %xmm2, %xmm1, %xmm1 + vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0] + vaddsd %xmm2, %xmm1, %xmm11 + vpermilpd $1, %xmm13, %xmm1 # xmm1 = xmm13[1,0] + vaddsd %xmm1, %xmm13, %xmm1 + vextractf128 $1, %ymm13, %xmm2 + vaddsd %xmm2, %xmm1, %xmm1 + vpermilpd $1, %xmm2, %xmm2 # xmm2 = xmm2[1,0] + vaddsd %xmm2, %xmm1, %xmm5 + movq 56(%rsp), %r14 # 8-byte Reload + vmovapd 80(%rsp), %xmm12 # 16-byte Reload + vmovapd 192(%rsp), %xmm13 # 16-byte Reload + movq 48(%rsp), %rsi # 8-byte Reload + movq 40(%rsp), %rdi # 8-byte Reload + vmovdqa 208(%rsp), %xmm14 # 16-byte Reload + vmovapd 176(%rsp), %xmm15 # 16-byte Reload + vmovapd %ymm9, %ymm8 + movq 72(%rsp), %rbx # 8-byte Reload + movq 64(%rsp), %rbp # 8-byte Reload + vmovapd 112(%rsp), %xmm0 # 16-byte Reload + cmpq %r13, %rdx + jae .LBB0_17 + jmp .LBB0_11 + .p2align 4, 0x90 +.LBB0_4: # + # in Loop: Header=BB0_6 Depth=1 + movslq %r13d, %rdx + vxorpd %xmm5, %xmm5, %xmm5 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm10, %xmm10, %xmm10 + jmp .LBB0_5 + .p2align 4, 0x90 +.LBB0_16: # + # in Loop: Header=BB0_6 Depth=1 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm5, %xmm5, %xmm5 + cmpq %r13, %rdx + jae .LBB0_17 +.LBB0_11: # + # in Loop: Header=BB0_6 Depth=1 + vmovapd 96(%rsp), %xmm4 # 16-byte Reload + jmp .LBB0_13 + .p2align 4, 0x90 +.LBB0_12: # + # in Loop: Header=BB0_13 Depth=2 + incq %rdx + cmpq %rdx, %r13 + je .LBB0_17 +.LBB0_13: # + # Parent Loop BB0_6 Depth=1 + # => This Inner Loop Header: Depth=2 + movl (%r11,%rdx,4), %eax + leal (%rax,%rax,2), %ecx + movslq %ecx, %rcx + vsubsd (%r15,%rcx,8), %xmm0, %xmm6 + leal (%rax,%rax,2), %ecx + incl %ecx + movslq %ecx, %rcx + vsubsd (%r15,%rcx,8), %xmm4, %xmm2 + leal 2(%rax,%rax,2), %eax + cltq + vmovapd (%rsp), %xmm1 # 16-byte Reload + vsubsd (%r15,%rax,8), %xmm1, %xmm1 + vmulsd %xmm6, %xmm6, %xmm7 + vfmadd231sd %xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7 + vfmadd231sd %xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7 + vucomisd %xmm13, %xmm7 + jae .LBB0_12 +# %bb.14: # + # in Loop: Header=BB0_13 Depth=2 + vmovsd .LCPI0_3(%rip), %xmm0 # xmm0 = mem[0],zero + vdivsd %xmm7, %xmm0, %xmm7 + vmulsd %xmm7, %xmm7, %xmm0 + vmulsd %xmm0, %xmm12, %xmm0 + vmulsd %xmm7, %xmm0, %xmm0 + vaddsd .LCPI0_4(%rip), %xmm0, %xmm4 + vmulsd %xmm7, %xmm15, %xmm7 + vmulsd %xmm0, %xmm7, %xmm0 + vmulsd %xmm4, %xmm0, %xmm0 + vmovapd 96(%rsp), %xmm4 # 16-byte Reload + vfmadd231sd %xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10 + vfmadd231sd %xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11 + vfmadd231sd %xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5 + vmovapd 112(%rsp), %xmm0 # 16-byte Reload + jmp .LBB0_12 +.LBB0_18: # + movq 24(%rsp), %rax # 8-byte Reload + vmovdqu %xmm14, (%rax) +.LBB0_19: # + movl $.L.str, %edi + vzeroupper + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + vsubsd 32(%rsp), %xmm0, %xmm0 # 8-byte Folded Reload + addq $264, %rsp # imm = 0x108 + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end0: + .size computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c + .cfi_endproc + # -- End function + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 # -- Begin function computeForceLJHalfNeigh +.LCPI1_0: + .quad 4631952216750555136 # 48 +.LCPI1_1: + .quad 4607182418800017408 # 1 +.LCPI1_2: + .quad -4620693217682128896 # -0.5 +.LCPI1_3: + .quad 4741671816366391296 # 1.0E+9 + .text + .globl computeForceLJHalfNeigh + .p2align 4, 0x90 + .type computeForceLJHalfNeigh,@function +computeForceLJHalfNeigh: # +.LcomputeForceLJHalfNeigh$local: + .cfi_startproc +# %bb.0: # + pushq %rbp + .cfi_def_cfa_offset 16 + pushq %r15 + .cfi_def_cfa_offset 24 + pushq %r14 + .cfi_def_cfa_offset 32 + pushq %r13 + .cfi_def_cfa_offset 40 + pushq %r12 + .cfi_def_cfa_offset 48 + pushq %rbx + .cfi_def_cfa_offset 56 + subq $56, %rsp + .cfi_def_cfa_offset 112 + .cfi_offset %rbx, -56 + .cfi_offset %r12, -48 + .cfi_offset %r13, -40 + .cfi_offset %r14, -32 + .cfi_offset %r15, -24 + .cfi_offset %rbp, -16 + movq %rcx, 24(%rsp) # 8-byte Spill + movq %rdx, %r12 + movq %rsi, %r13 + movl 4(%rsi), %r15d + vmovsd 144(%rdi), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, (%rsp) # 8-byte Spill + vmovsd 40(%rdi), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 8(%rsp) # 8-byte Spill + movq %rdi, 40(%rsp) # 8-byte Spill + vmovsd 56(%rdi), %xmm0 # xmm0 = mem[0],zero + vmovsd %xmm0, 16(%rsp) # 8-byte Spill + testl %r15d, %r15d + jle .LBB1_2 +# %bb.1: # + movq 64(%r13), %rdi + leaq (,%r15,8), %rax + leaq (%rax,%rax,2), %rdx + xorl %esi, %esi + callq _intel_fast_memset +.LBB1_2: # + xorl %r14d, %r14d + xorl %eax, %eax + callq getTimeStamp + vmovsd %xmm0, 32(%rsp) # 8-byte Spill + movl $.L.str.1, %edi + callq likwid_markerStartRegion + testl %r15d, %r15d + jle .LBB1_8 +# %bb.3: # + vmovsd (%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd %xmm0, %xmm0, %xmm12 + movq 16(%r12), %rax + movq 24(%r12), %rcx + movq %rcx, (%rsp) # 8-byte Spill + movslq 8(%r12), %rdx + movq 16(%r13), %rsi + movq 64(%r13), %rdi + vmovsd 8(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + vmulsd .LCPI1_0(%rip), %xmm0, %xmm11 + movq 24(%rsp), %rcx # 8-byte Reload + vmovdqu (%rcx), %xmm10 + shlq $2, %rdx + movq %rdx, 48(%rsp) # 8-byte Spill + xorl %r13d, %r13d + xorl %r14d, %r14d + jmp .LBB1_4 + .p2align 4, 0x90 +.LBB1_14: # + # in Loop: Header=BB1_4 Depth=1 + movq 8(%rsp), %rbp # 8-byte Reload +.LBB1_6: # + # in Loop: Header=BB1_4 Depth=1 + addl %r10d, %r14d + vaddsd (%rdi,%r12,8), %xmm14, %xmm0 + vmovsd %xmm0, (%rdi,%r12,8) + vaddsd (%rdi,%rbp,8), %xmm15, %xmm0 + vmovsd %xmm0, (%rdi,%rbp,8) + vaddsd (%rdi,%r11,8), %xmm13, %xmm0 + vmovsd %xmm0, (%rdi,%r11,8) + leal 3(%r10), %ecx + addl $6, %r10d + testl %ecx, %ecx + cmovnsl %ecx, %r10d + sarl $2, %r10d + movslq %r10d, %rcx + vmovq %rcx, %xmm0 + vmovq %rdx, %xmm1 + vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0] + vpaddq %xmm0, %xmm10, %xmm10 + incq %r13 + addq 48(%rsp), %rax # 8-byte Folded Reload + cmpq %r15, %r13 + je .LBB1_7 +.LBB1_4: # + # =>This Loop Header: Depth=1 + # Child Loop BB1_10 Depth 2 + movq (%rsp), %rcx # 8-byte Reload + movslq (%rcx,%r13,4), %r10 + leaq (,%r13,2), %rcx + addq %r13, %rcx + leal 1(%rcx), %ebp + leal 2(%rcx), %r11d + movl %ecx, %r12d + testq %r10, %r10 + jle .LBB1_5 +# %bb.9: # + # in Loop: Header=BB1_4 Depth=1 + vmovsd (%rsi,%r12,8), %xmm9 # xmm9 = mem[0],zero + movq %rbp, 8(%rsp) # 8-byte Spill + vmovsd (%rsi,%rbp,8), %xmm4 # xmm4 = mem[0],zero + vmovsd (%rsi,%r11,8), %xmm1 # xmm1 = mem[0],zero + movl %r10d, %edx + vxorpd %xmm14, %xmm14, %xmm14 + xorl %ebx, %ebx + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm13, %xmm13, %xmm13 + jmp .LBB1_10 + .p2align 4, 0x90 +.LBB1_13: # + # in Loop: Header=BB1_10 Depth=2 + incq %rbx + cmpq %rbx, %rdx + je .LBB1_14 +.LBB1_10: # + # Parent Loop BB1_4 Depth=1 + # => This Inner Loop Header: Depth=2 + movslq (%rax,%rbx,4), %r9 + leaq (%r9,%r9,2), %r8 + vsubsd (%rsi,%r8,8), %xmm9, %xmm2 + movslq %r8d, %rcx + vsubsd 8(%rsi,%rcx,8), %xmm4, %xmm5 + vsubsd 16(%rsi,%rcx,8), %xmm1, %xmm0 + vmulsd %xmm2, %xmm2, %xmm6 + vfmadd231sd %xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6 + vfmadd231sd %xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6 + vucomisd %xmm12, %xmm6 + jae .LBB1_13 +# %bb.11: # + # in Loop: Header=BB1_10 Depth=2 + vmovsd .LCPI1_1(%rip), %xmm3 # xmm3 = mem[0],zero + vdivsd %xmm6, %xmm3, %xmm6 + vmulsd 16(%rsp), %xmm6, %xmm7 # 8-byte Folded Reload + vmulsd %xmm6, %xmm6, %xmm8 + vmulsd %xmm7, %xmm8, %xmm7 + vaddsd .LCPI1_2(%rip), %xmm7, %xmm3 + vmulsd %xmm6, %xmm11, %xmm6 + vmulsd %xmm7, %xmm6, %xmm6 + vmulsd %xmm3, %xmm6, %xmm3 + vmulsd %xmm2, %xmm3, %xmm6 + vaddsd %xmm6, %xmm14, %xmm14 + vmulsd %xmm5, %xmm3, %xmm2 + vaddsd %xmm2, %xmm15, %xmm15 + vmulsd %xmm0, %xmm3, %xmm0 + vaddsd %xmm0, %xmm13, %xmm13 + cmpl %r15d, %r9d + jge .LBB1_13 +# %bb.12: # + # in Loop: Header=BB1_10 Depth=2 + leaq 1(%rcx), %rbp + addq $2, %rcx + vmovsd (%rdi,%r8,8), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm6, %xmm3, %xmm3 + vmovsd %xmm3, (%rdi,%r8,8) + vmovsd (%rdi,%rbp,8), %xmm3 # xmm3 = mem[0],zero + vsubsd %xmm2, %xmm3, %xmm2 + vmovsd %xmm2, (%rdi,%rbp,8) + vmovsd (%rdi,%rcx,8), %xmm2 # xmm2 = mem[0],zero + vsubsd %xmm0, %xmm2, %xmm0 + vmovsd %xmm0, (%rdi,%rcx,8) + jmp .LBB1_13 + .p2align 4, 0x90 +.LBB1_5: # + # in Loop: Header=BB1_4 Depth=1 + vxorpd %xmm13, %xmm13, %xmm13 + movq %r10, %rdx + vxorpd %xmm15, %xmm15, %xmm15 + vxorpd %xmm14, %xmm14, %xmm14 + jmp .LBB1_6 +.LBB1_7: # + movq 24(%rsp), %rax # 8-byte Reload + vmovdqu %xmm10, (%rax) +.LBB1_8: # + movl $.L.str.1, %edi + callq likwid_markerStopRegion + xorl %eax, %eax + callq getTimeStamp + movq 40(%rsp), %rax # 8-byte Reload + vmovsd 264(%rax), %xmm3 # xmm3 = mem[0],zero + vsubsd 32(%rsp), %xmm0, %xmm2 # 8-byte Folded Reload + vmulsd .LCPI1_3(%rip), %xmm3, %xmm0 + vmulsd %xmm2, %xmm0, %xmm0 + vmovapd %xmm2, %xmm1 + vmovsd %xmm2, 16(%rsp) # 8-byte Spill + movl %r14d, %eax + vxorps %xmm12, %xmm12, %xmm12 + vcvtsi2sd %rax, %xmm12, %xmm2 + vdivsd %xmm2, %xmm0, %xmm2 + movl $.L.str.2, %edi + movl %r14d, %esi + vmovapd %xmm3, %xmm0 + movb $3, %al + callq printf + vmovsd 16(%rsp), %xmm0 # 8-byte Reload + # xmm0 = mem[0],zero + addq $56, %rsp + .cfi_def_cfa_offset 56 + popq %rbx + .cfi_def_cfa_offset 48 + popq %r12 + .cfi_def_cfa_offset 40 + popq %r13 + .cfi_def_cfa_offset 32 + popq %r14 + .cfi_def_cfa_offset 24 + popq %r15 + .cfi_def_cfa_offset 16 + popq %rbp + .cfi_def_cfa_offset 8 + retq +.Lfunc_end1: + .size computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh + .cfi_endproc + # -- End function + .globl computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd + .p2align 4, 0x90 + .type computeForceLJFullNeigh_simd,@function +computeForceLJFullNeigh_simd: # +.LcomputeForceLJFullNeigh_simd$local: + .cfi_startproc +# %bb.0: # + pushq %rax + .cfi_def_cfa_offset 16 + movl 4(%rsi), %eax + testl %eax, %eax + jle .LBB2_2 +# %bb.1: # + movq 64(%rsi), %rdi + shlq $3, %rax + leaq (%rax,%rax,2), %rdx + xorl %esi, %esi + callq _intel_fast_memset +.LBB2_2: # + xorl %eax, %eax + callq getTimeStamp + movl $.L.str, %edi + callq likwid_markerStartRegion + movq stderr(%rip), %rcx + movl $.L.str.3, %edi + movl $65, %esi + movl $1, %edx + callq fwrite + movl $-1, %edi + callq exit +.Lfunc_end2: + .size computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd + .cfi_endproc + # -- End function + .type .L.str,@object # + .section .rodata.str1.1,"aMS",@progbits,1 +.L.str: + .asciz "force" + .size .L.str, 6 + .type .L.str.1,@object # +.L.str.1: + .asciz "forceLJ-halfneigh" + .size .L.str.1, 18 + .type .L.str.2,@object # +.L.str.2: + .asciz "Its: %u Freq: %f Time: %f\nCy/it: %f (half-neigh)\n" + .size .L.str.2, 52 + .type .L.str.3,@object # +.L.str.3: + .asciz "Error: SIMD kernel not implemented for specified instruction set!" + .size .L.str.3, 66 + .ident "Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)" + .section ".note.GNU-stack","",@progbits