Update force kernel to be integrated
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
		| @@ -10,224 +10,153 @@ computeForce: | |||||||
|         push      r12 |         push      r12 | ||||||
|         push      r13 |         push      r13 | ||||||
|         push      r14 |         push      r14 | ||||||
|  |         mov       r9d, DWORD PTR [4+rsi]                            # r9d <- atom->Nlocal | ||||||
| # r9d <- atom->Nlocal |         vmovsd    xmm2, QWORD PTR [72+rdi]                          # xmm2 <- param->cutforce | ||||||
|         mov       r9d, DWORD PTR [4+rsi] |         vmovsd    xmm1, QWORD PTR [8+rdi]                           # xmm1 <- param->sigma6 | ||||||
| # xmm2 <- param->cutforce |         vmovsd    xmm0, QWORD PTR [rdi]                             # xmm0 <- param->epsilon | ||||||
|         vmovsd    xmm2, QWORD PTR [72+rdi] |         mov       r13, QWORD PTR [64+rsi]                           # r13 <- atom->fx | ||||||
| # xmm1 <- param->sigma6 |         mov       r14, QWORD PTR [72+rsi]                           # r14 <- atom->fy | ||||||
|         vmovsd    xmm1, QWORD PTR [8+rdi] |         mov       rdi, QWORD PTR [80+rsi]                           # rdi <- atom->fz | ||||||
| # xmm0 <- param->epsilon |         test      r9d, r9d                                          # atom->Nlocal <= 0 | ||||||
|         vmovsd    xmm0, QWORD PTR [rdi] |         jle       ..exit_func | ||||||
|  |  | ||||||
| # r13 <- atom->fx |  | ||||||
|         mov       r13, QWORD PTR [64+rsi] |  | ||||||
| # r14 <- atom->fy |  | ||||||
|         mov       r14, QWORD PTR [72+rsi] |  | ||||||
| # rdi <- atom->fz |  | ||||||
|         mov       rdi, QWORD PTR [80+rsi] |  | ||||||
|  |  | ||||||
| # atom->Nlocal <= 0 |  | ||||||
|         test      r9d, r9d |  | ||||||
|         jle       ..B1.30 |  | ||||||
|  |  | ||||||
| ..B1.2: | ..B1.2: | ||||||
| # r10d <- 0 |         xor       r10d, r10d                                        # r10d <- 0 | ||||||
|         xor       r10d, r10d |         mov       ecx, r9d                                          # ecx <- atom->Nlocal | ||||||
| # ecx <- atom->Nlocal |         xor       r8d, r8d                                          # r8d <- 0 | ||||||
|         mov       ecx, r9d |         mov       r11d, 1                                           # r11d <- 1 | ||||||
| # r8d <- 0 |         xor       eax, eax                                          # eax <- 0 | ||||||
|         xor       r8d, r8d |         shr       ecx, 1                                            # ecx <- atom->Nlocal >> 1 | ||||||
| # r11d <- 1 |         je        ..B1.6                                            # ecx == 0 | ||||||
|         mov       r11d, 1 |  | ||||||
| # eax <- 0 |  | ||||||
|         xor       eax, eax |  | ||||||
| # ecx <- atom->Nlocal >> 1 |  | ||||||
|         shr       ecx, 1 |  | ||||||
|         je        ..B1.6 |  | ||||||
|  |  | ||||||
| # Init forces to zero loop | # Init forces to zero loop | ||||||
| ..B1.4: | ..B1.4: | ||||||
| # fx[i] <- 0 |         mov       QWORD PTR [r8+r13], rax                           # fx[i] <- 0 | ||||||
|         mov       QWORD PTR [r8+r13], rax |         mov       QWORD PTR [r8+r14], rax                           # fy[i] <- 0 | ||||||
| # i++ |         mov       QWORD PTR [r8+rdi], rax                           # fz[i] <- 0 | ||||||
|         inc       r10 |         mov       QWORD PTR [8+r8+r13], rax                         # fx[i + 1] <- 0 | ||||||
| # fy[i] <- 0 |         mov       QWORD PTR [8+r8+r14], rax                         # fy[i + 1] <- 0 | ||||||
|         mov       QWORD PTR [r8+r14], rax |         mov       QWORD PTR [8+r8+rdi], rax                         # fz[i + 1] <- 0 | ||||||
| # fz[i] <- 0 |         add       r8, 16                                            # byte offset += 16 (two doubles per iteration) | ||||||
|         mov       QWORD PTR [r8+rdi], rax |         inc       r10                                               # pair counter++ | ||||||
| # fx[i] <- 0 |         cmp       r10, rcx                                          # pair counter < (Nlocal >> 1) | ||||||
|         mov       QWORD PTR [8+r8+r13], rax |  | ||||||
| # fy[i] <- 0 |  | ||||||
|         mov       QWORD PTR [8+r8+r14], rax |  | ||||||
| # fz[i] <- 0 |  | ||||||
|         mov       QWORD PTR [8+r8+rdi], rax |  | ||||||
| # i++ |  | ||||||
|         add       r8, 16 |  | ||||||
| # i < Nlocal |  | ||||||
|         cmp       r10, rcx |  | ||||||
|         jb        ..B1.4 |         jb        ..B1.4 | ||||||
|  |  | ||||||
| ..B1.5: | ..B1.5: | ||||||
| # r11d <- i * 2 + 1 |         lea       r11d, DWORD PTR [1+r10+r10]                       # r11d <- i * 2 + 1 | ||||||
|         lea       r11d, DWORD PTR [1+r10+r10] |  | ||||||
| ..B1.6: | ..B1.6: | ||||||
| # ecx <- i * 2 |         lea       ecx, DWORD PTR [-1+r11]                           # ecx <- i * 2 | ||||||
|         lea       ecx, DWORD PTR [-1+r11] |         cmp       ecx, r9d                                          # i < Nlocal | ||||||
| # i < Nlocal |  | ||||||
|         cmp       ecx, r9d |  | ||||||
|         jae       ..B1.8 |         jae       ..B1.8 | ||||||
|  |  | ||||||
| ..B1.7: | ..B1.7: | ||||||
|  |         movsxd    r11, r11d                                         # r11 <- i * 2 | ||||||
| # r11 <- i * 2 |         mov       QWORD PTR [-8+r13+r11*8], rax                     # fx[i] <- 0 | ||||||
|         movsxd    r11, r11d |         mov       QWORD PTR [-8+r14+r11*8], rax                     # fy[i] <- 0 | ||||||
| # fx[i] <- 0 |         mov       QWORD PTR [-8+rdi+r11*8], rax                     # fz[i] <- 0 | ||||||
|         mov       QWORD PTR [-8+r13+r11*8], rax |  | ||||||
| # fy[i] <- 0 |  | ||||||
|         mov       QWORD PTR [-8+r14+r11*8], rax |  | ||||||
| # fz[i] <- 0 |  | ||||||
|         mov       QWORD PTR [-8+rdi+r11*8], rax |  | ||||||
|  |  | ||||||
| ..B1.8: | ..B1.8: | ||||||
|  |         vmulsd    xmm15, xmm2, xmm2                                 # xmm15 <- cutforcesq | ||||||
|  |         xor       r8d, r8d                                          # r8d <- 0 | ||||||
|  |         vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip]      # ymm18 <- [8, ...] | ||||||
|  |         vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip]   # xmm0 <- 48 *  epsilon | ||||||
|  |         vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip]      # ymm17 <- [0..7] | ||||||
|  |         vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip]       # zmm7 <- [0.5, ...] | ||||||
|  |         vbroadcastsd zmm16, xmm15                                   # zmm16 <- [cutforcesq, ...] | ||||||
|  |         vbroadcastsd zmm15, xmm1                                    # zmm15 <- [param->sigma6, ...] | ||||||
|  |         vbroadcastsd zmm14, xmm0                                    # zmm14 <- [48 * epsilon, ...] | ||||||
|  |         movsxd    r9, r9d                                           # r9 <- atom->Nlocal | ||||||
|  |         xor       r10d, r10d                                        # r10d <- 0 (i) | ||||||
|  |         mov       rcx, QWORD PTR [24+rdx]                           # rcx <- neighbor->numneigh | ||||||
|  |         mov       r11, QWORD PTR [8+rdx]                            # r11 <- neighbor->neighbors | ||||||
|  |         movsxd    r12, DWORD PTR [16+rdx]                           # r12 <- neighbor->maxneighs | ||||||
|  |         mov       rdx, QWORD PTR [16+rsi]                           # rdx <- atom->x | ||||||
|  |  | ||||||
| # xmm15 <- cutforcesq |         ### AOS | ||||||
|         vmulsd    xmm15, xmm2, xmm2 |         xor        eax, eax | ||||||
| # r8d <- 0 |         ### SOA | ||||||
|         xor       r8d, r8d |         #mov       rax, QWORD PTR [24+rsi]                           # rax <- atom->y | ||||||
| # ymm18 <- 8 |         #mov       rsi, QWORD PTR [32+rsi]                           # rsi <- atom->z | ||||||
|         vmovdqu32 ymm18, YMMWORD PTR .L_2il0floatpacket.0[rip] |         ### | ||||||
| # xmm0 <- 48 *  epsilon |  | ||||||
|         vmulsd    xmm0, xmm0, QWORD PTR .L_2il0floatpacket.3[rip] |  | ||||||
| # ymm17 <- [0..7] |  | ||||||
|         vmovdqu32 ymm17, YMMWORD PTR .L_2il0floatpacket.1[rip] |  | ||||||
| # zmm7 <- 0.5 |  | ||||||
|         vmovups   zmm7, ZMMWORD PTR .L_2il0floatpacket.4[rip] |  | ||||||
| # zmm16 <- cutforcesq |  | ||||||
|         vbroadcastsd zmm16, xmm15 |  | ||||||
| # zmm15 <- param->sigma6 |  | ||||||
|         vbroadcastsd zmm15, xmm1 |  | ||||||
| # zmm14 <- 48 * epsilon |  | ||||||
|         vbroadcastsd zmm14, xmm0 |  | ||||||
| # r9 <- atom->Nlocal |  | ||||||
|         movsxd    r9, r9d |  | ||||||
| # r10d <- 0 (i) |  | ||||||
|         xor       r10d, r10d |  | ||||||
| # rcx <- neighbor->numneigh |  | ||||||
|         mov       rcx, QWORD PTR [24+rdx] |  | ||||||
| # r11 <- neighbor->neighbors |  | ||||||
|         mov       r11, QWORD PTR [8+rdx] |  | ||||||
| # r12 <- neighbor->maxneighs |  | ||||||
|         movsxd    r12, DWORD PTR [16+rdx] |  | ||||||
|  |  | ||||||
| # rdx <- atom->x |         shl       r12, 2                                            # r12 <- neighbor->maxneighs * 4 | ||||||
|         mov       rdx, QWORD PTR [16+rsi] |         mov       QWORD PTR [-32+rsp], r9                           # [-32+rsp] <- atom->Nlocal | ||||||
| # rax <- atom->y |         mov       QWORD PTR [-24+rsp], rcx                          # [-24+rsp] <- neighbor->numneigh | ||||||
|         mov       rax, QWORD PTR [24+rsi] |         mov       QWORD PTR [-16+rsp], r14                          # [-16+rsp] <- atom->fy | ||||||
| # rsi <- atom->z |         mov       QWORD PTR [-8+rsp], r13                           # [-8+rsp] <- atom->fx | ||||||
|         mov       rsi, QWORD PTR [32+rsi] |         mov       QWORD PTR [-40+rsp], r15                          # [-40+rsp] <- r15 | ||||||
|  |         mov       QWORD PTR [-48+rsp], rbx                          # [-48+rsp] <- rbx | ||||||
| # r12 <- neighbor->maxneighs * 4 |  | ||||||
|         shl       r12, 2 |  | ||||||
| # [-32+rsp] <- atom->Nlocal |  | ||||||
|         mov       QWORD PTR [-32+rsp], r9 |  | ||||||
| # [-24+rsp] <- neighbor->numneigh |  | ||||||
|         mov       QWORD PTR [-24+rsp], rcx |  | ||||||
| # [-16+rsp] <- atom->fy |  | ||||||
|         mov       QWORD PTR [-16+rsp], r14 |  | ||||||
| # [-8+rsp] <- atom->fx |  | ||||||
|         mov       QWORD PTR [-8+rsp], r13 |  | ||||||
| # [-40+rsp] <- r15 |  | ||||||
|         mov       QWORD PTR [-40+rsp], r15 |  | ||||||
| # [-48+rsp] <- rbx |  | ||||||
|         mov       QWORD PTR [-48+rsp], rbx |  | ||||||
| # zmm19 <- 0 |  | ||||||
|         vpxord    zmm19, zmm19, zmm19 |  | ||||||
|  |  | ||||||
| # Loop over all atoms | # Loop over all atoms | ||||||
| ..B1.9: | ..B1.9: | ||||||
|  |         mov       rcx, QWORD PTR [-24+rsp]                          # rcx <- neighbor->numneigh | ||||||
|  |         vxorpd    xmm25, xmm25, xmm25                               # xmm25 <- 0 | ||||||
|  |         vmovapd   xmm20, xmm25                                      # xmm20 <- 0 | ||||||
|  |         mov       r13d, DWORD PTR [rcx+r10*4]                       # r13d <- neighbor->numneigh[i] (numneighs) | ||||||
|  |         vmovapd   xmm4, xmm20                                       # xmm4 <- 0 | ||||||
|  |  | ||||||
| # rcx <- neighbor->numneigh |         ### AOS | ||||||
|         mov       rcx, QWORD PTR [-24+rsp] |         vmovsd    xmm8, QWORD PTR[rdx+rax]                          # xmm8 <- atom->x[i * 3] | ||||||
| # xmm25 <- 0 |         vmovsd    xmm9, QWORD PTR[8+rdx+rax]                        # xmm9 <- atom->x[i * 3 + 1] | ||||||
|         vxorpd    xmm25, xmm25, xmm25 |         vmovsd    xmm10, QWORD PTR[16+rdx+rax]                      # xmm10 <- atom->x[i * 3 + 2] | ||||||
| # xmm20 <- 0 |         ### SOA | ||||||
|         vmovapd   xmm20, xmm25 |         #vmovsd    xmm8, QWORD PTR [rdx+r10*8]                      # xmm8 <- atom->x[i] | ||||||
| # r13d <- neighbor->numneigh[i] (numneighs) |         #vmovsd    xmm9, QWORD PTR [rax+r10*8]                      # xmm9 <- atom->y[i] | ||||||
|         mov       r13d, DWORD PTR [rcx+r10*4] |         #vmovsd    xmm10, QWORD PTR [rsi+r10*8]                     # xmm10 <- atom->z[i] | ||||||
| # xmm4 <- 0 |         ### | ||||||
|         vmovapd   xmm4, xmm20 |  | ||||||
| # xmm8 <- atom->x[i] |         test      r13d, r13d                                        # numneighs <= 0 | ||||||
|         vmovsd    xmm8, QWORD PTR [rdx+r10*8] |         jle       ..exit_func | ||||||
| # xmm9 <- atom->y[i] |  | ||||||
|         vmovsd    xmm9, QWORD PTR [rax+r10*8] |  | ||||||
| # xmm10 <- atom->z[i] |  | ||||||
|         vmovsd    xmm10, QWORD PTR [rsi+r10*8] |  | ||||||
| # numneighs <= 0 |  | ||||||
|         test      r13d, r13d |  | ||||||
|         jle       ..B1.27 |  | ||||||
|  |  | ||||||
| ..B1.10: | ..B1.10: | ||||||
|  |         vpxord    zmm13, zmm13, zmm13                               # zmm13 <- 0 (fix) | ||||||
|  |         vmovaps   zmm12, zmm13                                      # zmm12 <- 0 (fiy) | ||||||
|  |         vmovaps   zmm11, zmm12                                      # zmm11 <- 0 (fiz) | ||||||
|  |  | ||||||
| # zmm13 <- 0 |         mov       r14d, r13d                                        # r14d <- numneighs | ||||||
|         vmovaps   zmm13, zmm19 |         xor       r11d, r11d                                        # r11d <- 0 | ||||||
| # zmm12 <- 0 |         and       r14d, -8                                          # r14d <- numneighs & (-8) | ||||||
|         vmovaps   zmm12, zmm13 |         lea       r9d, DWORD PTR [8+r11]                            # r9d <- r11 + 8 = 8 (r11d was just zeroed; lea computes the sum without touching flags) | ||||||
| # zmm11 <- 0 |         cmp       r14d, r9d                                         # r14d < r9d | ||||||
|         vmovaps   zmm11, zmm12 |         jl        ..B1.33 | ||||||
|  |  | ||||||
| # numneighs < 8 | #        cmp       r13d, 8                                           # numneighs < 8 | ||||||
|         cmp       r13d, 8 | #        jl        ..B1.32 | ||||||
|         jl        ..B1.32 | #..B1.11: | ||||||
|  | #        cmp       r13d, 1200                                        # numneighs < 1200 | ||||||
| ..B1.11: | #        jl        ..B1.31 | ||||||
|  | #..B1.12: | ||||||
| # numneighs < 1200 | #        mov       rcx, r12 | ||||||
|         cmp       r13d, 1200 | #        imul      rcx, r8 | ||||||
|         jl        ..B1.31 | #        add       rcx, r11                                          # rcx <- &neighbor->neighbors[neighbor->maxneighs * i (r8)] | ||||||
|  | #        mov       r9, rcx                                           # r9 <- neighs | ||||||
| ..B1.12: | #        and       r9, 63                                            # r9 <- neighs & 63 | ||||||
|  | #        test      r9d, 3                                            # (r9d & 3) == 0 => r9d divisible by 4 | ||||||
|         mov       rcx, r12 | #        je        ..B1.14 | ||||||
|         imul      rcx, r8 | #..B1.13: | ||||||
|         add       rcx, r11 | #        xor       r9d, r9d                                          # r9d <- 0 | ||||||
|         mov       r9, rcx | #        jmp       ..B1.16 | ||||||
|         and       r9, 63 | #..B1.14: | ||||||
|         test      r9d, 3 | #        test      r9d, r9d                                          # r9d == 0  | ||||||
|         je        ..B1.14 | #        je        ..B1.16 | ||||||
|  | #..B1.15: | ||||||
| ..B1.13: | #        neg       r9d | ||||||
|  | #        add       r9d, 64 | ||||||
|         xor       r9d, r9d | #        shr       r9d, 2                                            # r9d <- (64 - r9d) / 4 | ||||||
|         jmp       ..B1.16 | #        cmp       r13d, r9d                                         # numneighs < r9d | ||||||
|  | #        cmovl     r9d, r13d                                         # r9d <- MIN(numneighs, r9d) | ||||||
| ..B1.14: | #..B1.16: | ||||||
|  | #        mov       ebx, r13d | ||||||
|         test      r9d, r9d | #        sub       ebx, r9d | ||||||
|         je        ..B1.16 | #        and       ebx, 7 | ||||||
|  | #        neg       ebx | ||||||
| ..B1.15: | #        add       ebx, r13d                                         # ebx <- -((numneighs - r9d) & 7) + numneighs | ||||||
|  | #        cmp       r9d, 1                                            # r9d < 1 | ||||||
|         neg       r9d | #        jb        ..B1.20 | ||||||
|         add       r9d, 64 | #..B1.20: | ||||||
|         shr       r9d, 2 | #        lea       ecx, DWORD PTR [8+r9]                             # ecx <- r9d + 8 | ||||||
|         cmp       r13d, r9d | #        cmp       ebx, ecx                                          # -((numneighs - r9d) & 7) + numneighs < neighs | ||||||
|         cmovl     r9d, r13d | #        jl        ..B1.24 | ||||||
|  |  | ||||||
| ..B1.16: |  | ||||||
|  |  | ||||||
|         mov       ebx, r13d |  | ||||||
|         sub       ebx, r9d |  | ||||||
|         and       ebx, 7 |  | ||||||
|         neg       ebx |  | ||||||
|         add       ebx, r13d |  | ||||||
|         cmp       r9d, 1 |  | ||||||
|         jb        ..B1.20 |  | ||||||
|  |  | ||||||
| ..B1.20: |  | ||||||
|         lea       ecx, DWORD PTR [8+r9] |  | ||||||
|         cmp       ebx, ecx |  | ||||||
|         jl        ..B1.24 |  | ||||||
|  |  | ||||||
| ..B1.21: | ..B1.21: | ||||||
|         mov       rcx, r12 |         mov       rcx, r12 | ||||||
| @@ -246,17 +175,25 @@ push      r14 | |||||||
|         vmovdqu   ymm3, YMMWORD PTR [rcx+r14*4] |         vmovdqu   ymm3, YMMWORD PTR [rcx+r14*4] | ||||||
|         add       r14, 8 |         add       r14, 8 | ||||||
|         vpxord    zmm5, zmm5, zmm5 |         vpxord    zmm5, zmm5, zmm5 | ||||||
|         vpxord    zmm4, zmm4, zmm4 |  | ||||||
|         vpxord    zmm6, zmm6, zmm6 |         vpxord    zmm6, zmm6, zmm6 | ||||||
|  |  | ||||||
|         vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] |         ### AOS | ||||||
|  |         vpaddd     ymm4, ymm3, ymm3 | ||||||
|  |         vpaddd     ymm3, ymm3, ymm4 | ||||||
|  |         vpxord     zmm4, zmm4, zmm4 | ||||||
|         vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] |         vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] | ||||||
|         vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] |         vgatherdpd zmm5{k2}, QWORD PTR [8+rdx+ymm3*8] | ||||||
|  |         vgatherdpd zmm6{k3}, QWORD PTR [16+rdx+ymm3*8] | ||||||
|  |         ### SOA | ||||||
|  |         #vpxord     zmm4, zmm4, zmm4 | ||||||
|  |         #vgatherdpd zmm5{k2}, QWORD PTR [rax+ymm3*8] | ||||||
|  |         #vgatherdpd zmm4{k1}, QWORD PTR [rdx+ymm3*8] | ||||||
|  |         #vgatherdpd zmm6{k3}, QWORD PTR [rsi+ymm3*8] | ||||||
|  |         ### | ||||||
|  |  | ||||||
|         vsubpd    zmm29, zmm1, zmm5 |         vsubpd    zmm29, zmm1, zmm5 | ||||||
|         vsubpd    zmm28, zmm0, zmm4 |         vsubpd    zmm28, zmm0, zmm4 | ||||||
|         vsubpd    zmm31, zmm2, zmm6 |         vsubpd    zmm31, zmm2, zmm6 | ||||||
|  |  | ||||||
|         vmulpd    zmm20, zmm29, zmm29 |         vmulpd    zmm20, zmm29, zmm29 | ||||||
|         vfmadd231pd zmm20, zmm28, zmm28 |         vfmadd231pd zmm20, zmm28, zmm28 | ||||||
|         vfmadd231pd zmm20, zmm31, zmm31 |         vfmadd231pd zmm20, zmm31, zmm31 | ||||||
| @@ -264,17 +201,13 @@ push      r14 | |||||||
| # if condition cutoff radius | # if condition cutoff radius | ||||||
|         vrcp14pd  zmm27, zmm20 #-> sr2 |         vrcp14pd  zmm27, zmm20 #-> sr2 | ||||||
|         vcmppd    k5, zmm20, zmm16, 1 |         vcmppd    k5, zmm20, zmm16, 1 | ||||||
|  |         vmulpd    zmm22, zmm27, zmm15                                   # zmm22 <-  sr2 * sigma6 | ||||||
|         vmulpd    zmm22, zmm27, zmm15 #-> sr2 * sigma6 |         vmulpd    zmm24, zmm27, zmm14                                   # zmm24 <- 48.0 * epsilon * sr2 | ||||||
|         vmulpd    zmm24, zmm27, zmm14 #-> 48.0 * epsilon * sr2 |         vmulpd    zmm25, zmm27, zmm22                                   # zmm25 <- sr2 * sigma6 * sr2 | ||||||
|  |         vmulpd    zmm23, zmm27, zmm25                                   # zmm23 <- sr2 * sigma6 * sr2 * sr2 | ||||||
|         vmulpd    zmm25, zmm27, zmm22 #-> sr2 * sigma6 * sr2 |         vfmsub213pd zmm27, zmm25, zmm7                                  # zmm27 <- sr2 * sigma6 * sr2 * sr2 - 0.5 | ||||||
|         vmulpd    zmm23, zmm27, zmm25 #-> sr2 * sigma6 * sr2 * sr2 |         vmulpd    zmm26, zmm23, zmm24                                   # zmm26 <- (48.0 * epsilon * sr2) * (sr2 * sigma6 * sr2 * sr2) | ||||||
|  |         vmulpd    zmm30, zmm26, zmm27                                   # zmm30 <- force | ||||||
|         vfmsub213pd zmm27, zmm25, zmm7 #-> sr2 * sigma * sr2 * sr2 - 0.5 |  | ||||||
|         vmulpd    zmm26, zmm23, zmm24 #-> 48.0 * epsilon * sr2 * sr2 * sigma6 * sr2 |  | ||||||
|         vmulpd    zmm30, zmm26, zmm27 #-> |  | ||||||
|  |  | ||||||
|         vfmadd231pd zmm13{k5}, zmm30, zmm28 |         vfmadd231pd zmm13{k5}, zmm30, zmm28 | ||||||
|         vfmadd231pd zmm12{k5}, zmm30, zmm29 |         vfmadd231pd zmm12{k5}, zmm30, zmm29 | ||||||
|         vfmadd231pd zmm11{k5}, zmm30, zmm31 |         vfmadd231pd zmm11{k5}, zmm30, zmm31 | ||||||
| @@ -304,9 +237,14 @@ push      r14 | |||||||
|         vaddpd    zmm25, zmm23, zmm24 |         vaddpd    zmm25, zmm23, zmm24 | ||||||
|  |  | ||||||
| #exit function | #exit function | ||||||
| ..B1.27: | ..exit_func: | ||||||
|         mov       rcx, QWORD PTR [-8+rsp]                       #84.9[spill] |         mov       rcx, QWORD PTR [-8+rsp]                       #84.9[spill] | ||||||
|         mov       rbx, QWORD PTR [-16+rsp]                      #85.9[spill] |         mov       rbx, QWORD PTR [-16+rsp]                      #85.9[spill] | ||||||
|  |  | ||||||
|  |         ### AOS | ||||||
|  |         add       rax, 24 | ||||||
|  |         ### | ||||||
|  |  | ||||||
|         movsxd    r8, r10d                                      #55.32 |         movsxd    r8, r10d                                      #55.32 | ||||||
|         inc       r8                                            #55.32 |         inc       r8                                            #55.32 | ||||||
|         vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            #84.9 |         vaddsd    xmm0, xmm25, QWORD PTR [rcx+r10*8]            #84.9 | ||||||
| @@ -318,8 +256,6 @@ push      r14 | |||||||
|         inc       r10                                           #55.5 |         inc       r10                                           #55.5 | ||||||
|         cmp       r10, QWORD PTR [-32+rsp]                      #55.5[spill] |         cmp       r10, QWORD PTR [-32+rsp]                      #55.5[spill] | ||||||
|         jb        ..B1.9 |         jb        ..B1.9 | ||||||
|  |  | ||||||
|  |  | ||||||
|         vzeroupper                                              #93.12 |         vzeroupper                                              #93.12 | ||||||
|         vxorpd    xmm0, xmm0, xmm0                              #93.12 |         vxorpd    xmm0, xmm0, xmm0                              #93.12 | ||||||
|         pop       r14                                           #93.12 |         pop       r14                                           #93.12 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user