Add version that iterates the innermost loop multiple times

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
Rafael Ravedutti 2021-05-05 03:04:41 +02:00
parent faf1e2ae85
commit 15de65303e
7 changed files with 247 additions and 35 deletions

View File

@ -19,6 +19,10 @@ else
DEFINES += -DPRECISION=2
endif
# Forward INTERNAL_LOOP_NTIMES to the compiler as a preprocessor define,
# but only when the make variable is set to a non-empty value.
ifneq ($(INTERNAL_LOOP_NTIMES),)
DEFINES += -DINTERNAL_LOOP_NTIMES=$(INTERNAL_LOOP_NTIMES)
endif
VPATH = $(SRC_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OBJ = $(filter-out $(BUILD_DIR)/main%,$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))

View File

@ -0,0 +1,70 @@
iwia021h@testfront1:~/MD-Bench$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX ICC/force.s
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
Analyzed file: ICC/force.s
Architecture: CSX
Timestamp: 2021-04-30 16:08:44
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
* - Instruction micro-ops not bound to a port
X - No throughput/latency information for this instruction in data file
Combined Analysis Report
------------------------
Port pressure in cycles
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
-------------------------------------------------------------------------------------------------
306 | | | | | | | | || | | # LOE rbp rdi r8 r9 r10 edx ecx r11d r12d r13d r14d r15d ymm13 ymm14 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm15
307 | | | | | | | | || | | ..B1.29: # Preds ..B1.28
308 | | | | | | | | || | | # Execution count [2.50e+04]
309 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | movl %r14d, %eax #64.13
310 | | | | | | | | || | | X subl %ecx, %eax #64.13
311 | | | | | | | | || | | X vpbroadcastd %eax, %ymm0 #64.13
312 | | | | | | 1.00 | | || | | vpcmpgtd %ymm14, %ymm0, %k5 #64.13
313 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ecx, %rcx #64.13
314 | | | | | | | | || | | * vmovaps %zmm15, %zmm17 #67.40
315 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #67.40
316 | | | | | | | | || | | * vmovaps %zmm15, %zmm16 #66.40
317 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rdi,%rcx,4), %ymm1{%k5}{z} #65.25
318 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #66.40
319 | | | | | | | | || | | * vmovaps %zmm15, %zmm18 #68.40
320 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #68.40
321 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%r8,%ymm1,8), %zmm18{%k3} #68.40
322 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%r9,%ymm1,8), %zmm17{%k2} #67.40
323 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || | | vgatherdpd (%r10,%ymm1,8), %zmm16{%k1} #66.40
324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm3, %zmm31 #68.40
325 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm2, %zmm29 #67.40
326 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm16, %zmm4, %zmm28 #66.40
327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm27 #69.53
328 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm27 #69.53
329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm27 #69.67
330 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm27, %zmm26 #72.42
331 | | | | | | 1.00 | | || | | vcmppd $1, %zmm12, %zmm27, %k6{%k5} #71.26
332 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm26, %k0 #72.42
333 | | | | | | | | || | | * vmovaps %zmm27, %zmm19 #72.42
334 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #72.42
335 | 1.00 | | | | | | | || | | knotw %k0, %k4 #72.42
336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm19, %zmm20 #72.42
337 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #72.42
338 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #72.42
339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm11, %zmm26, %zmm21 #73.42
340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm9, %zmm26, %zmm23 #74.58
341 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm26, %zmm24 #73.48
342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm26, %zmm22 #73.54
343 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm10, %zmm24, %zmm26 #74.58
344 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm23, %zmm22, %zmm25 #74.65
345 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm26, %zmm25, %zmm30 #74.71
346 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21
347 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21
348 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21
18.0 3.00 13.0 2.50 13.0 2.50 18.0 3.00 68.0 4
Loop-Carried Dependencies Analysis Report
-----------------------------------------
313 | 1.0 | movslq %ecx, %rcx #64.13| [313]
348 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21| [348]
347 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21| [347]
346 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21| [346]

View File

@ -355,6 +355,7 @@ computeForce:
ja ..B1.38 # Prob 50% #67.9
movl $111,%ebx #IACA/OSACA START MARKER
.byte 100,103,144 #IACA/OSACA START MARKER
# LLVM-MCA-BEGIN
# LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
..B1.33: # Preds ..B1.32
# Execution count [2.50e+01]
@ -415,6 +416,7 @@ computeForce:
vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17
movl $222,%ebx #IACA/OSACA END MARKER
.byte 100,103,144 #IACA/OSACA END MARKER
# LLVM-MCA-END
# LOE rax rdx rbp rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
..B1.38: # Preds ..B1.23 ..B1.37 ..B1.32
# Execution count [4.50e+00]

View File

@ -0,0 +1,142 @@
200,4x4x2,8,256,21.5040,6.1440,9.2160,0.7405,69.1424,34.7110,4.9587
200,4x4x4,8,512,43.0080,12.2880,18.4320,1.4818,69.1066,34.7290,4.9613
200,4x4x8,8,1024,86.0160,24.5760,36.8640,2.9627,69.1261,34.7191,4.9599
200,4x4x16,8,2048,172.0320,49.1520,73.7280,5.9244,69.1378,34.7133,4.9590
200,4x8x2,8,512,43.0080,12.2880,18.4320,1.4813,69.1298,34.7173,4.9596
200,4x8x4,8,1024,86.0160,24.5760,36.8640,2.9633,69.1126,34.7259,4.9608
200,4x8x8,8,2048,172.0320,49.1520,73.7280,5.9254,69.1265,34.7189,4.9598
200,4x8x16,8,4096,344.0640,98.3040,147.4560,11.8621,69.0600,34.7524,4.9646
200,4x16x2,8,1024,86.0160,24.5760,36.8640,2.9705,68.9438,34.8110,4.9730
200,4x16x4,8,2048,172.0320,49.1520,73.7280,5.9338,69.0288,34.7681,4.9669
200,4x16x8,8,4096,344.0640,98.3040,147.4560,11.8492,69.1354,34.7145,4.9592
200,4x16x16,8,8192,688.1280,196.6080,294.9120,23.7239,69.0612,34.7518,4.9645
200,8x4x2,8,512,43.0080,12.2880,18.4320,1.4814,69.1239,34.7203,4.9600
200,8x4x4,8,1024,86.0160,24.5760,36.8640,2.9622,69.1383,34.7130,4.9590
200,8x4x8,8,2048,172.0320,49.1520,73.7280,5.9256,69.1234,34.7205,4.9601
200,8x4x16,8,4096,344.0640,98.3040,147.4560,11.8548,69.1026,34.7310,4.9616
200,8x8x2,8,1024,86.0160,24.5760,36.8640,2.9625,69.1319,34.7162,4.9595
200,8x8x4,8,2048,172.0320,49.1520,73.7280,5.9247,69.1339,34.7152,4.9593
200,8x8x8,8,4096,344.0640,98.3040,147.4560,11.8484,69.1400,34.7122,4.9589
200,8x8x16,8,8192,688.1280,196.6080,294.9120,23.7201,69.0721,34.7463,4.9638
200,8x16x2,8,2048,172.0320,49.1520,73.7280,5.9244,69.1376,34.7134,4.9591
200,8x16x4,8,4096,344.0640,98.3040,147.4560,11.8496,69.1331,34.7156,4.9594
200,8x16x8,8,8192,688.1280,196.6080,294.9120,23.7200,69.0724,34.7461,4.9637
200,8x16x16,8,16384,1376.2560,393.2160,589.8240,47.4404,-21.4620,-111.8254,-15.9751
200,16x4x2,8,1024,86.0160,24.5760,36.8640,2.9624,69.1338,34.7153,4.9593
200,16x4x4,8,2048,172.0320,49.1520,73.7280,5.9251,69.1300,34.7172,4.9596
200,16x4x8,8,4096,344.0640,98.3040,147.4560,11.8495,69.1335,34.7155,4.9594
200,16x4x16,8,8192,688.1280,196.6080,294.9120,23.7169,69.0814,34.7416,4.9631
200,16x8x2,8,2048,172.0320,49.1520,73.7280,5.9246,69.1360,34.7142,4.9592
200,16x8x4,8,4096,344.0640,98.3040,147.4560,11.8498,69.1319,34.7163,4.9595
200,16x8x8,8,8192,688.1280,196.6080,294.9120,23.7234,69.0625,34.7511,4.9644
200,16x8x16,8,16384,1376.2560,393.2160,589.8240,47.4499,-21.4578,-111.8477,-15.9782
200,16x16x2,8,4096,344.0640,98.3040,147.4560,11.8552,69.1005,34.7320,4.9617
200,16x16x4,8,8192,688.1280,196.6080,294.9120,23.7215,69.0682,34.7483,4.9640
200,4x4x2,8,256,21.5040,6.1440,9.2160,0.7406,69.1332,34.7156,4.9594
200,4x4x4,8,512,43.0080,12.2880,18.4320,1.4813,69.1306,34.7169,4.9596
200,4x4x8,8,1024,86.0160,24.5760,36.8640,2.9622,69.1383,34.7130,4.9590
200,4x4x16,8,2048,172.0320,49.1520,73.7280,5.9253,69.1267,34.7188,4.9598
200,4x8x2,8,512,43.0080,12.2880,18.4320,1.4813,69.1298,34.7173,4.9596
200,4x8x4,8,1024,86.0160,24.5760,36.8640,2.9625,69.1308,34.7168,4.9595
200,4x8x8,8,2048,172.0320,49.1520,73.7280,5.9247,69.1340,34.7152,4.9593
200,4x8x16,8,4096,344.0640,98.3040,147.4560,11.8482,69.1412,34.7116,4.9588
200,4x16x2,8,1024,86.0160,24.5760,36.8640,2.9625,69.1310,34.7167,4.9595
200,4x16x4,8,2048,172.0320,49.1520,73.7280,5.9254,69.1263,34.7191,4.9599
200,4x16x8,8,4096,344.0640,98.3040,147.4560,11.8488,69.1375,34.7134,4.9591
200,4x16x16,8,8192,688.1280,196.6080,294.9120,23.7265,69.0536,34.7556,4.9651
200,8x4x2,8,512,43.0080,12.2880,18.4320,1.4814,69.1244,34.7200,4.9600
200,8x4x4,8,1024,86.0160,24.5760,36.8640,2.9622,69.1375,34.7134,4.9591
200,8x4x8,8,2048,172.0320,49.1520,73.7280,5.9251,69.1301,34.7172,4.9596
200,8x4x16,8,4096,344.0640,98.3040,147.4560,11.8497,69.1326,34.7159,4.9594
200,8x8x2,8,1024,86.0160,24.5760,36.8640,2.9623,69.1364,34.7140,4.9591
200,8x8x4,8,2048,172.0320,49.1520,73.7280,5.9250,69.1311,34.7166,4.9595
200,8x8x8,8,4096,344.0640,98.3040,147.4560,11.8488,69.1378,34.7133,4.9590
200,8x8x16,8,8192,688.1280,196.6080,294.9120,23.7217,69.0677,34.7485,4.9641
200,8x16x2,8,2048,172.0320,49.1520,73.7280,5.9246,69.1355,34.7145,4.9592
200,8x16x4,8,4096,344.0640,98.3040,147.4560,11.8491,69.1358,34.7143,4.9592
200,8x16x8,8,8192,688.1280,196.6080,294.9120,23.7200,69.0725,34.7461,4.9637
200,8x16x16,8,16384,1376.2560,393.2160,589.8240,47.4484,-21.4584,-111.8442,-15.9777
200,16x4x2,8,1024,86.0160,24.5760,36.8640,2.9624,69.1343,34.7151,4.9593
200,16x4x4,8,2048,172.0320,49.1520,73.7280,5.9239,69.1432,34.7106,4.9587
200,16x4x8,8,4096,344.0640,98.3040,147.4560,11.8490,69.1367,34.7138,4.9591
200,16x4x16,8,8192,688.1280,196.6080,294.9120,23.7192,69.0749,34.7449,4.9636
200,16x8x2,8,2048,172.0320,49.1520,73.7280,5.9251,69.1301,34.7171,4.9596
200,16x8x4,8,4096,344.0640,98.3040,147.4560,11.8496,69.1329,34.7157,4.9594
200,16x8x8,8,8192,688.1280,196.6080,294.9120,23.7224,69.0655,34.7496,4.9642
200,16x8x16,8,16384,1376.2560,393.2160,589.8240,47.4415,-21.4615,-111.8280,-15.9754
200,16x16x2,8,4096,344.0640,98.3040,147.4560,11.8502,69.1297,34.7173,4.9596
200,16x16x4,8,8192,688.1280,196.6080,294.9120,23.7177,69.0792,34.7428,4.9633
200,16x16x8,8,16384,1376.2560,393.2160,589.8240,47.4485,-21.4584,-111.8445,-15.9778
200,16x16x16,8,32768,2752.5120,786.4320,1179.6480,94.8632,-21.4660,-111.8047,-15.9721
200,4x4x2,16,512,59.3920,12.2880,34.8160,2.8749,35.6181,67.3814,4.4921
200,4x4x4,16,1024,118.7840,24.5760,69.6320,5.7553,35.5849,67.4444,4.4963
200,4x4x8,16,2048,237.5680,49.1520,139.2640,11.5277,35.5317,67.5452,4.5030
200,4x4x16,16,4096,475.1360,98.3040,278.5280,23.0679,35.5125,67.5818,4.5055
200,4x8x2,16,1024,118.7840,24.5760,69.6320,5.7516,35.6076,67.4014,4.4934
200,4x8x4,16,2048,237.5680,49.1520,139.2640,11.5048,35.6026,67.4108,4.4941
200,4x8x8,16,4096,475.1360,98.3040,278.5280,23.0343,35.5643,67.4834,4.4989
200,4x8x16,16,8192,950.2720,196.6080,557.0560,46.1992,35.4638,67.6746,4.5116
200,4x16x2,16,2048,237.5680,49.1520,139.2640,11.5339,35.5126,67.5817,4.5054
200,4x16x4,16,4096,475.1360,98.3040,278.5280,23.0366,35.5607,67.4902,4.4993
200,4x16x8,16,8192,950.2720,196.6080,557.0560,46.1666,35.4888,67.6269,4.5085
200,4x16x16,16,16384,1900.5440,393.2160,1114.1120,92.3461,-11.0256,-217.6759,-14.5117
200,8x4x2,16,1024,118.7840,24.5760,69.6320,5.7504,35.6148,67.3877,4.4925
200,8x4x4,16,2048,237.5680,49.1520,139.2640,11.5363,35.5054,67.5954,4.5064
200,8x4x8,16,4096,475.1360,98.3040,278.5280,23.0449,35.5480,67.5143,4.5010
200,8x4x16,16,8192,950.2720,196.6080,557.0560,46.2153,35.4515,67.6981,4.5132
200,8x8x2,16,2048,237.5680,49.1520,139.2640,11.5019,35.6114,67.3942,4.4929
200,8x8x4,16,4096,475.1360,98.3040,278.5280,23.0126,35.5980,67.4196,4.4946
200,8x8x8,16,8192,950.2720,196.6080,557.0560,46.1823,35.4768,67.6499,4.5100
200,8x8x16,16,16384,1900.5440,393.2160,1114.1120,92.3762,-11.0220,-217.7469,-14.5165
200,8x16x2,16,4096,475.1360,98.3040,278.5280,23.0367,35.5606,67.4904,4.4994
200,8x16x4,16,8192,950.2720,196.6080,557.0560,46.1673,35.4883,67.6279,4.5085
200,8x16x8,16,16384,1900.5440,393.2160,1114.1120,92.4053,-11.0185,-217.8156,-14.5210
200,8x16x16,16,32768,3801.0880,786.4320,2228.2240,184.6536,-11.0279,-217.6306,-14.5087
200,16x4x2,16,2048,237.5680,49.1520,139.2640,11.4984,35.6223,67.3736,4.4916
200,16x4x4,16,4096,475.1360,98.3040,278.5280,23.0553,35.5319,67.5449,4.5030
200,16x4x8,16,8192,950.2720,196.6080,557.0560,46.1639,35.4909,67.6229,4.5082
200,16x4x16,16,16384,1900.5440,393.2160,1114.1120,92.3485,-11.0253,-217.6818,-14.5121
200,16x8x2,16,4096,475.1360,98.3040,278.5280,23.0221,35.5832,67.4475,4.4965
200,16x8x4,16,8192,950.2720,196.6080,557.0560,46.1950,35.4671,67.6684,4.5112
200,16x8x8,16,16384,1900.5440,393.2160,1114.1120,92.3825,-11.0212,-217.7618,-14.5175
200,16x8x16,16,32768,3801.0880,786.4320,2228.2240,184.4897,-11.0377,-217.4373,-14.4958
200,16x16x2,16,8192,950.2720,196.6080,557.0560,46.1887,35.4719,67.6592,4.5106
200,16x16x4,16,16384,1900.5440,393.2160,1114.1120,92.4000,-11.0191,-217.8031,-14.5202
200,16x16x8,16,32768,3801.0880,786.4320,2228.2240,184.6561,-11.0277,-217.6335,-14.5089
200,16x16x16,16,65536,7602.1760,1572.8640,4456.4480,369.2723,0.6020,3986.7795,265.7853
200,4x4x2,32,1024,184.3200,24.5760,135.1680,11.4223,17.9299,133.8547,4.3179
200,4x4x4,32,2048,368.6400,49.1520,270.3360,22.8232,17.9466,133.7300,4.3139
200,4x4x8,32,4096,737.2800,98.3040,540.6720,45.6452,17.9471,133.7263,4.3138
200,4x4x16,32,8192,1474.5600,196.6080,1081.3440,91.3233,17.9407,133.7744,4.3153
200,4x8x2,32,2048,368.6400,49.1520,270.3360,22.8306,17.9408,133.7732,4.3153
200,4x8x4,32,4096,737.2800,98.3040,540.6720,45.6485,17.9458,133.7357,4.3141
200,4x8x8,32,8192,1474.5600,196.6080,1081.3440,91.3158,17.9421,133.7634,4.3149
200,4x8x16,32,16384,2949.1200,393.2160,2162.6880,182.6220,-5.5753,-430.4722,-13.8862
200,4x16x2,32,4096,737.2800,98.3040,540.6720,45.6489,17.9457,133.7371,4.3141
200,4x16x4,32,8192,1474.5600,196.6080,1081.3440,91.2890,17.9474,133.7241,4.3137
200,4x16x8,32,16384,2949.1200,393.2160,2162.6880,182.6145,-5.5755,-430.4547,-13.8856
200,4x16x16,32,32768,5898.2400,786.4320,4325.3760,365.2677,-5.5749,-430.5002,-13.8871
200,8x4x2,32,2048,368.6400,49.1520,270.3360,22.8301,17.9412,133.7701,4.3152
200,8x4x4,32,4096,737.2800,98.3040,540.6720,45.6689,17.9378,133.7956,4.3160
200,8x4x8,32,8192,1474.5600,196.6080,1081.3440,91.3143,17.9424,133.7611,4.3149
200,8x4x16,32,16384,2949.1200,393.2160,2162.6880,182.6332,-5.5749,-430.4987,-13.8871
200,8x8x2,32,4096,737.2800,98.3040,540.6720,45.6519,17.9445,133.7457,4.3144
200,8x8x4,32,8192,1474.5600,196.6080,1081.3440,91.3215,17.9410,133.7717,4.3152
200,8x8x8,32,16384,2949.1200,393.2160,2162.6880,182.6451,-5.5746,-430.5267,-13.8880
200,8x8x16,32,32768,5898.2400,786.4320,4325.3760,365.1989,-5.5760,-430.4192,-13.8845
200,8x16x2,32,8192,1474.5600,196.6080,1081.3440,91.3015,17.9449,133.7425,4.3143
200,8x16x4,32,16384,2949.1200,393.2160,2162.6880,182.6251,-5.5752,-430.4795,-13.8864
200,8x16x8,32,32768,5898.2400,786.4320,4325.3760,365.2087,-5.5758,-430.4307,-13.8849
200,8x16x16,32,65536,11796.4800,1572.8640,8650.7520,730.5781,0.3043,7887.5501,254.4371
200,16x4x2,32,4096,737.2800,98.3040,540.6720,45.6592,17.9416,133.7671,4.3151
200,16x4x4,32,8192,1474.5600,196.6080,1081.3440,91.3092,17.9434,133.7537,4.3146
200,16x4x8,32,16384,2949.1200,393.2160,2162.6880,182.6471,-5.5745,-430.5315,-13.8881
200,16x4x16,32,32768,5898.2400,786.4320,4325.3760,365.2706,-5.5749,-430.5036,-13.8872
200,16x8x2,32,8192,1474.5600,196.6080,1081.3440,91.3464,17.9361,133.8082,4.3164
200,16x8x4,32,16384,2949.1200,393.2160,2162.6880,182.6559,-5.5742,-430.5521,-13.8888
200,16x8x8,32,32768,5898.2400,786.4320,4325.3760,365.2269,-5.5755,-430.4521,-13.8856
200,16x8x16,32,65536,11796.4800,1572.8640,8650.7520,730.5707,0.3043,7887.4697,254.4345
200,16x16x2,32,16384,2949.1200,393.2160,2162.6880,182.6406,-5.5747,-430.5160,-13.8876
200,16x16x4,32,32768,5898.2400,786.4320,4325.3760,365.2364,-5.5754,-430.4633,-13.8859
200,16x16x8,32,65536,11796.4800,1572.8640,8650.7520,730.5352,0.3043,7887.0871,254.4222
200,16x16x16,32,131072,23592.9600,3145.7280,17301.5040,1461.1927,0.3043,7887.7469,254.4434

View File

@ -27,12 +27,12 @@
#include <parameter.h>
#include <atom.h>
double computeForce(
Parameter *param,
Atom *atom,
Neighbor *neighbor,
int profile)
{
// Repeat count for the innermost (neighbor) loop of computeForce.
// Falls back to 1 unless a value is supplied at compile time,
// e.g. via -DINTERNAL_LOOP_NTIMES=<n>.
#if !defined(INTERNAL_LOOP_NTIMES)
#define INTERNAL_LOOP_NTIMES 1
#endif
double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor) {
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
@ -47,10 +47,6 @@ double computeForce(
fz[i] = 0.0;
}
if(profile) {
// LIKWID_MARKER_START("force");
}
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
@ -64,20 +60,22 @@ double computeForce(
// printf("%d: %d\n", i, numneighs);
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
for(int n = 0; n < INTERNAL_LOOP_NTIMES; n++) {
for(int k = 0; k < numneighs; k++) {
int j = neighs[k];
MD_FLOAT delx = xtmp - atom_x(j);
MD_FLOAT dely = ytmp - atom_y(j);
MD_FLOAT delz = ztmp - atom_z(j);
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
}
}
}
@ -86,9 +84,5 @@ double computeForce(
fz[i] += fiz;
}
if(profile) {
// LIKWID_MARKER_STOP("force");
}
return 0.0;
}

View File

@ -16,7 +16,7 @@
#define LATTICE_DISTANCE 10.0
#define NEIGH_DISTANCE 1.0
extern double computeForce( Parameter*, Atom*, Neighbor*, int);
extern double computeForce( Parameter*, Atom*, Neighbor*);
void init(Parameter *param) {
param->epsilon = 1.0;
@ -188,19 +188,19 @@ int main(int argc, const char *argv[]) {
DEBUG("Building neighbor lists...\n");
buildNeighbor(atom, &neighbor);
DEBUG("Computing forces...\n");
computeForce(&param, atom, &neighbor, 0);
computeForce(&param, atom, &neighbor);
double S, E;
S = getTimeStamp();
LIKWID_MARKER_START("force");
for(int i = 0; i < param.ntimes; i++) {
computeForce(&param, atom, &neighbor, 1);
computeForce(&param, atom, &neighbor);
}
LIKWID_MARKER_STOP("force");
E = getTimeStamp();
double T_accum = E-S;
const double atoms_updates_per_sec = atom->Nlocal * param.ntimes / T_accum;
const double cycles_per_atom = T_accum * freq / (atom->Nlocal * param.ntimes);
const double atoms_updates_per_sec = (double)(atom->Nlocal * INTERNAL_LOOP_NTIMES * param.ntimes) / T_accum;
const double cycles_per_atom = T_accum * freq / (double)(atom->Nlocal * param.ntimes * INTERNAL_LOOP_NTIMES);
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
if(!csv) {

View File

@ -47,7 +47,7 @@ typedef enum {
NUMTIMER
} timertype;
extern double computeForce( Parameter*, Atom*, Neighbor*, int);
extern double computeForce( Parameter*, Atom*, Neighbor*);
void init(Parameter *param)
{
@ -205,7 +205,7 @@ int main (int argc, char** argv)
setup(&param, &atom, &neighbor);
computeThermo(0, &param, &atom);
computeForce(&param, &atom, &neighbor, 1);
computeForce(&param, &atom, &neighbor);
timer[FORCE] = 0.0;
timer[NEIGH] = 0.0;
@ -221,7 +221,7 @@ int main (int argc, char** argv)
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
}
timer[FORCE] += computeForce(&param, &atom, &neighbor, 1);
timer[FORCE] += computeForce(&param, &atom, &neighbor);
finalIntegrate(&param, &atom);
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {