From 15de65303effc9ef6d0c56e3840f2a28aefad1ef Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Wed, 5 May 2021 03:04:41 +0200 Subject: [PATCH] Add version iterating most internal loop multiple times Signed-off-by: Rafael Ravedutti --- Makefile | 4 + arch_analysis/osaca_force_soa_lt8_iln1000.txt | 70 +++++++++ asm/force_aos_lt8_markers.s | 2 + scripts/results_soa_casclakesp2_iln1000.txt | 142 ++++++++++++++++++ src/force.c | 48 +++--- src/main-stub.c | 10 +- src/main.c | 6 +- 7 files changed, 247 insertions(+), 35 deletions(-) create mode 100644 arch_analysis/osaca_force_soa_lt8_iln1000.txt create mode 100644 scripts/results_soa_casclakesp2_iln1000.txt diff --git a/Makefile b/Makefile index 2e05f9a..e15e330 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,10 @@ else DEFINES += -DPRECISION=2 endif +ifneq ($(INTERNAL_LOOP_NTIMES),) + DEFINES += -DINTERNAL_LOOP_NTIMES=$(INTERNAL_LOOP_NTIMES) +endif + VPATH = $(SRC_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OBJ = $(filter-out $(BUILD_DIR)/main%,$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))) diff --git a/arch_analysis/osaca_force_soa_lt8_iln1000.txt b/arch_analysis/osaca_force_soa_lt8_iln1000.txt new file mode 100644 index 0000000..8ff88d5 --- /dev/null +++ b/arch_analysis/osaca_force_soa_lt8_iln1000.txt @@ -0,0 +1,70 @@ +iwia021h@testfront1:~/MD-Bench$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX ICC/force.s +Open Source Architecture Code Analyzer (OSACA) - 0.3.14 +Analyzed file: ICC/force.s +Architecture: CSX +Timestamp: 2021-04-30 16:08:44 + + + P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction + * - Instruction micro-ops not bound to a port + X - No throughput/latency information for this instruction in data file + + +Combined Analysis Report +------------------------ + Port pressure in cycles + | 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD | 
+------------------------------------------------------------------------------------------------- + 306 | | | | | | | | || | | # LOE rbp rdi r8 r9 r10 edx ecx r11d r12d r13d r14d r15d ymm13 ymm14 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm15 + 307 | | | | | | | | || | | ..B1.29: # Preds ..B1.28 + 308 | | | | | | | | || | | # Execution count [2.50e+04] + 309 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | movl %r14d, %eax #64.13 + 310 | | | | | | | | || | | X subl %ecx, %eax #64.13 + 311 | | | | | | | | || | | X vpbroadcastd %eax, %ymm0 #64.13 + 312 | | | | | | 1.00 | | || | | vpcmpgtd %ymm14, %ymm0, %k5 #64.13 + 313 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ecx, %rcx #64.13 + 314 | | | | | | | | || | | * vmovaps %zmm15, %zmm17 #67.40 + 315 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #67.40 + 316 | | | | | | | | || | | * vmovaps %zmm15, %zmm16 #66.40 + 317 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rdi,%rcx,4), %ymm1{%k5}{z} #65.25 + 318 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #66.40 + 319 | | | | | | | | || | | * vmovaps %zmm15, %zmm18 #68.40 + 320 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #68.40 + 321 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%r8,%ymm1,8), %zmm18{%k3} #68.40 + 322 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%r9,%ymm1,8), %zmm17{%k2} #67.40 + 323 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || | | vgatherdpd (%r10,%ymm1,8), %zmm16{%k1} #66.40 + 324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm3, %zmm31 #68.40 + 325 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm2, %zmm29 #67.40 + 326 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm16, %zmm4, %zmm28 #66.40 + 327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm27 #69.53 + 328 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm27 #69.53 + 329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm27 #69.67 + 
330 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm27, %zmm26 #72.42 + 331 | | | | | | 1.00 | | || | | vcmppd $1, %zmm12, %zmm27, %k6{%k5} #71.26 + 332 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm26, %k0 #72.42 + 333 | | | | | | | | || | | * vmovaps %zmm27, %zmm19 #72.42 + 334 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #72.42 + 335 | 1.00 | | | | | | | || | | knotw %k0, %k4 #72.42 + 336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm19, %zmm20 #72.42 + 337 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #72.42 + 338 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #72.42 + 339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm11, %zmm26, %zmm21 #73.42 + 340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm9, %zmm26, %zmm23 #74.58 + 341 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm26, %zmm24 #73.48 + 342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm26, %zmm22 #73.54 + 343 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm10, %zmm24, %zmm26 #74.58 + 344 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm23, %zmm22, %zmm25 #74.65 + 345 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm26, %zmm25, %zmm30 #74.71 + 346 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21 + 347 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21 + 348 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21 + + 18.0 3.00 13.0 2.50 13.0 2.50 18.0 3.00 68.0 4 + + +Loop-Carried Dependencies Analysis Report +----------------------------------------- + 313 | 1.0 | movslq %ecx, %rcx #64.13| [313] + 348 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21| [348] + 347 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21| [347] + 346 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21| [346] diff --git a/asm/force_aos_lt8_markers.s 
b/asm/force_aos_lt8_markers.s index f9ef002..1c8e3ef 100644 --- a/asm/force_aos_lt8_markers.s +++ b/asm/force_aos_lt8_markers.s @@ -355,6 +355,7 @@ computeForce: ja ..B1.38 # Prob 50% #67.9 movl $111,%ebx #IACA/OSACA START MARKER .byte 100,103,144 #IACA/OSACA START MARKER + # LLVM-MCA-BEGIN # LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 ..B1.33: # Preds ..B1.32 # Execution count [2.50e+01] @@ -415,6 +416,7 @@ computeForce: vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17 movl $222,%ebx #IACA/OSACA END MARKER .byte 100,103,144 #IACA/OSACA END MARKER + # LLVM-MCA-END # LOE rax rdx rbp rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14 ..B1.38: # Preds ..B1.23 ..B1.37 ..B1.32 # Execution count [4.50e+00] diff --git a/scripts/results_soa_casclakesp2_iln1000.txt b/scripts/results_soa_casclakesp2_iln1000.txt new file mode 100644 index 0000000..303232b --- /dev/null +++ b/scripts/results_soa_casclakesp2_iln1000.txt @@ -0,0 +1,142 @@ +200,4x4x2,8,256,21.5040,6.1440,9.2160,0.7405,69.1424,34.7110,4.9587 +200,4x4x4,8,512,43.0080,12.2880,18.4320,1.4818,69.1066,34.7290,4.9613 +200,4x4x8,8,1024,86.0160,24.5760,36.8640,2.9627,69.1261,34.7191,4.9599 +200,4x4x16,8,2048,172.0320,49.1520,73.7280,5.9244,69.1378,34.7133,4.9590 +200,4x8x2,8,512,43.0080,12.2880,18.4320,1.4813,69.1298,34.7173,4.9596 +200,4x8x4,8,1024,86.0160,24.5760,36.8640,2.9633,69.1126,34.7259,4.9608 +200,4x8x8,8,2048,172.0320,49.1520,73.7280,5.9254,69.1265,34.7189,4.9598 +200,4x8x16,8,4096,344.0640,98.3040,147.4560,11.8621,69.0600,34.7524,4.9646 +200,4x16x2,8,1024,86.0160,24.5760,36.8640,2.9705,68.9438,34.8110,4.9730 +200,4x16x4,8,2048,172.0320,49.1520,73.7280,5.9338,69.0288,34.7681,4.9669 +200,4x16x8,8,4096,344.0640,98.3040,147.4560,11.8492,69.1354,34.7145,4.9592 +200,4x16x16,8,8192,688.1280,196.6080,294.9120,23.7239,69.0612,34.7518,4.9645 +200,8x4x2,8,512,43.0080,12.2880,18.4320,1.4814,69.1239,34.7203,4.9600 
+200,8x4x4,8,1024,86.0160,24.5760,36.8640,2.9622,69.1383,34.7130,4.9590 +200,8x4x8,8,2048,172.0320,49.1520,73.7280,5.9256,69.1234,34.7205,4.9601 +200,8x4x16,8,4096,344.0640,98.3040,147.4560,11.8548,69.1026,34.7310,4.9616 +200,8x8x2,8,1024,86.0160,24.5760,36.8640,2.9625,69.1319,34.7162,4.9595 +200,8x8x4,8,2048,172.0320,49.1520,73.7280,5.9247,69.1339,34.7152,4.9593 +200,8x8x8,8,4096,344.0640,98.3040,147.4560,11.8484,69.1400,34.7122,4.9589 +200,8x8x16,8,8192,688.1280,196.6080,294.9120,23.7201,69.0721,34.7463,4.9638 +200,8x16x2,8,2048,172.0320,49.1520,73.7280,5.9244,69.1376,34.7134,4.9591 +200,8x16x4,8,4096,344.0640,98.3040,147.4560,11.8496,69.1331,34.7156,4.9594 +200,8x16x8,8,8192,688.1280,196.6080,294.9120,23.7200,69.0724,34.7461,4.9637 +200,8x16x16,8,16384,1376.2560,393.2160,589.8240,47.4404,-21.4620,-111.8254,-15.9751 +200,16x4x2,8,1024,86.0160,24.5760,36.8640,2.9624,69.1338,34.7153,4.9593 +200,16x4x4,8,2048,172.0320,49.1520,73.7280,5.9251,69.1300,34.7172,4.9596 +200,16x4x8,8,4096,344.0640,98.3040,147.4560,11.8495,69.1335,34.7155,4.9594 +200,16x4x16,8,8192,688.1280,196.6080,294.9120,23.7169,69.0814,34.7416,4.9631 +200,16x8x2,8,2048,172.0320,49.1520,73.7280,5.9246,69.1360,34.7142,4.9592 +200,16x8x4,8,4096,344.0640,98.3040,147.4560,11.8498,69.1319,34.7163,4.9595 +200,16x8x8,8,8192,688.1280,196.6080,294.9120,23.7234,69.0625,34.7511,4.9644 +200,16x8x16,8,16384,1376.2560,393.2160,589.8240,47.4499,-21.4578,-111.8477,-15.9782 +200,16x16x2,8,4096,344.0640,98.3040,147.4560,11.8552,69.1005,34.7320,4.9617 +200,16x16x4,8,8192,688.1280,196.6080,294.9120,23.7215,69.0682,34.7483,4.9640 +200,4x4x2,8,256,21.5040,6.1440,9.2160,0.7406,69.1332,34.7156,4.9594 +200,4x4x4,8,512,43.0080,12.2880,18.4320,1.4813,69.1306,34.7169,4.9596 +200,4x4x8,8,1024,86.0160,24.5760,36.8640,2.9622,69.1383,34.7130,4.9590 +200,4x4x16,8,2048,172.0320,49.1520,73.7280,5.9253,69.1267,34.7188,4.9598 +200,4x8x2,8,512,43.0080,12.2880,18.4320,1.4813,69.1298,34.7173,4.9596 
+200,4x8x4,8,1024,86.0160,24.5760,36.8640,2.9625,69.1308,34.7168,4.9595 +200,4x8x8,8,2048,172.0320,49.1520,73.7280,5.9247,69.1340,34.7152,4.9593 +200,4x8x16,8,4096,344.0640,98.3040,147.4560,11.8482,69.1412,34.7116,4.9588 +200,4x16x2,8,1024,86.0160,24.5760,36.8640,2.9625,69.1310,34.7167,4.9595 +200,4x16x4,8,2048,172.0320,49.1520,73.7280,5.9254,69.1263,34.7191,4.9599 +200,4x16x8,8,4096,344.0640,98.3040,147.4560,11.8488,69.1375,34.7134,4.9591 +200,4x16x16,8,8192,688.1280,196.6080,294.9120,23.7265,69.0536,34.7556,4.9651 +200,8x4x2,8,512,43.0080,12.2880,18.4320,1.4814,69.1244,34.7200,4.9600 +200,8x4x4,8,1024,86.0160,24.5760,36.8640,2.9622,69.1375,34.7134,4.9591 +200,8x4x8,8,2048,172.0320,49.1520,73.7280,5.9251,69.1301,34.7172,4.9596 +200,8x4x16,8,4096,344.0640,98.3040,147.4560,11.8497,69.1326,34.7159,4.9594 +200,8x8x2,8,1024,86.0160,24.5760,36.8640,2.9623,69.1364,34.7140,4.9591 +200,8x8x4,8,2048,172.0320,49.1520,73.7280,5.9250,69.1311,34.7166,4.9595 +200,8x8x8,8,4096,344.0640,98.3040,147.4560,11.8488,69.1378,34.7133,4.9590 +200,8x8x16,8,8192,688.1280,196.6080,294.9120,23.7217,69.0677,34.7485,4.9641 +200,8x16x2,8,2048,172.0320,49.1520,73.7280,5.9246,69.1355,34.7145,4.9592 +200,8x16x4,8,4096,344.0640,98.3040,147.4560,11.8491,69.1358,34.7143,4.9592 +200,8x16x8,8,8192,688.1280,196.6080,294.9120,23.7200,69.0725,34.7461,4.9637 +200,8x16x16,8,16384,1376.2560,393.2160,589.8240,47.4484,-21.4584,-111.8442,-15.9777 +200,16x4x2,8,1024,86.0160,24.5760,36.8640,2.9624,69.1343,34.7151,4.9593 +200,16x4x4,8,2048,172.0320,49.1520,73.7280,5.9239,69.1432,34.7106,4.9587 +200,16x4x8,8,4096,344.0640,98.3040,147.4560,11.8490,69.1367,34.7138,4.9591 +200,16x4x16,8,8192,688.1280,196.6080,294.9120,23.7192,69.0749,34.7449,4.9636 +200,16x8x2,8,2048,172.0320,49.1520,73.7280,5.9251,69.1301,34.7171,4.9596 +200,16x8x4,8,4096,344.0640,98.3040,147.4560,11.8496,69.1329,34.7157,4.9594 +200,16x8x8,8,8192,688.1280,196.6080,294.9120,23.7224,69.0655,34.7496,4.9642 
+200,16x8x16,8,16384,1376.2560,393.2160,589.8240,47.4415,-21.4615,-111.8280,-15.9754 +200,16x16x2,8,4096,344.0640,98.3040,147.4560,11.8502,69.1297,34.7173,4.9596 +200,16x16x4,8,8192,688.1280,196.6080,294.9120,23.7177,69.0792,34.7428,4.9633 +200,16x16x8,8,16384,1376.2560,393.2160,589.8240,47.4485,-21.4584,-111.8445,-15.9778 +200,16x16x16,8,32768,2752.5120,786.4320,1179.6480,94.8632,-21.4660,-111.8047,-15.9721 +200,4x4x2,16,512,59.3920,12.2880,34.8160,2.8749,35.6181,67.3814,4.4921 +200,4x4x4,16,1024,118.7840,24.5760,69.6320,5.7553,35.5849,67.4444,4.4963 +200,4x4x8,16,2048,237.5680,49.1520,139.2640,11.5277,35.5317,67.5452,4.5030 +200,4x4x16,16,4096,475.1360,98.3040,278.5280,23.0679,35.5125,67.5818,4.5055 +200,4x8x2,16,1024,118.7840,24.5760,69.6320,5.7516,35.6076,67.4014,4.4934 +200,4x8x4,16,2048,237.5680,49.1520,139.2640,11.5048,35.6026,67.4108,4.4941 +200,4x8x8,16,4096,475.1360,98.3040,278.5280,23.0343,35.5643,67.4834,4.4989 +200,4x8x16,16,8192,950.2720,196.6080,557.0560,46.1992,35.4638,67.6746,4.5116 +200,4x16x2,16,2048,237.5680,49.1520,139.2640,11.5339,35.5126,67.5817,4.5054 +200,4x16x4,16,4096,475.1360,98.3040,278.5280,23.0366,35.5607,67.4902,4.4993 +200,4x16x8,16,8192,950.2720,196.6080,557.0560,46.1666,35.4888,67.6269,4.5085 +200,4x16x16,16,16384,1900.5440,393.2160,1114.1120,92.3461,-11.0256,-217.6759,-14.5117 +200,8x4x2,16,1024,118.7840,24.5760,69.6320,5.7504,35.6148,67.3877,4.4925 +200,8x4x4,16,2048,237.5680,49.1520,139.2640,11.5363,35.5054,67.5954,4.5064 +200,8x4x8,16,4096,475.1360,98.3040,278.5280,23.0449,35.5480,67.5143,4.5010 +200,8x4x16,16,8192,950.2720,196.6080,557.0560,46.2153,35.4515,67.6981,4.5132 +200,8x8x2,16,2048,237.5680,49.1520,139.2640,11.5019,35.6114,67.3942,4.4929 +200,8x8x4,16,4096,475.1360,98.3040,278.5280,23.0126,35.5980,67.4196,4.4946 +200,8x8x8,16,8192,950.2720,196.6080,557.0560,46.1823,35.4768,67.6499,4.5100 +200,8x8x16,16,16384,1900.5440,393.2160,1114.1120,92.3762,-11.0220,-217.7469,-14.5165 
+200,8x16x2,16,4096,475.1360,98.3040,278.5280,23.0367,35.5606,67.4904,4.4994 +200,8x16x4,16,8192,950.2720,196.6080,557.0560,46.1673,35.4883,67.6279,4.5085 +200,8x16x8,16,16384,1900.5440,393.2160,1114.1120,92.4053,-11.0185,-217.8156,-14.5210 +200,8x16x16,16,32768,3801.0880,786.4320,2228.2240,184.6536,-11.0279,-217.6306,-14.5087 +200,16x4x2,16,2048,237.5680,49.1520,139.2640,11.4984,35.6223,67.3736,4.4916 +200,16x4x4,16,4096,475.1360,98.3040,278.5280,23.0553,35.5319,67.5449,4.5030 +200,16x4x8,16,8192,950.2720,196.6080,557.0560,46.1639,35.4909,67.6229,4.5082 +200,16x4x16,16,16384,1900.5440,393.2160,1114.1120,92.3485,-11.0253,-217.6818,-14.5121 +200,16x8x2,16,4096,475.1360,98.3040,278.5280,23.0221,35.5832,67.4475,4.4965 +200,16x8x4,16,8192,950.2720,196.6080,557.0560,46.1950,35.4671,67.6684,4.5112 +200,16x8x8,16,16384,1900.5440,393.2160,1114.1120,92.3825,-11.0212,-217.7618,-14.5175 +200,16x8x16,16,32768,3801.0880,786.4320,2228.2240,184.4897,-11.0377,-217.4373,-14.4958 +200,16x16x2,16,8192,950.2720,196.6080,557.0560,46.1887,35.4719,67.6592,4.5106 +200,16x16x4,16,16384,1900.5440,393.2160,1114.1120,92.4000,-11.0191,-217.8031,-14.5202 +200,16x16x8,16,32768,3801.0880,786.4320,2228.2240,184.6561,-11.0277,-217.6335,-14.5089 +200,16x16x16,16,65536,7602.1760,1572.8640,4456.4480,369.2723,0.6020,3986.7795,265.7853 +200,4x4x2,32,1024,184.3200,24.5760,135.1680,11.4223,17.9299,133.8547,4.3179 +200,4x4x4,32,2048,368.6400,49.1520,270.3360,22.8232,17.9466,133.7300,4.3139 +200,4x4x8,32,4096,737.2800,98.3040,540.6720,45.6452,17.9471,133.7263,4.3138 +200,4x4x16,32,8192,1474.5600,196.6080,1081.3440,91.3233,17.9407,133.7744,4.3153 +200,4x8x2,32,2048,368.6400,49.1520,270.3360,22.8306,17.9408,133.7732,4.3153 +200,4x8x4,32,4096,737.2800,98.3040,540.6720,45.6485,17.9458,133.7357,4.3141 +200,4x8x8,32,8192,1474.5600,196.6080,1081.3440,91.3158,17.9421,133.7634,4.3149 +200,4x8x16,32,16384,2949.1200,393.2160,2162.6880,182.6220,-5.5753,-430.4722,-13.8862 
+200,4x16x2,32,4096,737.2800,98.3040,540.6720,45.6489,17.9457,133.7371,4.3141 +200,4x16x4,32,8192,1474.5600,196.6080,1081.3440,91.2890,17.9474,133.7241,4.3137 +200,4x16x8,32,16384,2949.1200,393.2160,2162.6880,182.6145,-5.5755,-430.4547,-13.8856 +200,4x16x16,32,32768,5898.2400,786.4320,4325.3760,365.2677,-5.5749,-430.5002,-13.8871 +200,8x4x2,32,2048,368.6400,49.1520,270.3360,22.8301,17.9412,133.7701,4.3152 +200,8x4x4,32,4096,737.2800,98.3040,540.6720,45.6689,17.9378,133.7956,4.3160 +200,8x4x8,32,8192,1474.5600,196.6080,1081.3440,91.3143,17.9424,133.7611,4.3149 +200,8x4x16,32,16384,2949.1200,393.2160,2162.6880,182.6332,-5.5749,-430.4987,-13.8871 +200,8x8x2,32,4096,737.2800,98.3040,540.6720,45.6519,17.9445,133.7457,4.3144 +200,8x8x4,32,8192,1474.5600,196.6080,1081.3440,91.3215,17.9410,133.7717,4.3152 +200,8x8x8,32,16384,2949.1200,393.2160,2162.6880,182.6451,-5.5746,-430.5267,-13.8880 +200,8x8x16,32,32768,5898.2400,786.4320,4325.3760,365.1989,-5.5760,-430.4192,-13.8845 +200,8x16x2,32,8192,1474.5600,196.6080,1081.3440,91.3015,17.9449,133.7425,4.3143 +200,8x16x4,32,16384,2949.1200,393.2160,2162.6880,182.6251,-5.5752,-430.4795,-13.8864 +200,8x16x8,32,32768,5898.2400,786.4320,4325.3760,365.2087,-5.5758,-430.4307,-13.8849 +200,8x16x16,32,65536,11796.4800,1572.8640,8650.7520,730.5781,0.3043,7887.5501,254.4371 +200,16x4x2,32,4096,737.2800,98.3040,540.6720,45.6592,17.9416,133.7671,4.3151 +200,16x4x4,32,8192,1474.5600,196.6080,1081.3440,91.3092,17.9434,133.7537,4.3146 +200,16x4x8,32,16384,2949.1200,393.2160,2162.6880,182.6471,-5.5745,-430.5315,-13.8881 +200,16x4x16,32,32768,5898.2400,786.4320,4325.3760,365.2706,-5.5749,-430.5036,-13.8872 +200,16x8x2,32,8192,1474.5600,196.6080,1081.3440,91.3464,17.9361,133.8082,4.3164 +200,16x8x4,32,16384,2949.1200,393.2160,2162.6880,182.6559,-5.5742,-430.5521,-13.8888 +200,16x8x8,32,32768,5898.2400,786.4320,4325.3760,365.2269,-5.5755,-430.4521,-13.8856 +200,16x8x16,32,65536,11796.4800,1572.8640,8650.7520,730.5707,0.3043,7887.4697,254.4345 
+200,16x16x2,32,16384,2949.1200,393.2160,2162.6880,182.6406,-5.5747,-430.5160,-13.8876 +200,16x16x4,32,32768,5898.2400,786.4320,4325.3760,365.2364,-5.5754,-430.4633,-13.8859 +200,16x16x8,32,65536,11796.4800,1572.8640,8650.7520,730.5352,0.3043,7887.0871,254.4222 +200,16x16x16,32,131072,23592.9600,3145.7280,17301.5040,1461.1927,0.3043,7887.7469,254.4434 diff --git a/src/force.c b/src/force.c index 380a31b..32b30f5 100644 --- a/src/force.c +++ b/src/force.c @@ -27,12 +27,12 @@ #include #include -double computeForce( - Parameter *param, - Atom *atom, - Neighbor *neighbor, - int profile) -{ +// Number of times to compute the most internal loop +#ifndef INTERNAL_LOOP_NTIMES +#define INTERNAL_LOOP_NTIMES 1 +#endif + +double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor) { int Nlocal = atom->Nlocal; int* neighs; MD_FLOAT cutforcesq = param->cutforce * param->cutforce; @@ -47,10 +47,6 @@ double computeForce( fz[i] = 0.0; } - if(profile) { - // LIKWID_MARKER_START("force"); - } - #pragma omp parallel for for(int i = 0; i < Nlocal; i++) { neighs = &neighbor->neighbors[i * neighbor->maxneighs]; @@ -64,20 +60,22 @@ double computeForce( // printf("%d: %d\n", i, numneighs); - for(int k = 0; k < numneighs; k++) { - int j = neighs[k]; - MD_FLOAT delx = xtmp - atom_x(j); - MD_FLOAT dely = ytmp - atom_y(j); - MD_FLOAT delz = ztmp - atom_z(j); - MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + for(int n = 0; n < INTERNAL_LOOP_NTIMES; n++) { + for(int k = 0; k < numneighs; k++) { + int j = neighs[k]; + MD_FLOAT delx = xtmp - atom_x(j); + MD_FLOAT dely = ytmp - atom_y(j); + MD_FLOAT delz = ztmp - atom_z(j); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; - if(rsq < cutforcesq) { - MD_FLOAT sr2 = 1.0 / rsq; - MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; - MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; - fix += delx * force; - fiy += dely * force; - fiz += delz * force; + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * 
sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + fix += delx * force; + fiy += dely * force; + fiz += delz * force; + } } } @@ -86,9 +84,5 @@ double computeForce( fz[i] += fiz; } - if(profile) { - // LIKWID_MARKER_STOP("force"); - } - return 0.0; } diff --git a/src/main-stub.c b/src/main-stub.c index a884ae0..4041c2c 100644 --- a/src/main-stub.c +++ b/src/main-stub.c @@ -16,7 +16,7 @@ #define LATTICE_DISTANCE 10.0 #define NEIGH_DISTANCE 1.0 -extern double computeForce( Parameter*, Atom*, Neighbor*, int); +extern double computeForce( Parameter*, Atom*, Neighbor*); void init(Parameter *param) { param->epsilon = 1.0; @@ -188,19 +188,19 @@ int main(int argc, const char *argv[]) { DEBUG("Building neighbor lists...\n"); buildNeighbor(atom, &neighbor); DEBUG("Computing forces...\n"); - computeForce(&param, atom, &neighbor, 0); + computeForce(&param, atom, &neighbor); double S, E; S = getTimeStamp(); LIKWID_MARKER_START("force"); for(int i = 0; i < param.ntimes; i++) { - computeForce(&param, atom, &neighbor, 1); + computeForce(&param, atom, &neighbor); } LIKWID_MARKER_STOP("force"); E = getTimeStamp(); double T_accum = E-S; - const double atoms_updates_per_sec = atom->Nlocal * param.ntimes / T_accum; - const double cycles_per_atom = T_accum * freq / (atom->Nlocal * param.ntimes); + const double atoms_updates_per_sec = ((double)atom->Nlocal * INTERNAL_LOOP_NTIMES * param.ntimes) / T_accum; /* multiply in double: the int product overflows for large atom counts */ + const double cycles_per_atom = T_accum * freq / ((double)atom->Nlocal * param.ntimes * INTERNAL_LOOP_NTIMES); /* same promotion, keeps the divisor positive */ const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); if(!csv) { diff --git a/src/main.c b/src/main.c index 564708e..6b09c8e 100644 --- a/src/main.c +++ b/src/main.c @@ -47,7 +47,7 @@ typedef enum { NUMTIMER } timertype; -extern double computeForce( Parameter*, Atom*, Neighbor*, int); +extern double computeForce( Parameter*, Atom*, Neighbor*); void init(Parameter *param) { @@ -205,7 +205,7 @@ int main (int argc, char** argv) 
setup(&param, &atom, &neighbor); computeThermo(0, &param, &atom); - computeForce(&param, &atom, &neighbor, 1); + computeForce(&param, &atom, &neighbor); timer[FORCE] = 0.0; timer[NEIGH] = 0.0; @@ -221,7 +221,7 @@ int main (int argc, char** argv) timer[NEIGH] += reneighbour(&param, &atom, &neighbor); } - timer[FORCE] += computeForce(&param, &atom, &neighbor, 1); + timer[FORCE] += computeForce(&param, &atom, &neighbor); finalIntegrate(&param, &atom); if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {