Add version iterating most internal loop multiple times
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
faf1e2ae85
commit
15de65303e
4
Makefile
4
Makefile
@ -19,6 +19,10 @@ else
|
|||||||
DEFINES += -DPRECISION=2
|
DEFINES += -DPRECISION=2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifneq ($(INTERNAL_LOOP_NTIMES),)
|
||||||
|
DEFINES += -DINTERNAL_LOOP_NTIMES=$(INTERNAL_LOOP_NTIMES)
|
||||||
|
endif
|
||||||
|
|
||||||
VPATH = $(SRC_DIR)
|
VPATH = $(SRC_DIR)
|
||||||
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
||||||
OBJ = $(filter-out $(BUILD_DIR)/main%,$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
OBJ = $(filter-out $(BUILD_DIR)/main%,$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
||||||
|
70
arch_analysis/osaca_force_soa_lt8_iln1000.txt
Normal file
70
arch_analysis/osaca_force_soa_lt8_iln1000.txt
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
iwia021h@testfront1:~/MD-Bench$ /home/hpc/iwia/iwia021h/.local/bin/osaca --ignore-unknown --arch=CSX ICC/force.s
|
||||||
|
Open Source Architecture Code Analyzer (OSACA) - 0.3.14
|
||||||
|
Analyzed file: ICC/force.s
|
||||||
|
Architecture: CSX
|
||||||
|
Timestamp: 2021-04-30 16:08:44
|
||||||
|
|
||||||
|
|
||||||
|
P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
|
||||||
|
* - Instruction micro-ops not bound to a port
|
||||||
|
X - No throughput/latency information for this instruction in data file
|
||||||
|
|
||||||
|
|
||||||
|
Combined Analysis Report
|
||||||
|
------------------------
|
||||||
|
Port pressure in cycles
|
||||||
|
| 0 - 0DV | 1 | 2 - 2D | 3 - 3D | 4 | 5 | 6 | 7 || CP | LCD |
|
||||||
|
-------------------------------------------------------------------------------------------------
|
||||||
|
306 | | | | | | | | || | | # LOE rbp rdi r8 r9 r10 edx ecx r11d r12d r13d r14d r15d ymm13 ymm14 zmm2 zmm3 zmm4 zmm5 zmm6 zmm7 zmm8 zmm9 zmm10 zmm11 zmm12 zmm15
|
||||||
|
307 | | | | | | | | || | | ..B1.29: # Preds ..B1.28
|
||||||
|
308 | | | | | | | | || | | # Execution count [2.50e+04]
|
||||||
|
309 | 0.00 | 0.50 | | | | 0.00 | 0.50 | || | | movl %r14d, %eax #64.13
|
||||||
|
310 | | | | | | | | || | | X subl %ecx, %eax #64.13
|
||||||
|
311 | | | | | | | | || | | X vpbroadcastd %eax, %ymm0 #64.13
|
||||||
|
312 | | | | | | 1.00 | | || | | vpcmpgtd %ymm14, %ymm0, %k5 #64.13
|
||||||
|
313 | 0.00 | 0.16 | | | | 0.00 | 0.84 | || | | movslq %ecx, %rcx #64.13
|
||||||
|
314 | | | | | | | | || | | * vmovaps %zmm15, %zmm17 #67.40
|
||||||
|
315 | 1.00 | | | | | | | || | | kmovw %k5, %k2 #67.40
|
||||||
|
316 | | | | | | | | || | | * vmovaps %zmm15, %zmm16 #66.40
|
||||||
|
317 | 0.00 | 1.00 | 0.50 0.50 | 0.50 0.50 | | 0.00 | | || 4.0 | | vmovdqu32 (%rdi,%rcx,4), %ymm1{%k5}{z} #65.25
|
||||||
|
318 | 1.00 | | | | | | | || | | kmovw %k5, %k1 #66.40
|
||||||
|
319 | | | | | | | | || | | * vmovaps %zmm15, %zmm18 #68.40
|
||||||
|
320 | 1.00 | | | | | | | || | | kmovw %k5, %k3 #68.40
|
||||||
|
321 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || | | vgatherdpd (%r8,%ymm1,8), %zmm18{%k3} #68.40
|
||||||
|
322 | 1.50 | 0.50 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.50 | || 4.0 | | vgatherdpd (%r9,%ymm1,8), %zmm17{%k2} #67.40
|
||||||
|
323 | 1.50 | 0.34 | 4.00 0.50 | 4.00 0.50 | | 0.50 | 0.66 | || | | vgatherdpd (%r10,%ymm1,8), %zmm16{%k1} #66.40
|
||||||
|
324 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm18, %zmm3, %zmm31 #68.40
|
||||||
|
325 | 0.50 | | | | | 0.50 | | || 4.0 | | vsubpd %zmm17, %zmm2, %zmm29 #67.40
|
||||||
|
326 | 0.50 | | | | | 0.50 | | || | | vsubpd %zmm16, %zmm4, %zmm28 #66.40
|
||||||
|
327 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm29, %zmm29, %zmm27 #69.53
|
||||||
|
328 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm28, %zmm28, %zmm27 #69.53
|
||||||
|
329 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd231pd %zmm31, %zmm31, %zmm27 #69.67
|
||||||
|
330 | 2.50 | | | | | 0.50 | | || 8.0 | | vrcp14pd %zmm27, %zmm26 #72.42
|
||||||
|
331 | | | | | | 1.00 | | || | | vcmppd $1, %zmm12, %zmm27, %k6{%k5} #71.26
|
||||||
|
332 | | | | | | 1.00 | | || | | vfpclasspd $30, %zmm26, %k0 #72.42
|
||||||
|
333 | | | | | | | | || | | * vmovaps %zmm27, %zmm19 #72.42
|
||||||
|
334 | 0.50 | | 0.50 0.50 | 0.50 0.50 | | 0.50 | | || 4.0 | | vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm26, %zmm19 #72.42
|
||||||
|
335 | 1.00 | | | | | | | || | | knotw %k0, %k4 #72.42
|
||||||
|
336 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm19, %zmm19, %zmm20 #72.42
|
||||||
|
337 | 0.50 | | | | | 0.50 | | || | | vfmadd213pd %zmm26, %zmm19, %zmm26{%k4} #72.42
|
||||||
|
338 | 0.50 | | | | | 0.50 | | || 4.0 | | vfmadd213pd %zmm26, %zmm20, %zmm26{%k4} #72.42
|
||||||
|
339 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm11, %zmm26, %zmm21 #73.42
|
||||||
|
340 | 0.50 | | | | | 0.50 | | || | | vmulpd %zmm9, %zmm26, %zmm23 #74.58
|
||||||
|
341 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm21, %zmm26, %zmm24 #73.48
|
||||||
|
342 | 0.50 | | | | | 0.50 | | || 4.0 | | vmulpd %zmm24, %zmm26, %zmm22 #73.54
|
||||||
|
343 | 0.00 | | | | | 1.00 | | || | | vfmsub213pd %zmm10, %zmm24, %zmm26 #74.58
|
||||||
|
344 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm23, %zmm22, %zmm25 #74.65
|
||||||
|
345 | 0.00 | | | | | 1.00 | | || 4.0 | | vmulpd %zmm26, %zmm25, %zmm30 #74.71
|
||||||
|
346 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21
|
||||||
|
347 | 0.00 | | | | | 1.00 | | || | | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21
|
||||||
|
348 | 0.00 | | | | | 1.00 | | || 4.0 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21
|
||||||
|
|
||||||
|
18.0 3.00 13.0 2.50 13.0 2.50 18.0 3.00 68.0 4
|
||||||
|
|
||||||
|
|
||||||
|
Loop-Carried Dependencies Analysis Report
|
||||||
|
-----------------------------------------
|
||||||
|
313 | 1.0 | movslq %ecx, %rcx #64.13| [313]
|
||||||
|
348 | 4.0 | vfmadd231pd %zmm31, %zmm30, %zmm5{%k6} #77.21| [348]
|
||||||
|
347 | 4.0 | vfmadd231pd %zmm29, %zmm30, %zmm6{%k6} #76.21| [347]
|
||||||
|
346 | 4.0 | vfmadd231pd %zmm28, %zmm30, %zmm7{%k6} #75.21| [346]
|
@ -355,6 +355,7 @@ computeForce:
|
|||||||
ja ..B1.38 # Prob 50% #67.9
|
ja ..B1.38 # Prob 50% #67.9
|
||||||
movl $111,%ebx #IACA/OSACA START MARKER
|
movl $111,%ebx #IACA/OSACA START MARKER
|
||||||
.byte 100,103,144 #IACA/OSACA START MARKER
|
.byte 100,103,144 #IACA/OSACA START MARKER
|
||||||
|
# LLVM-MCA-BEGIN
|
||||||
# LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
|
# LOE rax rdx rcx rbp rsi rdi r8 r9 r10 r11d r14d xmm6 xmm7 xmm12 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
|
||||||
..B1.33: # Preds ..B1.32
|
..B1.33: # Preds ..B1.32
|
||||||
# Execution count [2.50e+01]
|
# Execution count [2.50e+01]
|
||||||
@ -415,6 +416,7 @@ computeForce:
|
|||||||
vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17
|
vfmadd231pd %zmm23, %zmm22, %zmm11{%k3} #80.17
|
||||||
movl $222,%ebx #IACA/OSACA END MARKER
|
movl $222,%ebx #IACA/OSACA END MARKER
|
||||||
.byte 100,103,144 #IACA/OSACA END MARKER
|
.byte 100,103,144 #IACA/OSACA END MARKER
|
||||||
|
# LLVM-MCA-END
|
||||||
# LOE rax rdx rbp rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
|
# LOE rax rdx rbp rsi rdi r8 r9 r10 ymm15 ymm16 zmm5 zmm8 zmm9 zmm10 zmm11 zmm13 zmm14
|
||||||
..B1.38: # Preds ..B1.23 ..B1.37 ..B1.32
|
..B1.38: # Preds ..B1.23 ..B1.37 ..B1.32
|
||||||
# Execution count [4.50e+00]
|
# Execution count [4.50e+00]
|
||||||
|
142
scripts/results_soa_casclakesp2_iln1000.txt
Normal file
142
scripts/results_soa_casclakesp2_iln1000.txt
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
200,4x4x2,8,256,21.5040,6.1440,9.2160,0.7405,69.1424,34.7110,4.9587
|
||||||
|
200,4x4x4,8,512,43.0080,12.2880,18.4320,1.4818,69.1066,34.7290,4.9613
|
||||||
|
200,4x4x8,8,1024,86.0160,24.5760,36.8640,2.9627,69.1261,34.7191,4.9599
|
||||||
|
200,4x4x16,8,2048,172.0320,49.1520,73.7280,5.9244,69.1378,34.7133,4.9590
|
||||||
|
200,4x8x2,8,512,43.0080,12.2880,18.4320,1.4813,69.1298,34.7173,4.9596
|
||||||
|
200,4x8x4,8,1024,86.0160,24.5760,36.8640,2.9633,69.1126,34.7259,4.9608
|
||||||
|
200,4x8x8,8,2048,172.0320,49.1520,73.7280,5.9254,69.1265,34.7189,4.9598
|
||||||
|
200,4x8x16,8,4096,344.0640,98.3040,147.4560,11.8621,69.0600,34.7524,4.9646
|
||||||
|
200,4x16x2,8,1024,86.0160,24.5760,36.8640,2.9705,68.9438,34.8110,4.9730
|
||||||
|
200,4x16x4,8,2048,172.0320,49.1520,73.7280,5.9338,69.0288,34.7681,4.9669
|
||||||
|
200,4x16x8,8,4096,344.0640,98.3040,147.4560,11.8492,69.1354,34.7145,4.9592
|
||||||
|
200,4x16x16,8,8192,688.1280,196.6080,294.9120,23.7239,69.0612,34.7518,4.9645
|
||||||
|
200,8x4x2,8,512,43.0080,12.2880,18.4320,1.4814,69.1239,34.7203,4.9600
|
||||||
|
200,8x4x4,8,1024,86.0160,24.5760,36.8640,2.9622,69.1383,34.7130,4.9590
|
||||||
|
200,8x4x8,8,2048,172.0320,49.1520,73.7280,5.9256,69.1234,34.7205,4.9601
|
||||||
|
200,8x4x16,8,4096,344.0640,98.3040,147.4560,11.8548,69.1026,34.7310,4.9616
|
||||||
|
200,8x8x2,8,1024,86.0160,24.5760,36.8640,2.9625,69.1319,34.7162,4.9595
|
||||||
|
200,8x8x4,8,2048,172.0320,49.1520,73.7280,5.9247,69.1339,34.7152,4.9593
|
||||||
|
200,8x8x8,8,4096,344.0640,98.3040,147.4560,11.8484,69.1400,34.7122,4.9589
|
||||||
|
200,8x8x16,8,8192,688.1280,196.6080,294.9120,23.7201,69.0721,34.7463,4.9638
|
||||||
|
200,8x16x2,8,2048,172.0320,49.1520,73.7280,5.9244,69.1376,34.7134,4.9591
|
||||||
|
200,8x16x4,8,4096,344.0640,98.3040,147.4560,11.8496,69.1331,34.7156,4.9594
|
||||||
|
200,8x16x8,8,8192,688.1280,196.6080,294.9120,23.7200,69.0724,34.7461,4.9637
|
||||||
|
200,8x16x16,8,16384,1376.2560,393.2160,589.8240,47.4404,-21.4620,-111.8254,-15.9751
|
||||||
|
200,16x4x2,8,1024,86.0160,24.5760,36.8640,2.9624,69.1338,34.7153,4.9593
|
||||||
|
200,16x4x4,8,2048,172.0320,49.1520,73.7280,5.9251,69.1300,34.7172,4.9596
|
||||||
|
200,16x4x8,8,4096,344.0640,98.3040,147.4560,11.8495,69.1335,34.7155,4.9594
|
||||||
|
200,16x4x16,8,8192,688.1280,196.6080,294.9120,23.7169,69.0814,34.7416,4.9631
|
||||||
|
200,16x8x2,8,2048,172.0320,49.1520,73.7280,5.9246,69.1360,34.7142,4.9592
|
||||||
|
200,16x8x4,8,4096,344.0640,98.3040,147.4560,11.8498,69.1319,34.7163,4.9595
|
||||||
|
200,16x8x8,8,8192,688.1280,196.6080,294.9120,23.7234,69.0625,34.7511,4.9644
|
||||||
|
200,16x8x16,8,16384,1376.2560,393.2160,589.8240,47.4499,-21.4578,-111.8477,-15.9782
|
||||||
|
200,16x16x2,8,4096,344.0640,98.3040,147.4560,11.8552,69.1005,34.7320,4.9617
|
||||||
|
200,16x16x4,8,8192,688.1280,196.6080,294.9120,23.7215,69.0682,34.7483,4.9640
|
||||||
|
200,4x4x2,8,256,21.5040,6.1440,9.2160,0.7406,69.1332,34.7156,4.9594
|
||||||
|
200,4x4x4,8,512,43.0080,12.2880,18.4320,1.4813,69.1306,34.7169,4.9596
|
||||||
|
200,4x4x8,8,1024,86.0160,24.5760,36.8640,2.9622,69.1383,34.7130,4.9590
|
||||||
|
200,4x4x16,8,2048,172.0320,49.1520,73.7280,5.9253,69.1267,34.7188,4.9598
|
||||||
|
200,4x8x2,8,512,43.0080,12.2880,18.4320,1.4813,69.1298,34.7173,4.9596
|
||||||
|
200,4x8x4,8,1024,86.0160,24.5760,36.8640,2.9625,69.1308,34.7168,4.9595
|
||||||
|
200,4x8x8,8,2048,172.0320,49.1520,73.7280,5.9247,69.1340,34.7152,4.9593
|
||||||
|
200,4x8x16,8,4096,344.0640,98.3040,147.4560,11.8482,69.1412,34.7116,4.9588
|
||||||
|
200,4x16x2,8,1024,86.0160,24.5760,36.8640,2.9625,69.1310,34.7167,4.9595
|
||||||
|
200,4x16x4,8,2048,172.0320,49.1520,73.7280,5.9254,69.1263,34.7191,4.9599
|
||||||
|
200,4x16x8,8,4096,344.0640,98.3040,147.4560,11.8488,69.1375,34.7134,4.9591
|
||||||
|
200,4x16x16,8,8192,688.1280,196.6080,294.9120,23.7265,69.0536,34.7556,4.9651
|
||||||
|
200,8x4x2,8,512,43.0080,12.2880,18.4320,1.4814,69.1244,34.7200,4.9600
|
||||||
|
200,8x4x4,8,1024,86.0160,24.5760,36.8640,2.9622,69.1375,34.7134,4.9591
|
||||||
|
200,8x4x8,8,2048,172.0320,49.1520,73.7280,5.9251,69.1301,34.7172,4.9596
|
||||||
|
200,8x4x16,8,4096,344.0640,98.3040,147.4560,11.8497,69.1326,34.7159,4.9594
|
||||||
|
200,8x8x2,8,1024,86.0160,24.5760,36.8640,2.9623,69.1364,34.7140,4.9591
|
||||||
|
200,8x8x4,8,2048,172.0320,49.1520,73.7280,5.9250,69.1311,34.7166,4.9595
|
||||||
|
200,8x8x8,8,4096,344.0640,98.3040,147.4560,11.8488,69.1378,34.7133,4.9590
|
||||||
|
200,8x8x16,8,8192,688.1280,196.6080,294.9120,23.7217,69.0677,34.7485,4.9641
|
||||||
|
200,8x16x2,8,2048,172.0320,49.1520,73.7280,5.9246,69.1355,34.7145,4.9592
|
||||||
|
200,8x16x4,8,4096,344.0640,98.3040,147.4560,11.8491,69.1358,34.7143,4.9592
|
||||||
|
200,8x16x8,8,8192,688.1280,196.6080,294.9120,23.7200,69.0725,34.7461,4.9637
|
||||||
|
200,8x16x16,8,16384,1376.2560,393.2160,589.8240,47.4484,-21.4584,-111.8442,-15.9777
|
||||||
|
200,16x4x2,8,1024,86.0160,24.5760,36.8640,2.9624,69.1343,34.7151,4.9593
|
||||||
|
200,16x4x4,8,2048,172.0320,49.1520,73.7280,5.9239,69.1432,34.7106,4.9587
|
||||||
|
200,16x4x8,8,4096,344.0640,98.3040,147.4560,11.8490,69.1367,34.7138,4.9591
|
||||||
|
200,16x4x16,8,8192,688.1280,196.6080,294.9120,23.7192,69.0749,34.7449,4.9636
|
||||||
|
200,16x8x2,8,2048,172.0320,49.1520,73.7280,5.9251,69.1301,34.7171,4.9596
|
||||||
|
200,16x8x4,8,4096,344.0640,98.3040,147.4560,11.8496,69.1329,34.7157,4.9594
|
||||||
|
200,16x8x8,8,8192,688.1280,196.6080,294.9120,23.7224,69.0655,34.7496,4.9642
|
||||||
|
200,16x8x16,8,16384,1376.2560,393.2160,589.8240,47.4415,-21.4615,-111.8280,-15.9754
|
||||||
|
200,16x16x2,8,4096,344.0640,98.3040,147.4560,11.8502,69.1297,34.7173,4.9596
|
||||||
|
200,16x16x4,8,8192,688.1280,196.6080,294.9120,23.7177,69.0792,34.7428,4.9633
|
||||||
|
200,16x16x8,8,16384,1376.2560,393.2160,589.8240,47.4485,-21.4584,-111.8445,-15.9778
|
||||||
|
200,16x16x16,8,32768,2752.5120,786.4320,1179.6480,94.8632,-21.4660,-111.8047,-15.9721
|
||||||
|
200,4x4x2,16,512,59.3920,12.2880,34.8160,2.8749,35.6181,67.3814,4.4921
|
||||||
|
200,4x4x4,16,1024,118.7840,24.5760,69.6320,5.7553,35.5849,67.4444,4.4963
|
||||||
|
200,4x4x8,16,2048,237.5680,49.1520,139.2640,11.5277,35.5317,67.5452,4.5030
|
||||||
|
200,4x4x16,16,4096,475.1360,98.3040,278.5280,23.0679,35.5125,67.5818,4.5055
|
||||||
|
200,4x8x2,16,1024,118.7840,24.5760,69.6320,5.7516,35.6076,67.4014,4.4934
|
||||||
|
200,4x8x4,16,2048,237.5680,49.1520,139.2640,11.5048,35.6026,67.4108,4.4941
|
||||||
|
200,4x8x8,16,4096,475.1360,98.3040,278.5280,23.0343,35.5643,67.4834,4.4989
|
||||||
|
200,4x8x16,16,8192,950.2720,196.6080,557.0560,46.1992,35.4638,67.6746,4.5116
|
||||||
|
200,4x16x2,16,2048,237.5680,49.1520,139.2640,11.5339,35.5126,67.5817,4.5054
|
||||||
|
200,4x16x4,16,4096,475.1360,98.3040,278.5280,23.0366,35.5607,67.4902,4.4993
|
||||||
|
200,4x16x8,16,8192,950.2720,196.6080,557.0560,46.1666,35.4888,67.6269,4.5085
|
||||||
|
200,4x16x16,16,16384,1900.5440,393.2160,1114.1120,92.3461,-11.0256,-217.6759,-14.5117
|
||||||
|
200,8x4x2,16,1024,118.7840,24.5760,69.6320,5.7504,35.6148,67.3877,4.4925
|
||||||
|
200,8x4x4,16,2048,237.5680,49.1520,139.2640,11.5363,35.5054,67.5954,4.5064
|
||||||
|
200,8x4x8,16,4096,475.1360,98.3040,278.5280,23.0449,35.5480,67.5143,4.5010
|
||||||
|
200,8x4x16,16,8192,950.2720,196.6080,557.0560,46.2153,35.4515,67.6981,4.5132
|
||||||
|
200,8x8x2,16,2048,237.5680,49.1520,139.2640,11.5019,35.6114,67.3942,4.4929
|
||||||
|
200,8x8x4,16,4096,475.1360,98.3040,278.5280,23.0126,35.5980,67.4196,4.4946
|
||||||
|
200,8x8x8,16,8192,950.2720,196.6080,557.0560,46.1823,35.4768,67.6499,4.5100
|
||||||
|
200,8x8x16,16,16384,1900.5440,393.2160,1114.1120,92.3762,-11.0220,-217.7469,-14.5165
|
||||||
|
200,8x16x2,16,4096,475.1360,98.3040,278.5280,23.0367,35.5606,67.4904,4.4994
|
||||||
|
200,8x16x4,16,8192,950.2720,196.6080,557.0560,46.1673,35.4883,67.6279,4.5085
|
||||||
|
200,8x16x8,16,16384,1900.5440,393.2160,1114.1120,92.4053,-11.0185,-217.8156,-14.5210
|
||||||
|
200,8x16x16,16,32768,3801.0880,786.4320,2228.2240,184.6536,-11.0279,-217.6306,-14.5087
|
||||||
|
200,16x4x2,16,2048,237.5680,49.1520,139.2640,11.4984,35.6223,67.3736,4.4916
|
||||||
|
200,16x4x4,16,4096,475.1360,98.3040,278.5280,23.0553,35.5319,67.5449,4.5030
|
||||||
|
200,16x4x8,16,8192,950.2720,196.6080,557.0560,46.1639,35.4909,67.6229,4.5082
|
||||||
|
200,16x4x16,16,16384,1900.5440,393.2160,1114.1120,92.3485,-11.0253,-217.6818,-14.5121
|
||||||
|
200,16x8x2,16,4096,475.1360,98.3040,278.5280,23.0221,35.5832,67.4475,4.4965
|
||||||
|
200,16x8x4,16,8192,950.2720,196.6080,557.0560,46.1950,35.4671,67.6684,4.5112
|
||||||
|
200,16x8x8,16,16384,1900.5440,393.2160,1114.1120,92.3825,-11.0212,-217.7618,-14.5175
|
||||||
|
200,16x8x16,16,32768,3801.0880,786.4320,2228.2240,184.4897,-11.0377,-217.4373,-14.4958
|
||||||
|
200,16x16x2,16,8192,950.2720,196.6080,557.0560,46.1887,35.4719,67.6592,4.5106
|
||||||
|
200,16x16x4,16,16384,1900.5440,393.2160,1114.1120,92.4000,-11.0191,-217.8031,-14.5202
|
||||||
|
200,16x16x8,16,32768,3801.0880,786.4320,2228.2240,184.6561,-11.0277,-217.6335,-14.5089
|
||||||
|
200,16x16x16,16,65536,7602.1760,1572.8640,4456.4480,369.2723,0.6020,3986.7795,265.7853
|
||||||
|
200,4x4x2,32,1024,184.3200,24.5760,135.1680,11.4223,17.9299,133.8547,4.3179
|
||||||
|
200,4x4x4,32,2048,368.6400,49.1520,270.3360,22.8232,17.9466,133.7300,4.3139
|
||||||
|
200,4x4x8,32,4096,737.2800,98.3040,540.6720,45.6452,17.9471,133.7263,4.3138
|
||||||
|
200,4x4x16,32,8192,1474.5600,196.6080,1081.3440,91.3233,17.9407,133.7744,4.3153
|
||||||
|
200,4x8x2,32,2048,368.6400,49.1520,270.3360,22.8306,17.9408,133.7732,4.3153
|
||||||
|
200,4x8x4,32,4096,737.2800,98.3040,540.6720,45.6485,17.9458,133.7357,4.3141
|
||||||
|
200,4x8x8,32,8192,1474.5600,196.6080,1081.3440,91.3158,17.9421,133.7634,4.3149
|
||||||
|
200,4x8x16,32,16384,2949.1200,393.2160,2162.6880,182.6220,-5.5753,-430.4722,-13.8862
|
||||||
|
200,4x16x2,32,4096,737.2800,98.3040,540.6720,45.6489,17.9457,133.7371,4.3141
|
||||||
|
200,4x16x4,32,8192,1474.5600,196.6080,1081.3440,91.2890,17.9474,133.7241,4.3137
|
||||||
|
200,4x16x8,32,16384,2949.1200,393.2160,2162.6880,182.6145,-5.5755,-430.4547,-13.8856
|
||||||
|
200,4x16x16,32,32768,5898.2400,786.4320,4325.3760,365.2677,-5.5749,-430.5002,-13.8871
|
||||||
|
200,8x4x2,32,2048,368.6400,49.1520,270.3360,22.8301,17.9412,133.7701,4.3152
|
||||||
|
200,8x4x4,32,4096,737.2800,98.3040,540.6720,45.6689,17.9378,133.7956,4.3160
|
||||||
|
200,8x4x8,32,8192,1474.5600,196.6080,1081.3440,91.3143,17.9424,133.7611,4.3149
|
||||||
|
200,8x4x16,32,16384,2949.1200,393.2160,2162.6880,182.6332,-5.5749,-430.4987,-13.8871
|
||||||
|
200,8x8x2,32,4096,737.2800,98.3040,540.6720,45.6519,17.9445,133.7457,4.3144
|
||||||
|
200,8x8x4,32,8192,1474.5600,196.6080,1081.3440,91.3215,17.9410,133.7717,4.3152
|
||||||
|
200,8x8x8,32,16384,2949.1200,393.2160,2162.6880,182.6451,-5.5746,-430.5267,-13.8880
|
||||||
|
200,8x8x16,32,32768,5898.2400,786.4320,4325.3760,365.1989,-5.5760,-430.4192,-13.8845
|
||||||
|
200,8x16x2,32,8192,1474.5600,196.6080,1081.3440,91.3015,17.9449,133.7425,4.3143
|
||||||
|
200,8x16x4,32,16384,2949.1200,393.2160,2162.6880,182.6251,-5.5752,-430.4795,-13.8864
|
||||||
|
200,8x16x8,32,32768,5898.2400,786.4320,4325.3760,365.2087,-5.5758,-430.4307,-13.8849
|
||||||
|
200,8x16x16,32,65536,11796.4800,1572.8640,8650.7520,730.5781,0.3043,7887.5501,254.4371
|
||||||
|
200,16x4x2,32,4096,737.2800,98.3040,540.6720,45.6592,17.9416,133.7671,4.3151
|
||||||
|
200,16x4x4,32,8192,1474.5600,196.6080,1081.3440,91.3092,17.9434,133.7537,4.3146
|
||||||
|
200,16x4x8,32,16384,2949.1200,393.2160,2162.6880,182.6471,-5.5745,-430.5315,-13.8881
|
||||||
|
200,16x4x16,32,32768,5898.2400,786.4320,4325.3760,365.2706,-5.5749,-430.5036,-13.8872
|
||||||
|
200,16x8x2,32,8192,1474.5600,196.6080,1081.3440,91.3464,17.9361,133.8082,4.3164
|
||||||
|
200,16x8x4,32,16384,2949.1200,393.2160,2162.6880,182.6559,-5.5742,-430.5521,-13.8888
|
||||||
|
200,16x8x8,32,32768,5898.2400,786.4320,4325.3760,365.2269,-5.5755,-430.4521,-13.8856
|
||||||
|
200,16x8x16,32,65536,11796.4800,1572.8640,8650.7520,730.5707,0.3043,7887.4697,254.4345
|
||||||
|
200,16x16x2,32,16384,2949.1200,393.2160,2162.6880,182.6406,-5.5747,-430.5160,-13.8876
|
||||||
|
200,16x16x4,32,32768,5898.2400,786.4320,4325.3760,365.2364,-5.5754,-430.4633,-13.8859
|
||||||
|
200,16x16x8,32,65536,11796.4800,1572.8640,8650.7520,730.5352,0.3043,7887.0871,254.4222
|
||||||
|
200,16x16x16,32,131072,23592.9600,3145.7280,17301.5040,1461.1927,0.3043,7887.7469,254.4434
|
22
src/force.c
22
src/force.c
@ -27,12 +27,12 @@
|
|||||||
#include <parameter.h>
|
#include <parameter.h>
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
|
|
||||||
double computeForce(
|
// Number of times to compute the most internal loop
|
||||||
Parameter *param,
|
#ifndef INTERNAL_LOOP_NTIMES
|
||||||
Atom *atom,
|
#define INTERNAL_LOOP_NTIMES 1
|
||||||
Neighbor *neighbor,
|
#endif
|
||||||
int profile)
|
|
||||||
{
|
double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
int* neighs;
|
||||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
@ -47,10 +47,6 @@ double computeForce(
|
|||||||
fz[i] = 0.0;
|
fz[i] = 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(profile) {
|
|
||||||
// LIKWID_MARKER_START("force");
|
|
||||||
}
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for(int i = 0; i < Nlocal; i++) {
|
for(int i = 0; i < Nlocal; i++) {
|
||||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||||
@ -64,6 +60,7 @@ double computeForce(
|
|||||||
|
|
||||||
// printf("%d: %d\n", i, numneighs);
|
// printf("%d: %d\n", i, numneighs);
|
||||||
|
|
||||||
|
for(int n = 0; n < INTERNAL_LOOP_NTIMES; n++) {
|
||||||
for(int k = 0; k < numneighs; k++) {
|
for(int k = 0; k < numneighs; k++) {
|
||||||
int j = neighs[k];
|
int j = neighs[k];
|
||||||
MD_FLOAT delx = xtmp - atom_x(j);
|
MD_FLOAT delx = xtmp - atom_x(j);
|
||||||
@ -80,15 +77,12 @@ double computeForce(
|
|||||||
fiz += delz * force;
|
fiz += delz * force;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fx[i] += fix;
|
fx[i] += fix;
|
||||||
fy[i] += fiy;
|
fy[i] += fiy;
|
||||||
fz[i] += fiz;
|
fz[i] += fiz;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(profile) {
|
|
||||||
// LIKWID_MARKER_STOP("force");
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,7 @@
|
|||||||
#define LATTICE_DISTANCE 10.0
|
#define LATTICE_DISTANCE 10.0
|
||||||
#define NEIGH_DISTANCE 1.0
|
#define NEIGH_DISTANCE 1.0
|
||||||
|
|
||||||
extern double computeForce( Parameter*, Atom*, Neighbor*, int);
|
extern double computeForce( Parameter*, Atom*, Neighbor*);
|
||||||
|
|
||||||
void init(Parameter *param) {
|
void init(Parameter *param) {
|
||||||
param->epsilon = 1.0;
|
param->epsilon = 1.0;
|
||||||
@ -188,19 +188,19 @@ int main(int argc, const char *argv[]) {
|
|||||||
DEBUG("Building neighbor lists...\n");
|
DEBUG("Building neighbor lists...\n");
|
||||||
buildNeighbor(atom, &neighbor);
|
buildNeighbor(atom, &neighbor);
|
||||||
DEBUG("Computing forces...\n");
|
DEBUG("Computing forces...\n");
|
||||||
computeForce(&param, atom, &neighbor, 0);
|
computeForce(&param, atom, &neighbor);
|
||||||
|
|
||||||
double S, E;
|
double S, E;
|
||||||
S = getTimeStamp();
|
S = getTimeStamp();
|
||||||
LIKWID_MARKER_START("force");
|
LIKWID_MARKER_START("force");
|
||||||
for(int i = 0; i < param.ntimes; i++) {
|
for(int i = 0; i < param.ntimes; i++) {
|
||||||
computeForce(&param, atom, &neighbor, 1);
|
computeForce(&param, atom, &neighbor);
|
||||||
}
|
}
|
||||||
LIKWID_MARKER_STOP("force");
|
LIKWID_MARKER_STOP("force");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
double T_accum = E-S;
|
double T_accum = E-S;
|
||||||
const double atoms_updates_per_sec = atom->Nlocal * param.ntimes / T_accum;
|
// Promote each factor to double BEFORE multiplying: the int product Nlocal * INTERNAL_LOOP_NTIMES * ntimes
// overflows for large runs (e.g. 16384 * 1000 * 200 > INT_MAX) and yields negative/garbage rates.
const double atoms_updates_per_sec = (double)atom->Nlocal * (double)(INTERNAL_LOOP_NTIMES) * (double)param.ntimes / T_accum;
|
||||||
const double cycles_per_atom = T_accum * freq / (atom->Nlocal * param.ntimes);
|
// The divisor is computed in double so that Nlocal * ntimes * INTERNAL_LOOP_NTIMES cannot overflow int
// (e.g. 16384 * 200 * 1000 > INT_MAX), which would otherwise produce negative cycle counts.
const double cycles_per_atom = T_accum * freq / ((double)atom->Nlocal * (double)param.ntimes * (double)(INTERNAL_LOOP_NTIMES));
|
||||||
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
|
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
|
||||||
|
|
||||||
if(!csv) {
|
if(!csv) {
|
||||||
|
@ -47,7 +47,7 @@ typedef enum {
|
|||||||
NUMTIMER
|
NUMTIMER
|
||||||
} timertype;
|
} timertype;
|
||||||
|
|
||||||
extern double computeForce( Parameter*, Atom*, Neighbor*, int);
|
extern double computeForce( Parameter*, Atom*, Neighbor*);
|
||||||
|
|
||||||
void init(Parameter *param)
|
void init(Parameter *param)
|
||||||
{
|
{
|
||||||
@ -205,7 +205,7 @@ int main (int argc, char** argv)
|
|||||||
|
|
||||||
setup(&param, &atom, &neighbor);
|
setup(&param, &atom, &neighbor);
|
||||||
computeThermo(0, &param, &atom);
|
computeThermo(0, &param, &atom);
|
||||||
computeForce(&param, &atom, &neighbor, 1);
|
computeForce(&param, &atom, &neighbor);
|
||||||
|
|
||||||
timer[FORCE] = 0.0;
|
timer[FORCE] = 0.0;
|
||||||
timer[NEIGH] = 0.0;
|
timer[NEIGH] = 0.0;
|
||||||
@ -221,7 +221,7 @@ int main (int argc, char** argv)
|
|||||||
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
|
timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
|
||||||
}
|
}
|
||||||
|
|
||||||
timer[FORCE] += computeForce(&param, &atom, &neighbor, 1);
|
timer[FORCE] += computeForce(&param, &atom, &neighbor);
|
||||||
finalIntegrate(&param, &atom);
|
finalIntegrate(&param, &atom);
|
||||||
|
|
||||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||||
|
Loading…
Reference in New Issue
Block a user