| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3-all | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 9.04% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3-all | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 9.04% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x1800 STP X29, X30, [SP, #928]! |
0x1804 STP X28, X27, [SP, #16] |
0x1808 STP X26, X25, [SP, #32] |
0x180c STP X24, X23, [SP, #48] |
0x1810 STP X22, X21, [SP, #64] |
0x1814 STP X20, X19, [SP, #80] |
0x1818 ADD X29, SP, #0 |
0x181c SUB SP, SP, #48 |
0x1820 CMP W0, #1 |
0x1824 STP X2, X1, [X29, #1008] |
0x1828 STUR X3, [X29, #488] |
0x182c STP W6, W5, [X29, #480] |
0x1830 B.LT 1af0 |
0x1834 RDVL X8, #1 |
0x1838 ORR W9, WZR, #1887 |
0x183c ORR X20, XZR, X4 |
0x1840 UBFM X8, X8, #4, #63 |
0x1844 ORR W21, WZR, W0 |
0x1848 ORR W28, WZR, WZR |
0x184c MADD X10, X8, X9, XZR |
0x1850 ADD X8, X4, #8 |
0x1854 STP X10, X8, [X29, #976] |
0x1858 B 1868 |
(4) 0x185c ADD SP, X23, #0 |
(4) 0x1860 CMP W28, W21 |
(4) 0x1864 B.EQ 1af0 |
(4) 0x1868 SUB X3, X29, #28 |
(4) 0x186c SUB X4, X29, #32 |
(4) 0x1870 SUB X5, X29, #8 |
(4) 0x1874 SUB X6, X29, #16 |
(4) 0x1878 SUB X7, X29, #24 |
(4) 0x187c ADRP X0, |
(4) 0x1880 ADD X0, X0, #3392 |
(4) 0x1884 MOVZ W1, #5 |
(4) 0x1888 ADRP X2, 1888 |
(4) 0x188c ADD X2, X2, #2832 |
(4) 0x1890 BL 12f0 |
(4) 0x1894 LDUR W8, [X29, #480] |
(4) 0x1898 ADD X9, SP, #0 |
(4) 0x189c ADD X23, SP, #0 |
(4) 0x18a0 UBFM X8, X8, #62, #61 |
(4) 0x18a4 ADD X8, X8, #15 |
(4) 0x18a8 AND X8, X8, #6076 |
(4) 0x18ac SUB X24, X9, X8 |
(4) 0x18b0 ADD SP, X24, #0 |
(4) 0x18b4 LDUR W22, [X29, #480] |
(4) 0x18b8 ADD X8, SP, #0 |
(4) 0x18bc UBFM X27, X22, #61, #60 |
(4) 0x18c0 ADD X9, X27, #15 |
(4) 0x18c4 AND X9, X9, #6140 |
(4) 0x18c8 SUB X25, X8, X9 |
(4) 0x18cc ADD SP, X25, #0 |
(4) 0x18d0 ADD X8, SP, #0 |
(4) 0x18d4 SUB X26, X8, X9 |
(4) 0x18d8 ADD SP, X26, #0 |
(4) 0x18dc CMP W22, #1 |
(4) 0x18e0 B.LT 1914 |
(4) 0x18e4 UBFM X2, X22, #62, #61 |
(4) 0x18e8 ORR X0, XZR, X24 |
(4) 0x18ec ORR W1, WZR, WZR |
(4) 0x18f0 BL 1200 |
(4) 0x18f4 ORR X0, XZR, X25 |
(4) 0x18f8 ORR W1, WZR, WZR |
(4) 0x18fc ORR X2, XZR, X27 |
(4) 0x1900 BL 1200 |
(4) 0x1904 ORR X0, XZR, X26 |
(4) 0x1908 ORR W1, WZR, WZR |
(4) 0x190c ORR X2, XZR, X27 |
(4) 0x1910 BL 1200 |
(4) 0x1914 LDUR W8, [X29, #484] |
(4) 0x1918 CMP W8, #1 |
(4) 0x191c B.LT 1964 |
(4) 0x1920 LDUR X10, [X29, #504] |
(4) 0x1924 LDUR X9, [X29, #488] |
(4) 0x1928 ADD X10, X10, #8 |
(7) 0x192c LDRSW X11, [X9], #4 |
(7) 0x1930 SUBS X8, X8, #1 |
(7) 0x1934 LDP D1, D2, [X10, #1016] |
(7) 0x1938 ADD X10, X10, #16 |
(7) 0x193c LDR D0, [X25, X11,LSL #3] |
(7) 0x1940 LDR D3, [X26, X11,LSL #3] |
(7) 0x1944 LDR W12, [X24, X11,LSL #2] |
(7) 0x1948 FADD D0, D0, D1 |
(7) 0x194c FADD D1, D3, D2 |
(7) 0x1950 ADD W12, W12, #1 |
(7) 0x1954 STR W12, [X24, X11,LSL #2] |
(7) 0x1958 STR D0, [X25, X11,LSL #3] |
(7) 0x195c STR D1, [X26, X11,LSL #3] |
(7) 0x1960 B.NE 192c |
(4) 0x1964 PTRUE P4.D, ALL |
(4) 0x1968 CMP W22, #0 |
(4) 0x196c ADD W28, W28, #1 |
(4) 0x1970 B.LE 185c |
(4) 0x1974 CNTD X8, ALL |
(4) 0x1978 MADD W9, W22, W28, WZR |
(4) 0x197c MOVZ W10, #12 |
(4) 0x1980 CMP X8, #12 |
(4) 0x1984 CSEL X10, X8, X10, #8 |
(4) 0x1988 LDUR X8, [X29, #496] |
(4) 0x198c CMP X10, X22 |
(4) 0x1990 B.LS 1a00 |
(4) 0x1994 ORR X10, XZR, XZR |
(4) 0x1998 UBFM X11, X10, #60, #59 |
(4) 0x199c UBFM X13, X10, #61, #60 |
(4) 0x19a0 ADD X12, X11, X9,LSL #4 |
(4) 0x19a4 ADD X9, X24, X10,LSL #2 |
(4) 0x19a8 ADD X8, X8, X11 |
(4) 0x19ac SUB X10, X22, X10 |
(4) 0x19b0 ADD X11, X20, X12 |
(4) 0x19b4 ADD X12, X26, X13 |
(4) 0x19b8 ADD X13, X25, X13 |
(4) 0x19bc B 19f0 |
(5) 0x19c0 SCVTF D0, W14 |
(5) 0x19c4 LDR D1, [X13] |
(5) 0x19c8 LD1 {V1.D[1]}, [X12] |
(5) 0x19cc DUP V0.2D, V0.D[0] |
(5) 0x19d0 FDIV V0.2D, V1.2D, V0.2D |
(5) 0x19d4 STR Q0, [X8] |
(5) 0x19d8 SUBS X10, X10, #1 |
(5) 0x19dc ADD X8, X8, #16 |
(5) 0x19e0 ADD X12, X12, #8 |
(5) 0x19e4 ADD X13, X13, #8 |
(5) 0x19e8 STR Q0, [X11], #16 |
(5) 0x19ec B.EQ 185c |
(5) 0x19f0 LDR W14, [X9], #4 |
(5) 0x19f4 CBNZ W14, 19c0 |
(5) 0x19f8 LDR Q0, [X8] |
(5) 0x19fc B 19d8 |
(4) 0x1a00 SUB X12, X22, #1 |
(4) 0x1a04 UBFM X13, X9, #60, #59 |
(4) 0x1a08 UBFM X14, X12, #60, #59 |
(4) 0x1a0c ADD X11, X20, X13 |
(4) 0x1a10 ADD X10, X11, X14 |
(4) 0x1a14 CMP X10, X11 |
(4) 0x1a18 ORR X10, XZR, XZR |
(4) 0x1a1c B.CC 1998 |
(4) 0x1a20 LDUR X15, [X29, #472] |
(4) 0x1a24 ADD X13, X15, X13 |
(4) 0x1a28 ADD X14, X13, X14 |
(4) 0x1a2c CMP X14, X13 |
(4) 0x1a30 B.CC 1998 |
(4) 0x1a34 UBFM X12, X12, #60, #63 |
(4) 0x1a38 CBNZ X12, 1998 |
(4) 0x1a3c UBFM X10, X22, #60, #59 |
(4) 0x1a40 ADD X12, X20, X9,LSL #4 |
(4) 0x1a44 ADD X13, X12, X10 |
(4) 0x1a48 CMP X8, X13 |
(4) 0x1a4c B.CS 1a5c |
(4) 0x1a50 ADD X10, X8, X10 |
(4) 0x1a54 CMP X12, X10 |
(4) 0x1a58 B.CC 1994 |
(4) 0x1a5c LDUR X10, [X29, #464] |
(4) 0x1a60 ORR X12, XZR, XZR |
(4) 0x1a64 ORR X13, XZR, XZR |
(4) 0x1a68 AND X10, X10, X22 |
(6) 0x1a6c LD1SW {Z0.D}, P4/Z, [X24, X13,LSL #2] |
(6) 0x1a70 ADD X14, X8, X12,LSL #3 |
(6) 0x1a74 ORR Z1.D, Z0.D, Z0.D |
(6) 0x1a78 SCVTF Z0.D, P4/M, Z0.D |
(6) 0x1a7c AND Z1.D, Z1.D, #4294967295 |
(6) 0x1a80 CMPNE P0.D, P4/Z, Z1.D, #0 |
(6) 0x1a84 CMPEQ P1.D, P4/Z, Z1.D, #0 |
(6) 0x1a88 LD1D {Z2.D}, P0/Z, [X25, X13,LSL #3] |
(6) 0x1a8c LD1D {Z3.D}, P0/Z, [X26, X13,LSL #3] |
(6) 0x1a90 ZIP2 P2.D, P0.D, P0.D |
(6) 0x1a94 ZIP1 P0.D, P0.D, P0.D |
(6) 0x1a98 INCD X13, ALL |
(6) 0x1a9c ZIP2 P3.D, P1.D, P1.D |
(6) 0x1aa0 FDIV Z2.D, P4/M, Z2.D, Z0.D |
(6) 0x1aa4 CMP X10, X13 |
(6) 0x1aa8 FDIVR Z0.D, P4/M, Z0.D, Z3.D |
(6) 0x1aac ZIP2 Z1.D, Z2.D, Z0.D |
(6) 0x1ab0 ZIP1 Z3.D, Z2.D, Z0.D |
(6) 0x1ab4 ST1D {Z1.D}, P0, [X8, X12,LSL #3] |
(6) 0x1ab8 ZIP1 P0.D, P1.D, P1.D |
(6) 0x1abc ST1D {Z3.D}, P2, [X14, #1, MUL VL] |
(6) 0x1ac0 LD1D {Z1.D}, P0/Z, [X8, X12,LSL #3] |
(6) 0x1ac4 LD1D {Z3.D}, P3/Z, [X14, #1, MUL VL] |
(6) 0x1ac8 UZP2 Z4.D, Z1.D, Z3.D |
(6) 0x1acc UZP1 Z1.D, Z1.D, Z3.D |
(6) 0x1ad0 SEL Z4.D, P1, Z4.D, Z0.D |
(6) 0x1ad4 SEL Z3.D, P1, Z1.D, Z2.D |
(6) 0x1ad8 ST2D {Z3.D, Z4.D}, P4, [X11, X12,LSL #3] |
(6) 0x1adc INCW X12, ALL |
(6) 0x1ae0 B.NE 1a6c |
(4) 0x1ae4 CMP X10, X22 |
(4) 0x1ae8 B.EQ 185c |
(4) 0x1aec B 1998 |
0x1af0 ADD SP, X29, #0 |
0x1af4 LDP X20, X19, [SP, #80] |
0x1af8 LDP X22, X21, [SP, #64] |
0x1afc LDP X24, X23, [SP, #48] |
0x1b00 LDP X26, X25, [SP, #32] |
0x1b04 LDP X28, X27, [SP, #16] |
0x1b08 LDP X29, X30, [SP], #96 |
0x1b0c RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3-all |
| nb instructions | 31 |
| nb uops | 31 |
| loop length | 124 |
| used w registers | 7 |
| used x registers | 20 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.88 cycles |
| front end | 3.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| cycles | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.88 |
| Dispatch | 5.33 |
| Overall L1 | 5.33 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 72% |
| load | 100% |
| store | 90% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 50% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 35% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| SUB SP, SP, #48 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| B.LT 1af0 <_Z7k_meansiP7point_tS0_PiS0_ii+0x2f0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| RDVL X8, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (50.0%) |
| ORR W9, WZR, #1887 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR X20, XZR, X4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| UBFM X8, X8, #4, #63 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ORR W21, WZR, W0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR W28, WZR, WZR | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| MADD X10, X8, X9, XZR | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (50.0%) |
| ADD X8, X4, #8 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X10, X8, [X29, #976] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| B 1868 <_Z7k_meansiP7point_tS0_PiS0_ii+0x68> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3-all |
| nb instructions | 31 |
| nb uops | 31 |
| loop length | 124 |
| used w registers | 7 |
| used x registers | 20 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.88 cycles |
| front end | 3.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| cycles | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.88 |
| Dispatch | 5.33 |
| Overall L1 | 5.33 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 72% |
| load | 100% |
| store | 90% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 50% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 35% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| SUB SP, SP, #48 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| B.LT 1af0 <_Z7k_meansiP7point_tS0_PiS0_ii+0x2f0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| RDVL X8, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (50.0%) |
| ORR W9, WZR, #1887 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR X20, XZR, X4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| UBFM X8, X8, #4, #63 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ORR W21, WZR, W0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR W28, WZR, WZR | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| MADD X10, X8, X9, XZR | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (50.0%) |
| ADD X8, X4, #8 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X10, X8, [X29, #976] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| B 1868 <_Z7k_meansiP7point_tS0_PiS0_ii+0x68> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Run run_1_thread | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 64 |
| Run run_80_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 80 |
| Run run_96_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 96 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) | (run_80_threads) Efficiency | (run_80_threads) Potential Speed-Up (%) | (run_96_threads) Efficiency | (run_96_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.95 | 0.43 | 0.86 | 1.04 | 0.78 | 1.4 | 0.71 | 1.48 | 0.66 | 1.17 | 0.64 | 0.91 | 0.62 | 0.79 | 0.62 | 0.66 | 0.59 | 0.61 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 7.7249994277954 | 9.0430212020874 |
| run_2_threads | 1 | 0.95 | 1.9 | 2 | 7.8049993515015 | 8.4173631668091 |
| run_4_threads | 1 | 0.86 | 3.44 | 4 | 7.7799987792969 | 7.4424843788147 |
| run_8_threads | 1 | 0.78 | 6.27 | 8 | 7.8249988555908 | 6.4682784080505 |
| run_16_threads | 1 | 0.71 | 11.31 | 16 | 7.7749996185303 | 5.0344800949097 |
| run_32_threads | 1 | 0.66 | 20.98 | 32 | 7.7849988937378 | 3.4070019721985 |
| run_48_threads | 1 | 0.64 | 30.94 | 48 | 7.7299990653992 | 2.5540208816528 |
| run_64_threads | 1 | 0.62 | 39.63 | 64 | 7.7949986457825 | 2.0658857822418 |
| run_80_threads | 1 | 0.62 | 49.25 | 80 | 7.7649989128113 | 1.7133526802063 |
| run_96_threads | 1 | 0.59 | 56.82 | 96 | 7.8799991607666 | 1.4956392049789 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 9.04 | 7.72 |
| ▼Loop 4 - main.cpp:56-93 - kmeans-acfl-O3-all– | 0.00 | 0.00 |
| ○Loop 7 - main.cpp:81-84 - kmeans-acfl-O3-all | 9.04 | 7.72 |
| ○Loop 5 - main.cpp:86-92 - kmeans-acfl-O3-all | 0.00 | 0.00 |
| ○Loop 6 - main.cpp:86-93 - kmeans-acfl-O3-all | 0.00 | 0.00 |
