| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3 | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 7.12% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3 | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 7.12% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x17cc STP X29, X30, [SP, #928]! |
0x17d0 STP X28, X27, [SP, #16] |
0x17d4 STP X26, X25, [SP, #32] |
0x17d8 STP X24, X23, [SP, #48] |
0x17dc STP X22, X21, [SP, #64] |
0x17e0 STP X20, X19, [SP, #80] |
0x17e4 ADD X29, SP, #0 |
0x17e8 SUB SP, SP, #48 |
0x17ec CMP W0, #1 |
0x17f0 STP X2, X1, [X29, #1008] |
0x17f4 STUR X3, [X29, #488] |
0x17f8 STP W6, W5, [X29, #480] |
0x17fc B.LT 1abc |
0x1800 RDVL X8, #1 |
0x1804 ORR W9, WZR, #1887 |
0x1808 ORR X20, XZR, X4 |
0x180c UBFM X8, X8, #4, #63 |
0x1810 ORR W21, WZR, W0 |
0x1814 ORR W28, WZR, WZR |
0x1818 MADD X10, X8, X9, XZR |
0x181c ADD X8, X4, #8 |
0x1820 STP X10, X8, [X29, #976] |
0x1824 B 1834 |
(4) 0x1828 ADD SP, X23, #0 |
(4) 0x182c CMP W28, W21 |
(4) 0x1830 B.EQ 1abc |
(4) 0x1834 SUB X3, X29, #28 |
(4) 0x1838 SUB X4, X29, #32 |
(4) 0x183c SUB X5, X29, #8 |
(4) 0x1840 SUB X6, X29, #16 |
(4) 0x1844 SUB X7, X29, #24 |
(4) 0x1848 ADRP X0, |
(4) 0x184c ADD X0, X0, #3392 |
(4) 0x1850 MOVZ W1, #5 |
(4) 0x1854 ADRP X2, 1854 |
(4) 0x1858 ADD X2, X2, #2780 |
(4) 0x185c BL 12d0 |
(4) 0x1860 LDUR W8, [X29, #480] |
(4) 0x1864 ADD X9, SP, #0 |
(4) 0x1868 ADD X23, SP, #0 |
(4) 0x186c UBFM X8, X8, #62, #61 |
(4) 0x1870 ADD X8, X8, #15 |
(4) 0x1874 AND X8, X8, #6076 |
(4) 0x1878 SUB X24, X9, X8 |
(4) 0x187c ADD SP, X24, #0 |
(4) 0x1880 LDUR W22, [X29, #480] |
(4) 0x1884 ADD X8, SP, #0 |
(4) 0x1888 UBFM X27, X22, #61, #60 |
(4) 0x188c ADD X9, X27, #15 |
(4) 0x1890 AND X9, X9, #6140 |
(4) 0x1894 SUB X25, X8, X9 |
(4) 0x1898 ADD SP, X25, #0 |
(4) 0x189c ADD X8, SP, #0 |
(4) 0x18a0 SUB X26, X8, X9 |
(4) 0x18a4 ADD SP, X26, #0 |
(4) 0x18a8 CMP W22, #1 |
(4) 0x18ac B.LT 18e0 |
(4) 0x18b0 UBFM X2, X22, #62, #61 |
(4) 0x18b4 ORR X0, XZR, X24 |
(4) 0x18b8 ORR W1, WZR, WZR |
(4) 0x18bc BL 11e0 |
(4) 0x18c0 ORR X0, XZR, X25 |
(4) 0x18c4 ORR W1, WZR, WZR |
(4) 0x18c8 ORR X2, XZR, X27 |
(4) 0x18cc BL 11e0 |
(4) 0x18d0 ORR X0, XZR, X26 |
(4) 0x18d4 ORR W1, WZR, WZR |
(4) 0x18d8 ORR X2, XZR, X27 |
(4) 0x18dc BL 11e0 |
(4) 0x18e0 LDUR W8, [X29, #484] |
(4) 0x18e4 CMP W8, #1 |
(4) 0x18e8 B.LT 1930 |
(4) 0x18ec LDUR X10, [X29, #504] |
(4) 0x18f0 LDUR X9, [X29, #488] |
(4) 0x18f4 ADD X10, X10, #8 |
(7) 0x18f8 LDRSW X11, [X9], #4 |
(7) 0x18fc SUBS X8, X8, #1 |
(7) 0x1900 LDP D1, D2, [X10, #1016] |
(7) 0x1904 ADD X10, X10, #16 |
(7) 0x1908 LDR D0, [X25, X11,LSL #3] |
(7) 0x190c LDR D3, [X26, X11,LSL #3] |
(7) 0x1910 LDR W12, [X24, X11,LSL #2] |
(7) 0x1914 FADD D0, D1, D0 |
(7) 0x1918 FADD D1, D2, D3 |
(7) 0x191c ADD W12, W12, #1 |
(7) 0x1920 STR W12, [X24, X11,LSL #2] |
(7) 0x1924 STR D0, [X25, X11,LSL #3] |
(7) 0x1928 STR D1, [X26, X11,LSL #3] |
(7) 0x192c B.NE 18f8 |
(4) 0x1930 PTRUE P4.D, ALL |
(4) 0x1934 CMP W22, #0 |
(4) 0x1938 ADD W28, W28, #1 |
(4) 0x193c B.LE 1828 |
(4) 0x1940 CNTD X8, ALL |
(4) 0x1944 MADD W9, W22, W28, WZR |
(4) 0x1948 MOVZ W10, #12 |
(4) 0x194c CMP X8, #12 |
(4) 0x1950 CSEL X10, X8, X10, #8 |
(4) 0x1954 LDUR X8, [X29, #496] |
(4) 0x1958 CMP X10, X22 |
(4) 0x195c B.LS 19cc |
(4) 0x1960 ORR X10, XZR, XZR |
(4) 0x1964 UBFM X11, X10, #60, #59 |
(4) 0x1968 UBFM X13, X10, #61, #60 |
(4) 0x196c ADD X12, X11, X9,LSL #4 |
(4) 0x1970 ADD X9, X24, X10,LSL #2 |
(4) 0x1974 ADD X8, X8, X11 |
(4) 0x1978 SUB X10, X22, X10 |
(4) 0x197c ADD X11, X20, X12 |
(4) 0x1980 ADD X12, X26, X13 |
(4) 0x1984 ADD X13, X25, X13 |
(4) 0x1988 B 19bc |
(5) 0x198c SCVTF D0, W14 |
(5) 0x1990 LDR D1, [X13] |
(5) 0x1994 LD1 {V1.D[1]}, [X12] |
(5) 0x1998 DUP V0.2D, V0.D[0] |
(5) 0x199c FDIV V0.2D, V1.2D, V0.2D |
(5) 0x19a0 STR Q0, [X8] |
(5) 0x19a4 SUBS X10, X10, #1 |
(5) 0x19a8 ADD X8, X8, #16 |
(5) 0x19ac ADD X12, X12, #8 |
(5) 0x19b0 ADD X13, X13, #8 |
(5) 0x19b4 STR Q0, [X11], #16 |
(5) 0x19b8 B.EQ 1828 |
(5) 0x19bc LDR W14, [X9], #4 |
(5) 0x19c0 CBNZ W14, 198c |
(5) 0x19c4 LDR Q0, [X8] |
(5) 0x19c8 B 19a4 |
(4) 0x19cc SUB X12, X22, #1 |
(4) 0x19d0 UBFM X13, X9, #60, #59 |
(4) 0x19d4 UBFM X14, X12, #60, #59 |
(4) 0x19d8 ADD X11, X20, X13 |
(4) 0x19dc ADD X10, X11, X14 |
(4) 0x19e0 CMP X10, X11 |
(4) 0x19e4 ORR X10, XZR, XZR |
(4) 0x19e8 B.CC 1964 |
(4) 0x19ec LDUR X15, [X29, #472] |
(4) 0x19f0 ADD X13, X15, X13 |
(4) 0x19f4 ADD X14, X13, X14 |
(4) 0x19f8 CMP X14, X13 |
(4) 0x19fc B.CC 1964 |
(4) 0x1a00 UBFM X12, X12, #60, #63 |
(4) 0x1a04 CBNZ X12, 1964 |
(4) 0x1a08 UBFM X10, X22, #60, #59 |
(4) 0x1a0c ADD X12, X20, X9,LSL #4 |
(4) 0x1a10 ADD X13, X12, X10 |
(4) 0x1a14 CMP X8, X13 |
(4) 0x1a18 B.CS 1a28 |
(4) 0x1a1c ADD X10, X8, X10 |
(4) 0x1a20 CMP X12, X10 |
(4) 0x1a24 B.CC 1960 |
(4) 0x1a28 LDUR X10, [X29, #464] |
(4) 0x1a2c ORR X12, XZR, XZR |
(4) 0x1a30 ORR X13, XZR, XZR |
(4) 0x1a34 AND X10, X10, X22 |
(6) 0x1a38 LD1SW {Z0.D}, P4/Z, [X24, X13,LSL #2] |
(6) 0x1a3c ADD X14, X8, X12,LSL #3 |
(6) 0x1a40 ORR Z1.D, Z0.D, Z0.D |
(6) 0x1a44 SCVTF Z0.D, P4/M, Z0.D |
(6) 0x1a48 AND Z1.D, Z1.D, #4294967295 |
(6) 0x1a4c CMPNE P0.D, P4/Z, Z1.D, #0 |
(6) 0x1a50 CMPEQ P1.D, P4/Z, Z1.D, #0 |
(6) 0x1a54 LD1D {Z2.D}, P0/Z, [X25, X13,LSL #3] |
(6) 0x1a58 LD1D {Z3.D}, P0/Z, [X26, X13,LSL #3] |
(6) 0x1a5c ZIP2 P2.D, P0.D, P0.D |
(6) 0x1a60 ZIP1 P0.D, P0.D, P0.D |
(6) 0x1a64 INCD X13, ALL |
(6) 0x1a68 ZIP2 P3.D, P1.D, P1.D |
(6) 0x1a6c FDIV Z2.D, P4/M, Z2.D, Z0.D |
(6) 0x1a70 CMP X10, X13 |
(6) 0x1a74 FDIVR Z0.D, P4/M, Z0.D, Z3.D |
(6) 0x1a78 ZIP2 Z1.D, Z2.D, Z0.D |
(6) 0x1a7c ZIP1 Z3.D, Z2.D, Z0.D |
(6) 0x1a80 ST1D {Z1.D}, P0, [X8, X12,LSL #3] |
(6) 0x1a84 ZIP1 P0.D, P1.D, P1.D |
(6) 0x1a88 ST1D {Z3.D}, P2, [X14, #1, MUL VL] |
(6) 0x1a8c LD1D {Z1.D}, P0/Z, [X8, X12,LSL #3] |
(6) 0x1a90 LD1D {Z3.D}, P3/Z, [X14, #1, MUL VL] |
(6) 0x1a94 UZP2 Z4.D, Z1.D, Z3.D |
(6) 0x1a98 UZP1 Z1.D, Z1.D, Z3.D |
(6) 0x1a9c SEL Z4.D, P1, Z4.D, Z0.D |
(6) 0x1aa0 SEL Z3.D, P1, Z1.D, Z2.D |
(6) 0x1aa4 ST2D {Z3.D, Z4.D}, P4, [X11, X12,LSL #3] |
(6) 0x1aa8 INCW X12, ALL |
(6) 0x1aac B.NE 1a38 |
(4) 0x1ab0 CMP X10, X22 |
(4) 0x1ab4 B.EQ 1828 |
(4) 0x1ab8 B 1964 |
0x1abc ADD SP, X29, #0 |
0x1ac0 LDP X20, X19, [SP, #80] |
0x1ac4 LDP X22, X21, [SP, #64] |
0x1ac8 LDP X24, X23, [SP, #48] |
0x1acc LDP X26, X25, [SP, #32] |
0x1ad0 LDP X28, X27, [SP, #16] |
0x1ad4 LDP X29, X30, [SP], #96 |
0x1ad8 RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3 |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3 |
| nb instructions | 31 |
| nb uops | 31 |
| loop length | 124 |
| used w registers | 7 |
| used x registers | 20 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.88 cycles |
| front end | 3.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| cycles | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.88 |
| Dispatch | 5.33 |
| Overall L1 | 5.33 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 72% |
| load | 100% |
| store | 90% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 50% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 35% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| SUB SP, SP, #48 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| B.LT 1abc <_Z7k_meansiP7point_tS0_PiS0_ii+0x2f0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| RDVL X8, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (50.0%) |
| ORR W9, WZR, #1887 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR X20, XZR, X4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| UBFM X8, X8, #4, #63 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ORR W21, WZR, W0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR W28, WZR, WZR | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| MADD X10, X8, X9, XZR | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (50.0%) |
| ADD X8, X4, #8 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X10, X8, [X29, #976] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| B 1834 <_Z7k_meansiP7point_tS0_PiS0_ii+0x68> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3 |
| nb instructions | 31 |
| nb uops | 31 |
| loop length | 124 |
| used w registers | 7 |
| used x registers | 20 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.88 cycles |
| front end | 3.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| cycles | 1.50 | 1.50 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 2.17 | 0.00 | 0.00 | 0.00 | 0.00 | 5.33 | 5.33 | 5.33 | 5.00 | 5.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.88 |
| Dispatch | 5.33 |
| Overall L1 | 5.33 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 72% |
| load | 100% |
| store | 90% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 50% |
| fma | 50% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 35% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| SUB SP, SP, #48 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| B.LT 1abc <_Z7k_meansiP7point_tS0_PiS0_ii+0x2f0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| RDVL X8, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (50.0%) |
| ORR W9, WZR, #1887 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR X20, XZR, X4 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| UBFM X8, X8, #4, #63 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| ORR W21, WZR, W0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| ORR W28, WZR, WZR | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (25.0%) |
| MADD X10, X8, X9, XZR | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (50.0%) |
| ADD X8, X4, #8 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (50.0%) |
| STP X10, X8, [X29, #976] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (100.0%) |
| B 1834 <_Z7k_meansiP7point_tS0_PiS0_ii+0x68> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0.17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | scal (100.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.50 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Run run_1_thread | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 64 |
| Run run_80_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 80 |
| Run run_96_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 96 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) | (run_80_threads) Efficiency | (run_80_threads) Potential Speed-Up (%) | (run_96_threads) Efficiency | (run_96_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.98 | 0.11 | 0.92 | 0.53 | 0.85 | 0.8 | 0.76 | 1.09 | 0.7 | 0.95 | 0.67 | 0.81 | 0.63 | 0.75 | 0.62 | 0.65 | 0.6 | 0.59 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 7.8799986839294 | 7.1218757629395 |
| run_2_threads | 1 | 0.98 | 1.97 | 2 | 7.7500009536743 | 6.7644248008728 |
| run_4_threads | 1 | 0.92 | 3.67 | 4 | 7.8149981498718 | 6.370491027832 |
| run_8_threads | 1 | 0.85 | 6.83 | 8 | 7.7299995422363 | 5.4857707023621 |
| run_16_threads | 1 | 0.76 | 12.12 | 16 | 7.8399991989136 | 4.4797439575195 |
| run_32_threads | 1 | 0.7 | 22.34 | 32 | 7.729998588562 | 3.1516933441162 |
| run_48_threads | 1 | 0.67 | 32.15 | 48 | 7.7149987220764 | 2.4423055648804 |
| run_64_threads | 1 | 0.63 | 40.32 | 64 | 7.8499989509583 | 2.0220232009888 |
| run_80_threads | 1 | 0.62 | 49.53 | 80 | 7.864999294281 | 1.6978979110718 |
| run_96_threads | 1 | 0.6 | 57.7 | 96 | 7.925000667572 | 1.4691842794418 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 7.12 | 7.88 |
| ▼Loop 4 - main.cpp:56-93 - kmeans-acfl-O3– | 0.00 | 0.00 |
| ○Loop 7 - main.cpp:81-84 - kmeans-acfl-O3 | 7.12 | 7.88 |
| ○Loop 5 - main.cpp:86-92 - kmeans-acfl-O3 | 0.00 | 0.00 |
| ○Loop 6 - main.cpp:86-93 - kmeans-acfl-O3 | 0.00 | 0.00 |
