| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3-all | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 9.59% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3-all | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 9.59% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x1800 STP X29, X30, [SP, #928]! |
0x1804 STP X28, X27, [SP, #16] |
0x1808 STP X26, X25, [SP, #32] |
0x180c STP X24, X23, [SP, #48] |
0x1810 STP X22, X21, [SP, #64] |
0x1814 STP X20, X19, [SP, #80] |
0x1818 ADD X29, SP, #0 |
0x181c SUB SP, SP, #32 |
0x1820 CMP W0, #1 |
0x1824 STP X2, X1, [X29, #1008] |
0x1828 STUR X3, [X29, #488] |
0x182c STP W6, W5, [X29, #480] |
0x1830 B.LT 19ac |
0x1834 ORR X19, XZR, X4 |
0x1838 ORR W20, WZR, W0 |
0x183c ORR W27, WZR, WZR |
0x1840 ADRP X22, 1840 |
0x1844 ADD X22, X22, #2508 |
0x1848 B 1858 |
(4) 0x184c ADD SP, X28, #0 |
(4) 0x1850 CMP W27, W20 |
(4) 0x1854 B.EQ 19ac |
(4) 0x1858 SUB X3, X29, #28 |
(4) 0x185c SUB X4, X29, #32 |
(4) 0x1860 SUB X5, X29, #8 |
(4) 0x1864 SUB X6, X29, #16 |
(4) 0x1868 SUB X7, X29, #24 |
(4) 0x186c ADRP X0, |
(4) 0x1870 ADD X0, X0, #3392 |
(4) 0x1874 MOVZ W1, #5 |
(4) 0x1878 ORR X2, XZR, X22 |
(4) 0x187c BL 12f0 |
(4) 0x1880 LDUR W8, [X29, #480] |
(4) 0x1884 ADD X9, SP, #0 |
(4) 0x1888 ADD X28, SP, #0 |
(4) 0x188c UBFM X8, X8, #62, #61 |
(4) 0x1890 ADD X8, X8, #15 |
(4) 0x1894 AND X8, X8, #6076 |
(4) 0x1898 SUB X23, X9, X8 |
(4) 0x189c ADD SP, X23, #0 |
(4) 0x18a0 LDUR W21, [X29, #480] |
(4) 0x18a4 ADD X8, SP, #0 |
(4) 0x18a8 UBFM X26, X21, #61, #60 |
(4) 0x18ac ADD X9, X26, #15 |
(4) 0x18b0 AND X9, X9, #6140 |
(4) 0x18b4 SUB X24, X8, X9 |
(4) 0x18b8 ADD SP, X24, #0 |
(4) 0x18bc ADD X8, SP, #0 |
(4) 0x18c0 SUB X25, X8, X9 |
(4) 0x18c4 ADD SP, X25, #0 |
(4) 0x18c8 CMP W21, #1 |
(4) 0x18cc B.LT 1900 |
(4) 0x18d0 UBFM X2, X21, #62, #61 |
(4) 0x18d4 ORR X0, XZR, X23 |
(4) 0x18d8 ORR W1, WZR, WZR |
(4) 0x18dc BL 1200 |
(4) 0x18e0 ORR X0, XZR, X24 |
(4) 0x18e4 ORR W1, WZR, WZR |
(4) 0x18e8 ORR X2, XZR, X26 |
(4) 0x18ec BL 1200 |
(4) 0x18f0 ORR X0, XZR, X25 |
(4) 0x18f4 ORR W1, WZR, WZR |
(4) 0x18f8 ORR X2, XZR, X26 |
(4) 0x18fc BL 1200 |
(4) 0x1900 LDUR W8, [X29, #484] |
(4) 0x1904 CMP W8, #1 |
(4) 0x1908 B.LT 1950 |
(4) 0x190c LDUR X10, [X29, #504] |
(4) 0x1910 LDUR X9, [X29, #488] |
(4) 0x1914 ADD X10, X10, #8 |
(6) 0x1918 LDRSW X11, [X9], #4 |
(6) 0x191c SUBS X8, X8, #1 |
(6) 0x1920 LDP D1, D2, [X10, #1016] |
(6) 0x1924 ADD X10, X10, #16 |
(6) 0x1928 LDR D0, [X24, X11,LSL #3] |
(6) 0x192c LDR D3, [X25, X11,LSL #3] |
(6) 0x1930 LDR W12, [X23, X11,LSL #2] |
(6) 0x1934 FADD D0, D0, D1 |
(6) 0x1938 FADD D1, D3, D2 |
(6) 0x193c ADD W12, W12, #1 |
(6) 0x1940 STR W12, [X23, X11,LSL #2] |
(6) 0x1944 STR D0, [X24, X11,LSL #3] |
(6) 0x1948 STR D1, [X25, X11,LSL #3] |
(6) 0x194c B.NE 1918 |
(4) 0x1950 ADD W27, W27, #1 |
(4) 0x1954 CMP W21, #0 |
(4) 0x1958 B.LE 184c |
(4) 0x195c MADD W9, W21, W27, WZR |
(4) 0x1960 LDUR X8, [X29, #496] |
(4) 0x1964 ADD X9, X19, W9,UXTW #4 |
(4) 0x1968 B 199c |
(5) 0x196c SCVTF D0, W10 |
(5) 0x1970 LDR D1, [X24] |
(5) 0x1974 LD1 {V1.D[1]}, [X25] |
(5) 0x1978 DUP V0.2D, V0.D[0] |
(5) 0x197c FDIV V0.2D, V1.2D, V0.2D |
(5) 0x1980 STR Q0, [X8] |
(5) 0x1984 SUBS X21, X21, #1 |
(5) 0x1988 ADD X8, X8, #16 |
(5) 0x198c ADD X25, X25, #8 |
(5) 0x1990 ADD X24, X24, #8 |
(5) 0x1994 STR Q0, [X9], #16 |
(5) 0x1998 B.EQ 184c |
(5) 0x199c LDR W10, [X23], #4 |
(5) 0x19a0 CBNZ W10, 196c |
(5) 0x19a4 LDR Q0, [X8] |
(5) 0x19a8 B 1984 |
0x19ac ADD SP, X29, #0 |
0x19b0 LDP X20, X19, [SP, #80] |
0x19b4 LDP X22, X21, [SP, #64] |
0x19b8 LDP X24, X23, [SP, #48] |
0x19bc LDP X26, X25, [SP, #32] |
0x19c0 LDP X28, X27, [SP, #16] |
0x19c4 LDP X29, X30, [SP], #96 |
0x19c8 RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-all |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-acfl-O3-all |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3-all |
| nb instructions | 27 |
| nb uops | 27 |
| loop length | 108 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.38 cycles |
| front end | 3.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| cycles | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.38 |
| Dispatch | 5.17 |
| Overall L1 | 5.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | 50% |
| store | 44% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 15% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB SP, SP, #32 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| B.LT 19ac <_Z7k_meansiP7point_tS0_PiS0_ii+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X19, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W20, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W27, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADRP X22, 1840 <_Z7k_meansiP7point_tS0_PiS0_ii+0x40> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #2508 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| B 1858 <_Z7k_meansiP7point_tS0_PiS0_ii+0x58> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3-all |
| nb instructions | 27 |
| nb uops | 27 |
| loop length | 108 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.38 cycles |
| front end | 3.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| cycles | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.38 |
| Dispatch | 5.17 |
| Overall L1 | 5.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | 50% |
| store | 44% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 15% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB SP, SP, #32 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| B.LT 19ac <_Z7k_meansiP7point_tS0_PiS0_ii+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X19, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W20, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W27, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADRP X22, 1840 <_Z7k_meansiP7point_tS0_PiS0_ii+0x40> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #2508 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| B 1858 <_Z7k_meansiP7point_tS0_PiS0_ii+0x58> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Run run_1_thread | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1; Number nodes: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 64 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.95 | 0.46 | 0.87 | 1.09 | 0.78 | 1.63 | 0.66 | 2.07 | 0.57 | 1.8 | 0.53 | 1.54 | 0.5 | 1.32 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 9.201997756958 | 9.5870113372803 |
| run_2_threads | 1 | 0.95 | 1.9 | 2 | 9.1809911727905 | 9.1981067657471 |
| run_4_threads | 1 | 0.87 | 3.49 | 4 | 9.1479940414429 | 8.5378828048706 |
| run_8_threads | 1 | 0.78 | 6.24 | 8 | 9.0729961395264 | 7.403510093689 |
| run_16_threads | 1 | 0.66 | 10.48 | 16 | 9.1709928512573 | 6.0055394172668 |
| run_32_threads | 1 | 0.57 | 18.24 | 32 | 9.1479940414429 | 4.1896615028381 |
| run_48_threads | 1 | 0.53 | 25.37 | 48 | 9.1489992141724 | 3.2612104415894 |
| run_64_threads | 1 | 0.5 | 32.29 | 64 | 9.1609973907471 | 2.6716711521149 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 9.59 | 9.20 |
| ▼Loop 4 - main.cpp:56-92 - kmeans-acfl-O3-all– | 0.00 | 0.00 |
| ○Loop 6 - main.cpp:81-84 - kmeans-acfl-O3-all | 9.59 | 9.20 |
| ○Loop 5 - main.cpp:86-92 - kmeans-acfl-O3-all | 0.00 | 0.00 |
