| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-Ofast | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 9.12% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-Ofast | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 9.12% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x17e0 STP X29, X30, [SP, #928]! |
0x17e4 STP X28, X27, [SP, #16] |
0x17e8 STP X26, X25, [SP, #32] |
0x17ec STP X24, X23, [SP, #48] |
0x17f0 STP X22, X21, [SP, #64] |
0x17f4 STP X20, X19, [SP, #80] |
0x17f8 ADD X29, SP, #0 |
0x17fc SUB SP, SP, #32 |
0x1800 CMP W0, #1 |
0x1804 STP X2, X1, [X29, #1008] |
0x1808 STUR X3, [X29, #488] |
0x180c STP W6, W5, [X29, #480] |
0x1810 B.LT 198c |
0x1814 ORR X19, XZR, X4 |
0x1818 ORR W20, WZR, W0 |
0x181c ORR W27, WZR, WZR |
0x1820 ADRP X22, 1820 |
0x1824 ADD X22, X22, #2476 |
0x1828 B 1838 |
(4) 0x182c ADD SP, X28, #0 |
(4) 0x1830 CMP W27, W20 |
(4) 0x1834 B.EQ 198c |
(4) 0x1838 SUB X3, X29, #28 |
(4) 0x183c SUB X4, X29, #32 |
(4) 0x1840 SUB X5, X29, #8 |
(4) 0x1844 SUB X6, X29, #16 |
(4) 0x1848 SUB X7, X29, #24 |
(4) 0x184c ADRP X0, |
(4) 0x1850 ADD X0, X0, #3344 |
(4) 0x1854 MOVZ W1, #5 |
(4) 0x1858 ORR X2, XZR, X22 |
(4) 0x185c BL 1310 |
(4) 0x1860 LDUR W8, [X29, #480] |
(4) 0x1864 ADD X9, SP, #0 |
(4) 0x1868 ADD X28, SP, #0 |
(4) 0x186c UBFM X8, X8, #62, #61 |
(4) 0x1870 ADD X8, X8, #15 |
(4) 0x1874 AND X8, X8, #6076 |
(4) 0x1878 SUB X23, X9, X8 |
(4) 0x187c ADD SP, X23, #0 |
(4) 0x1880 LDUR W21, [X29, #480] |
(4) 0x1884 ADD X8, SP, #0 |
(4) 0x1888 UBFM X26, X21, #61, #60 |
(4) 0x188c ADD X9, X26, #15 |
(4) 0x1890 AND X9, X9, #6140 |
(4) 0x1894 SUB X24, X8, X9 |
(4) 0x1898 ADD SP, X24, #0 |
(4) 0x189c ADD X8, SP, #0 |
(4) 0x18a0 SUB X25, X8, X9 |
(4) 0x18a4 ADD SP, X25, #0 |
(4) 0x18a8 CMP W21, #1 |
(4) 0x18ac B.LT 18e0 |
(4) 0x18b0 UBFM X2, X21, #62, #61 |
(4) 0x18b4 ORR X0, XZR, X23 |
(4) 0x18b8 ORR W1, WZR, WZR |
(4) 0x18bc BL 1210 |
(4) 0x18c0 ORR X0, XZR, X24 |
(4) 0x18c4 ORR W1, WZR, WZR |
(4) 0x18c8 ORR X2, XZR, X26 |
(4) 0x18cc BL 1210 |
(4) 0x18d0 ORR X0, XZR, X25 |
(4) 0x18d4 ORR W1, WZR, WZR |
(4) 0x18d8 ORR X2, XZR, X26 |
(4) 0x18dc BL 1210 |
(4) 0x18e0 LDUR W8, [X29, #484] |
(4) 0x18e4 CMP W8, #1 |
(4) 0x18e8 B.LT 1930 |
(4) 0x18ec LDUR X10, [X29, #504] |
(4) 0x18f0 LDUR X9, [X29, #488] |
(4) 0x18f4 ADD X10, X10, #8 |
(6) 0x18f8 LDRSW X11, [X9], #4 |
(6) 0x18fc SUBS X8, X8, #1 |
(6) 0x1900 LDP D1, D2, [X10, #1016] |
(6) 0x1904 ADD X10, X10, #16 |
(6) 0x1908 LDR D0, [X24, X11,LSL #3] |
(6) 0x190c LDR D3, [X25, X11,LSL #3] |
(6) 0x1910 LDR W12, [X23, X11,LSL #2] |
(6) 0x1914 FADD D0, D0, D1 |
(6) 0x1918 FADD D1, D3, D2 |
(6) 0x191c ADD W12, W12, #1 |
(6) 0x1920 STR W12, [X23, X11,LSL #2] |
(6) 0x1924 STR D0, [X24, X11,LSL #3] |
(6) 0x1928 STR D1, [X25, X11,LSL #3] |
(6) 0x192c B.NE 18f8 |
(4) 0x1930 ADD W27, W27, #1 |
(4) 0x1934 CMP W21, #0 |
(4) 0x1938 B.LE 182c |
(4) 0x193c MADD W9, W21, W27, WZR |
(4) 0x1940 LDUR X8, [X29, #496] |
(4) 0x1944 ADD X9, X19, W9,UXTW #4 |
(4) 0x1948 B 197c |
(5) 0x194c SCVTF D0, W10 |
(5) 0x1950 LDR D1, [X24] |
(5) 0x1954 LD1 {V1.D[1]}, [X25] |
(5) 0x1958 DUP V0.2D, V0.D[0] |
(5) 0x195c FDIV V0.2D, V1.2D, V0.2D |
(5) 0x1960 STR Q0, [X8] |
(5) 0x1964 SUBS X21, X21, #1 |
(5) 0x1968 ADD X8, X8, #16 |
(5) 0x196c ADD X25, X25, #8 |
(5) 0x1970 ADD X24, X24, #8 |
(5) 0x1974 STR Q0, [X9], #16 |
(5) 0x1978 B.EQ 182c |
(5) 0x197c LDR W10, [X23], #4 |
(5) 0x1980 CBNZ W10, 194c |
(5) 0x1984 LDR Q0, [X8] |
(5) 0x1988 B 1964 |
0x198c ADD SP, X29, #0 |
0x1990 LDP X20, X19, [SP, #80] |
0x1994 LDP X22, X21, [SP, #64] |
0x1998 LDP X24, X23, [SP, #48] |
0x199c LDP X26, X25, [SP, #32] |
0x19a0 LDP X28, X27, [SP, #16] |
0x19a4 LDP X29, X30, [SP], #96 |
0x19a8 RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | kmeans-acfl-Ofast |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-Ofast |
| nb instructions | 27 |
| nb uops | 27 |
| loop length | 108 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.38 cycles |
| front end | 3.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| cycles | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.38 |
| Dispatch | 5.17 |
| Overall L1 | 5.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | 50% |
| store | 44% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 15% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB SP, SP, #32 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| B.LT 198c <_Z7k_meansiP7point_tS0_PiS0_ii+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X19, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W20, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W27, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADRP X22, 1820 <_Z7k_meansiP7point_tS0_PiS0_ii+0x40> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #2476 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| B 1838 <_Z7k_meansiP7point_tS0_PiS0_ii+0x58> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-Ofast |
| nb instructions | 27 |
| nb uops | 27 |
| loop length | 108 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.38 cycles |
| front end | 3.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| cycles | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.38 |
| Dispatch | 5.17 |
| Overall L1 | 5.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | 50% |
| store | 44% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 15% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB SP, SP, #32 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| B.LT 198c <_Z7k_meansiP7point_tS0_PiS0_ii+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X19, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W20, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W27, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADRP X22, 1820 <_Z7k_meansiP7point_tS0_PiS0_ii+0x40> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #2476 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| B 1838 <_Z7k_meansiP7point_tS0_PiS0_ii+0x58> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Run run_1_thread | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 64 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.95 | 0.43 | 0.87 | 1.03 | 0.77 | 1.63 | 0.66 | 1.93 | 0.58 | 1.69 | 0.54 | 1.4 | 0.52 | 1.17 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 9.4209938049316 | 9.1240148544312 |
| run_2_threads | 1 | 0.95 | 1.9 | 2 | 9.423996925354 | 8.7449541091919 |
| run_4_threads | 1 | 0.87 | 3.5 | 4 | 9.4249925613403 | 8.128643989563 |
| run_8_threads | 1 | 0.77 | 6.18 | 8 | 9.4249954223633 | 7.1372847557068 |
| run_16_threads | 1 | 0.66 | 10.6 | 16 | 9.4299945831299 | 5.7270221710205 |
| run_32_threads | 1 | 0.58 | 18.44 | 32 | 9.4449920654297 | 3.9798758029938 |
| run_48_threads | 1 | 0.54 | 25.89 | 48 | 9.4529972076416 | 3.0417206287384 |
| run_64_threads | 1 | 0.52 | 33.32 | 64 | 9.4649963378906 | 2.4408988952637 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 9.12 | 9.42 |
| ▼Loop 4 - main.cpp:56-92 - kmeans-acfl-Ofast– | 0.00 | 0.00 |
| ○Loop 6 - main.cpp:81-84 - kmeans-acfl-Ofast | 9.12 | 9.42 |
| ○Loop 5 - main.cpp:86-92 - kmeans-acfl-Ofast | 0.00 | 0.00 |
