| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3-funroll | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 7.90% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-acfl-O3-funroll | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 7.90% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x17cc STP X29, X30, [SP, #928]! |
0x17d0 STP X28, X27, [SP, #16] |
0x17d4 STP X26, X25, [SP, #32] |
0x17d8 STP X24, X23, [SP, #48] |
0x17dc STP X22, X21, [SP, #64] |
0x17e0 STP X20, X19, [SP, #80] |
0x17e4 ADD X29, SP, #0 |
0x17e8 SUB SP, SP, #32 |
0x17ec CMP W0, #1 |
0x17f0 STP X2, X1, [X29, #1008] |
0x17f4 STUR X3, [X29, #488] |
0x17f8 STP W6, W5, [X29, #480] |
0x17fc B.LT 1978 |
0x1800 ORR X19, XZR, X4 |
0x1804 ORR W20, WZR, W0 |
0x1808 ORR W27, WZR, WZR |
0x180c ADRP X22, 180c |
0x1810 ADD X22, X22, #2456 |
0x1814 B 1824 |
(4) 0x1818 ADD SP, X28, #0 |
(4) 0x181c CMP W27, W20 |
(4) 0x1820 B.EQ 1978 |
(4) 0x1824 SUB X3, X29, #28 |
(4) 0x1828 SUB X4, X29, #32 |
(4) 0x182c SUB X5, X29, #8 |
(4) 0x1830 SUB X6, X29, #16 |
(4) 0x1834 SUB X7, X29, #24 |
(4) 0x1838 ADRP X0, |
(4) 0x183c ADD X0, X0, #3344 |
(4) 0x1840 MOVZ W1, #5 |
(4) 0x1844 ORR X2, XZR, X22 |
(4) 0x1848 BL 12f0 |
(4) 0x184c LDUR W8, [X29, #480] |
(4) 0x1850 ADD X9, SP, #0 |
(4) 0x1854 ADD X28, SP, #0 |
(4) 0x1858 UBFM X8, X8, #62, #61 |
(4) 0x185c ADD X8, X8, #15 |
(4) 0x1860 AND X8, X8, #6076 |
(4) 0x1864 SUB X23, X9, X8 |
(4) 0x1868 ADD SP, X23, #0 |
(4) 0x186c LDUR W21, [X29, #480] |
(4) 0x1870 ADD X8, SP, #0 |
(4) 0x1874 UBFM X26, X21, #61, #60 |
(4) 0x1878 ADD X9, X26, #15 |
(4) 0x187c AND X9, X9, #6140 |
(4) 0x1880 SUB X24, X8, X9 |
(4) 0x1884 ADD SP, X24, #0 |
(4) 0x1888 ADD X8, SP, #0 |
(4) 0x188c SUB X25, X8, X9 |
(4) 0x1890 ADD SP, X25, #0 |
(4) 0x1894 CMP W21, #1 |
(4) 0x1898 B.LT 18cc |
(4) 0x189c UBFM X2, X21, #62, #61 |
(4) 0x18a0 ORR X0, XZR, X23 |
(4) 0x18a4 ORR W1, WZR, WZR |
(4) 0x18a8 BL 11f0 |
(4) 0x18ac ORR X0, XZR, X24 |
(4) 0x18b0 ORR W1, WZR, WZR |
(4) 0x18b4 ORR X2, XZR, X26 |
(4) 0x18b8 BL 11f0 |
(4) 0x18bc ORR X0, XZR, X25 |
(4) 0x18c0 ORR W1, WZR, WZR |
(4) 0x18c4 ORR X2, XZR, X26 |
(4) 0x18c8 BL 11f0 |
(4) 0x18cc LDUR W8, [X29, #484] |
(4) 0x18d0 CMP W8, #1 |
(4) 0x18d4 B.LT 191c |
(4) 0x18d8 LDUR X10, [X29, #504] |
(4) 0x18dc LDUR X9, [X29, #488] |
(4) 0x18e0 ADD X10, X10, #8 |
(6) 0x18e4 LDRSW X11, [X9], #4 |
(6) 0x18e8 SUBS X8, X8, #1 |
(6) 0x18ec LDP D1, D2, [X10, #1016] |
(6) 0x18f0 ADD X10, X10, #16 |
(6) 0x18f4 LDR D0, [X24, X11,LSL #3] |
(6) 0x18f8 LDR D3, [X25, X11,LSL #3] |
(6) 0x18fc LDR W12, [X23, X11,LSL #2] |
(6) 0x1900 FADD D0, D1, D0 |
(6) 0x1904 FADD D1, D2, D3 |
(6) 0x1908 ADD W12, W12, #1 |
(6) 0x190c STR W12, [X23, X11,LSL #2] |
(6) 0x1910 STR D0, [X24, X11,LSL #3] |
(6) 0x1914 STR D1, [X25, X11,LSL #3] |
(6) 0x1918 B.NE 18e4 |
(4) 0x191c ADD W27, W27, #1 |
(4) 0x1920 CMP W21, #0 |
(4) 0x1924 B.LE 1818 |
(4) 0x1928 MADD W9, W21, W27, WZR |
(4) 0x192c LDUR X8, [X29, #496] |
(4) 0x1930 ADD X9, X19, W9,UXTW #4 |
(4) 0x1934 B 1968 |
(5) 0x1938 SCVTF D0, W10 |
(5) 0x193c LDR D1, [X24] |
(5) 0x1940 LD1 {V1.D[1]}, [X25] |
(5) 0x1944 DUP V0.2D, V0.D[0] |
(5) 0x1948 FDIV V0.2D, V1.2D, V0.2D |
(5) 0x194c STR Q0, [X8] |
(5) 0x1950 SUBS X21, X21, #1 |
(5) 0x1954 ADD X8, X8, #16 |
(5) 0x1958 ADD X25, X25, #8 |
(5) 0x195c ADD X24, X24, #8 |
(5) 0x1960 STR Q0, [X9], #16 |
(5) 0x1964 B.EQ 1818 |
(5) 0x1968 LDR W10, [X23], #4 |
(5) 0x196c CBNZ W10, 1938 |
(5) 0x1970 LDR Q0, [X8] |
(5) 0x1974 B 1950 |
0x1978 ADD SP, X29, #0 |
0x197c LDP X20, X19, [SP, #80] |
0x1980 LDP X22, X21, [SP, #64] |
0x1984 LDP X24, X23, [SP, #48] |
0x1988 LDP X26, X25, [SP, #32] |
0x198c LDP X28, X27, [SP, #16] |
0x1990 LDP X29, X30, [SP], #96 |
0x1994 RET |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:20 | kmeans-acfl-O3-funroll |
| ○ | __libc_start_main | | libc-2.31.so |
| ○ | _start | | kmeans-acfl-O3-funroll |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3-funroll |
| nb instructions | 27 |
| nb uops | 27 |
| loop length | 108 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.38 cycles |
| front end | 3.38 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| cycles | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.38 |
| Dispatch | 5.17 |
| Overall L1 | 5.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | 50% |
| store | 44% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 15% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB SP, SP, #32 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| B.LT 1978 <_Z7k_meansiP7point_tS0_PiS0_ii+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X19, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W20, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W27, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADRP X22, 180c <_Z7k_meansiP7point_tS0_PiS0_ii+0x40> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #2456 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| B 1824 <_Z7k_meansiP7point_tS0_PiS0_ii+0x58> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-acfl-O3-funroll |
| nb instructions | 27 |
| nb uops | 27 |
| loop length | 108 |
| used w registers | 6 |
| used x registers | 17 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 12 |
| micro-operation queue | 3.38 cycles |
| front end | 3.38 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| cycles | 1.50 | 1.50 | 2.50 | 2.50 | 2.50 | 2.50 | 0.00 | 0.00 | 0.00 | 0.00 | 5.17 | 4.83 | 5.00 | 4.50 | 4.50 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 3.38 |
| Dispatch | 5.17 |
| Overall L1 | 5.17 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 38% |
| load | 50% |
| store | 44% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 15% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #928]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB SP, SP, #32 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| STP X2, X1, [X29, #1008] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STUR X3, [X29, #488] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STP W6, W5, [X29, #480] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| B.LT 1978 <_Z7k_meansiP7point_tS0_PiS0_ii+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR X19, XZR, X4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W20, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W27, WZR, WZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADRP X22, 180c <_Z7k_meansiP7point_tS0_PiS0_ii+0x40> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #2456 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| B 1824 <_Z7k_meansiP7point_tS0_PiS0_ii+0x58> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD SP, X29, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDP X20, X19, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X22, X21, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X24, X23, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X26, X25, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X28, X27, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| LDP X29, X30, [SP], #96 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Run run_1_thread | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 64 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.96 | 0.32 | 0.89 | 0.81 | 0.79 | 1.33 | 0.68 | 1.65 | 0.51 | 1.98 | 0.55 | 1.33 | 0.53 | 1.13 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 9.4129981994629 | 7.9005899429321 |
| run_2_threads | 1 | 0.96 | 1.91 | 2 | 9.4069957733154 | 7.611394405365 |
| run_4_threads | 1 | 0.89 | 3.55 | 4 | 9.414999961853 | 7.1500720977783 |
| run_8_threads | 1 | 0.79 | 6.33 | 8 | 9.4119968414307 | 6.3664712905884 |
| run_16_threads | 1 | 0.68 | 10.93 | 16 | 9.4179983139038 | 5.2081198692322 |
| run_32_threads | 1 | 0.51 | 16.47 | 32 | 10.214997291565 | 4.0857548713684 |
| run_48_threads | 1 | 0.55 | 26.32 | 48 | 9.4399967193604 | 2.9423599243164 |
| run_64_threads | 1 | 0.53 | 33.62 | 64 | 9.4569997787476 | 2.3862574100494 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 7.90 | 9.41 |
| ▼Loop 4 - main.cpp:56-92 - kmeans-acfl-O3-funroll– | 0.00 | 0.00 |
| ○Loop 6 - main.cpp:81-84 - kmeans-acfl-O3-funroll | 7.90 | 9.41 |
| ○Loop 5 - main.cpp:86-92 - kmeans-acfl-O3-funroll | 0.00 | 0.00 |
