| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) [clone ._omp_fn.0] | Module: kmeans-gcc-O3-funroll | Source: main.cpp:58-67 | Coverage (incl. loops): 91.46% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) [clone ._omp_fn.0] | Module: kmeans-gcc-O3-funroll | Source: main.cpp:58-67 | Coverage (incl. loops): 91.46% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 58 - 67 |
-------------------------------------------------------------------------------- |
58: #pragma omp parallel for |
59: for (int i = 0; i < n; ++i) { |
60: double optimal_dist = DBL_MAX; |
61: for (int j = 0; j < k; ++j) { |
62: double dist = |
63: (points[i].x - centroids[j].x) * (points[i].x - centroids[j].x) + |
64: (points[i].y - centroids[j].y) * (points[i].y - centroids[j].y); |
65: if (dist < optimal_dist) { |
66: optimal_dist = dist; |
67: assignment[i] = j; |
0x401cc0 STP X29, X30, [SP, #-48]! |
0x401cc4 ADD X29, SP, #0 |
0x401cc8 STP X19, X20, [SP, #16] |
0x401ccc ORR X20, XZR, X0 |
0x401cd0 STR X21, [SP, #32] |
0x401cd4 LDR W21, [X0, #24] |
0x401cd8 BL 401770 |
0x401cdc ORR W19, WZR, W0 |
0x401ce0 BL 4016d0 |
0x401ce4 ORR W4, WZR, W0 |
0x401ce8 SDIV W7, W21, W19 |
0x401cec MSUB W1, W7, W19, W21 |
0x401cf0 CMP W0, W1 |
0x401cf4 B.LT 401ed0 |
0x401cf8 MADD W2, W7, W4, W1 |
0x401cfc ADD W17, W7, W2 |
0x401d00 CMP W2, W17 |
0x401d04 B.GE 401e6c |
0x401d08 LDR W5, [X20, #28] |
0x401d0c LDP X0, X8, [X20] |
0x401d10 LDR X6, [X20, #16] |
0x401d14 CMP W5, #0 |
0x401d18 B.LE 401e6c |
0x401d1c MOVN X9, #32784, LSL #48 |
0x401d20 SBFM X3, X2, #0, #31 |
0x401d24 ADD X18, X0, W2,SXTW #4 |
0x401d28 FMOV D26, X9 |
(5) 0x401d2c ANDS W10, W5, #3 |
(5) 0x401d30 LDP D27, D28, [X18] |
(5) 0x401d34 FMOV D30, D26 |
(5) 0x401d38 ORR X16, XZR, X8 |
(5) 0x401d3c MOVZ W15, #0 |
(5) 0x401d40 B.EQ 401dc8 |
(5) 0x401d44 CMP W10, #1 |
(5) 0x401d48 B.EQ 401d9c |
(5) 0x401d4c CMP W10, #2 |
(5) 0x401d50 B.EQ 401d78 |
(5) 0x401d54 LDP D31, D29, [X8] |
(5) 0x401d58 FSUB D0, D28, S29 |
(5) 0x401d5c FSUB D1, D27, S31 |
(5) 0x401d60 FMUL D2, D0, D0 |
(5) 0x401d64 FMADD D3, D1, D1, D2 |
(5) 0x401d68 FCMPE D26, D3 |
(5) 0x401d6c B.GT 401ec4 |
(5) 0x401d70 MOVZ W15, #1 |
(5) 0x401d74 ADD X16, X8, #16 |
(5) 0x401d78 LDP D4, D5, [X16] |
(5) 0x401d7c FSUB D6, D28, S5 |
(5) 0x401d80 FSUB D7, D27, S4 |
(5) 0x401d84 FMUL D16, D6, D6 |
(5) 0x401d88 FMADD D17, D7, D7, D16 |
(5) 0x401d8c FCMPE D30, D17 |
(5) 0x401d90 B.GT 401eb8 |
(5) 0x401d94 ADD W15, W15, #1 |
(5) 0x401d98 ADD X16, X16, #16 |
(5) 0x401d9c LDP D18, D19, [X16] |
(5) 0x401da0 FSUB D20, D28, S19 |
(5) 0x401da4 FSUB D21, D27, S18 |
(5) 0x401da8 FMUL D22, D20, D20 |
(5) 0x401dac FMADD D23, D21, D21, D22 |
(5) 0x401db0 FCMPE D30, D23 |
(5) 0x401db4 B.GT 401eac |
(5) 0x401db8 ADD W15, W15, #1 |
(5) 0x401dbc ADD X16, X16, #16 |
(5) 0x401dc0 CMP W5, W15 |
(5) 0x401dc4 B.EQ 401e5c |
(4) 0x401dc8 LDP D24, D25, [X16] |
(4) 0x401dcc FSUB D29, D28, S25 |
(4) 0x401dd0 FSUB D31, D27, S24 |
(4) 0x401dd4 FMUL D0, D29, D29 |
(4) 0x401dd8 FMADD D1, D31, D31, D0 |
(4) 0x401ddc FCMPE D30, D1 |
(4) 0x401de0 B.GT 401ea0 |
(4) 0x401de4 LDR D4, [X16, #24] |
(4) 0x401de8 ADD X11, X16, #16 |
(4) 0x401dec ADD W12, W15, #1 |
(4) 0x401df0 LDR D2, [X16, #16] |
(4) 0x401df4 FSUB D5, D28, S4 |
(4) 0x401df8 FSUB D3, D27, S2 |
(4) 0x401dfc FMUL D6, D5, D5 |
(4) 0x401e00 FMADD D7, D3, D3, D6 |
(4) 0x401e04 FCMPE D30, D7 |
(4) 0x401e08 B.GT 401e94 |
(4) 0x401e0c LDP D16, D17, [X11, #16] |
(4) 0x401e10 ADD W13, W12, #1 |
(4) 0x401e14 FSUB D18, D28, S17 |
(4) 0x401e18 FSUB D19, D27, S16 |
(4) 0x401e1c FMUL D20, D18, D18 |
(4) 0x401e20 FMADD D21, D19, D19, D20 |
(4) 0x401e24 FCMPE D30, D21 |
(4) 0x401e28 B.GT 401e88 |
(4) 0x401e2c LDP D22, D23, [X11, #32] |
(4) 0x401e30 ADD W14, W12, #2 |
(4) 0x401e34 FSUB D24, D28, S23 |
(4) 0x401e38 FSUB D25, D27, S22 |
(4) 0x401e3c FMUL D29, D24, D24 |
(4) 0x401e40 FMADD D31, D25, D25, D29 |
(4) 0x401e44 FCMPE D30, D31 |
(4) 0x401e48 B.GT 401e7c |
(4) 0x401e4c ADD W15, W12, #3 |
(4) 0x401e50 ADD X16, X11, #48 |
(4) 0x401e54 CMP W5, W15 |
(4) 0x401e58 B.NE 401dc8 |
(5) 0x401e5c ADD X3, X3, #1 |
(5) 0x401e60 ADD X18, X18, #16 |
(5) 0x401e64 CMP W17, W3 |
(5) 0x401e68 B.GT 401d2c |
(6) 0x401e6c LDR X21, [SP, #32] |
(6) 0x401e70 LDP X19, X20, [SP, #16] |
(6) 0x401e74 LDP X29, X30, [SP], #48 |
(6) 0x401e78 RET |
(3) 0x401e7c FMOV D30, D31 |
(3) 0x401e80 STR W14, [X6, X3,LSL #2] |
(3) 0x401e84 B 401e4c |
(4) 0x401e88 FMOV D30, D21 |
(4) 0x401e8c STR W13, [X6, X3,LSL #2] |
(4) 0x401e90 B 401e2c |
(4) 0x401e94 FMOV D30, D7 |
(4) 0x401e98 STR W12, [X6, X3,LSL #2] |
(4) 0x401e9c B 401e0c |
(4) 0x401ea0 FMOV D30, D1 |
(4) 0x401ea4 STR W15, [X6, X3,LSL #2] |
(4) 0x401ea8 B 401de4 |
(5) 0x401eac FMOV D30, D23 |
(5) 0x401eb0 STR W15, [X6, X3,LSL #2] |
(5) 0x401eb4 B 401db8 |
(5) 0x401eb8 FMOV D30, D17 |
(5) 0x401ebc STR W15, [X6, X3,LSL #2] |
(5) 0x401ec0 B 401d94 |
(5) 0x401ec4 FMOV D30, D3 |
(5) 0x401ec8 STR WZR, [X6, X3,LSL #2] |
(5) 0x401ecc B 401d70 |
0x401ed0 ADD W7, W7, #1 |
0x401ed4 MOVZ W1, #0 |
0x401ed8 B 401cf8 |
0x401edc HINT #0 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►50.11+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►49.89+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►74.99+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►25.01+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►87.48+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►12.52+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►93.71+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►6.29+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►96.85+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►3.15+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►97.89+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►2.11+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►98.41+ | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
| ○ | start_thread | libpthread-2.31.so | |
| ○ | __clone | libc-2.31.so | |
| ►1.59+ | GOMP_parallel | libgomp.h:980 | libgomp.so.1.0.0 |
| ○ | k_means(int, point_t*, point_t[...] | main.cpp:73 | kmeans-gcc-O3-funroll |
| ○ | main | main.cpp:19 | kmeans-gcc-O3-funroll |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-O3-funroll |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:58-67 |
| Module | kmeans-gcc-O3-funroll |
| nb instructions | 31 |
| nb uops | 30 |
| loop length | 124 |
| used w registers | 10 |
| used x registers | 13 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 1 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 3 |
| micro-operation queue | 3.75 cycles |
| front end | 3.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 3.00 | 3.00 | 4.75 | 4.75 | 4.75 | 4.75 | 0.25 | 0.25 | 0.25 | 0.25 | 2.50 | 2.17 | 2.33 | 1.50 | 1.50 |
| cycles | 3.00 | 3.00 | 4.75 | 4.75 | 4.75 | 4.75 | 0.25 | 0.25 | 0.25 | 0.25 | 2.50 | 2.17 | 2.33 | 1.50 | 1.50 |
| Cycles executing div or sqrt instructions | 5.00-12.50 |
| Front-end | 3.75 |
| Dispatch | 4.75 |
| DIV/SQRT | 5.00-12.50 |
| Overall L1 | 5.00-12.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| other | 0% |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 24% |
| load | 16% |
| store | 41% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | 12% |
| other | 25% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| all | 24% |
| load | 16% |
| store | 41% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | 12% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-48]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ORR X20, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STR X21, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| LDR W21, [X0, #24] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| BL 401770 <@plt_start@+0x270> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR W19, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| BL 4016d0 <@plt_start@+0x1d0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR W4, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SDIV W7, W21, W19 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 5-12.50 | N/A |
| MSUB W1, W7, W19, W21 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LT 401ed0 <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x210> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| MADD W2, W7, W4, W1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| ADD W17, W7, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W2, W17 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.GE 401e6c <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR W5, [X20, #28] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| LDP X0, X8, [X20] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X6, [X20, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| CMP W5, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 401e6c <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| MOVN X9, #32784, LSL #48 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| SBFM X3, X2, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| ADD X18, X0, W2,SXTW #4 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| FMOV D26, X9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ADD W7, W7, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ W1, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| B 401cf8 <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x38> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| HINT #0 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:58-67 |
| Module | kmeans-gcc-O3-funroll |
| nb instructions | 31 |
| nb uops | 30 |
| loop length | 124 |
| used w registers | 10 |
| used x registers | 13 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 1 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 3 |
| micro-operation queue | 3.75 cycles |
| front end | 3.75 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 3.00 | 3.00 | 4.75 | 4.75 | 4.75 | 4.75 | 0.25 | 0.25 | 0.25 | 0.25 | 2.50 | 2.17 | 2.33 | 1.50 | 1.50 |
| cycles | 3.00 | 3.00 | 4.75 | 4.75 | 4.75 | 4.75 | 0.25 | 0.25 | 0.25 | 0.25 | 2.50 | 2.17 | 2.33 | 1.50 | 1.50 |
| Cycles executing div or sqrt instructions | 5.00-12.50 |
| Front-end | 3.75 |
| Dispatch | 4.75 |
| DIV/SQRT | 5.00-12.50 |
| Overall L1 | 5.00-12.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| other | 0% |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 24% |
| load | 16% |
| store | 41% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | 12% |
| other | 25% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| all | 24% |
| load | 16% |
| store | 41% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | 12% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-48]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ORR X20, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STR X21, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| LDR W21, [X0, #24] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| BL 401770 <@plt_start@+0x270> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR W19, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| BL 4016d0 <@plt_start@+0x1d0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR W4, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SDIV W7, W21, W19 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 5-12.50 | N/A |
| MSUB W1, W7, W19, W21 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LT 401ed0 <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x210> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| MADD W2, W7, W4, W1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| ADD W17, W7, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W2, W17 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.GE 401e6c <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR W5, [X20, #28] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| LDP X0, X8, [X20] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X6, [X20, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| CMP W5, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 401e6c <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x1ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| MOVN X9, #32784, LSL #48 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| SBFM X3, X2, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| ADD X18, X0, W2,SXTW #4 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| FMOV D26, X9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ADD W7, W7, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ W1, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| B 401cf8 <_Z7k_meansiP7point_tS0_PiS0_ii._omp_fn.0+0x38> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| HINT #0 | N/A |
| Run run_1_thread | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1; Number nodes: 1; Number processes per node: 1; Run Command: <executable> input/100000000.in 1000 100000000 50 25; MPI Command: ; Dataset: ; Run Directory: /home/fmusial/KMEANS_Benchmarks; OMP_PROC_BIND: true; OMP_NUM_THREADS: 64 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.92 | 7.63 | 0.79 | 19.22 | 0.62 | 35 | 0.43 | 52.37 | 0.26 | 67.71 | 0.19 | 75.05 | 0.15 | 79.48 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 99.214912414551 | 91.463470458984 |
| run_2_threads | 2 | 0.92 | 1.83 | 2 | 50.081951141357 | 91.50700378418 |
| run_4_threads | 4 | 0.79 | 3.16 | 4 | 25.046993255615 | 91.527626037598 |
| run_8_threads | 8 | 0.62 | 4.94 | 8 | 12.65799331665 | 91.574333190918 |
| run_16_threads | 16 | 0.43 | 6.86 | 16 | 6.4989972114563 | 91.703178405762 |
| run_32_threads | 32 | 0.26 | 8.47 | 32 | 3.4519987106323 | 92.071434020996 |
| run_48_threads | 48 | 0.19 | 9.08 | 48 | 2.5109984874725 | 92.55517578125 |
| run_64_threads | 64 | 0.15 | 9.37 | 64 | 2.0769982337952 | 93.116752624512 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int) [clone ._omp_fn.0]– | 91.46 | 99.21 |
| ▼Loop 6 - main.cpp:58-67 - kmeans-gcc-O3-funroll– | 0.00 | 0.00 |
| ▼Loop 3 - main.cpp:60-67 - kmeans-gcc-O3-funroll– | 0.00 | 0.00 |
| ▼Loop 4 - main.cpp:60-67 - kmeans-gcc-O3-funroll– | 85.44 | 92.68 |
| ○Loop 5 - main.cpp:60-67 - kmeans-gcc-O3-funroll | 6.03 | 6.54 |
