| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-gcc-Ofast | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 5.76% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-gcc-Ofast | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 5.76% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x401dac STP X29, X30, [SP, #800]! |
0x401db0 ADD X29, SP, #0 |
0x401db4 STP X19, X20, [SP, #16] |
0x401db8 STP X21, X22, [SP, #32] |
0x401dbc STP X23, X24, [SP, #48] |
0x401dc0 STP X25, X26, [SP, #64] |
0x401dc4 STP X27, X28, [SP, #80] |
0x401dc8 STR X4, [X29, #128] |
0x401dcc STR W0, [X29, #156] |
0x401dd0 CMP W0, #0 |
0x401dd4 B.LE 401fcc |
0x401dd8 SBFM X0, X6, #62, #31 |
0x401ddc ORR X28, XZR, X2 |
0x401de0 ORR X23, XZR, X1 |
0x401de4 ADD X2, X0, #15 |
0x401de8 ADRP X1, 401de8 |
0x401dec CNTB X0, ALL, MUL #2 |
0x401df0 AND X2, X2, #7932 |
0x401df4 SUB X0, X0, #16 |
0x401df8 SUB W22, W6, #1 |
0x401dfc ADD X1, X1, #2912 |
0x401e00 ADD X22, X22, #1 |
0x401e04 SBFM X21, X6, #61, #31 |
0x401e08 STR X0, [X29, #104] |
0x401e0c SBFM X0, X6, #0, #31 |
0x401e10 STP X1, X2, [X29, #136] |
0x401e14 UBFM X2, X22, #62, #61 |
0x401e18 ADD X21, X21, #15 |
0x401e1c ORR X27, XZR, X3 |
0x401e20 ORR W25, WZR, W5 |
0x401e24 ORR W24, WZR, W6 |
0x401e28 MOVZ W19, #0 |
0x401e2c AND X21, X21, #7932 |
0x401e30 STR X2, [X29, #120] |
0x401e34 ORR W20, WZR, W6 |
0x401e38 UBFM X2, X22, #61, #60 |
0x401e3c STR X0, [X29, #160] |
0x401e40 ADD X0, X28, #16 |
0x401e44 STR X2, [X29, #176] |
0x401e48 STR X0, [X29, #112] |
(11) 0x401e4c ADD X4, SP, #0 |
(11) 0x401e50 LDR X0, [X29, #136] |
(11) 0x401e54 MOVZ W3, #0 |
(11) 0x401e58 MOVZ W2, #0 |
(11) 0x401e5c ADD X1, X29, #192 |
(11) 0x401e60 STP X23, X28, [X29, #192] |
(11) 0x401e64 ADD W19, W19, #1 |
(11) 0x401e68 STR X4, [X29, #184] |
(11) 0x401e6c STR X27, [X29, #208] |
(11) 0x401e70 STP W25, W24, [X29, #216] |
(11) 0x401e74 BL 4016a0 |
(11) 0x401e78 LDR X0, [X29, #144] |
(11) 0x401e7c SUB SP, SP, X0,UXTX |
(11) 0x401e80 ADD X8, SP, #0 |
(11) 0x401e84 SUB SP, SP, X21,UXTX |
(11) 0x401e88 ADD X22, SP, #0 |
(11) 0x401e8c SUB SP, SP, X21,UXTX |
(11) 0x401e90 ADD X26, SP, #0 |
(11) 0x401e94 CMP W24, #0 |
(11) 0x401e98 B.LE 40207c |
(11) 0x401e9c LDR X2, [X29, #120] |
(11) 0x401ea0 ORR X0, XZR, X8 |
(11) 0x401ea4 MOVZ W1, #0 |
(11) 0x401ea8 STR X8, [X29, #168] |
(11) 0x401eac BL 4015d0 |
(11) 0x401eb0 LDR X2, [X29, #176] |
(11) 0x401eb4 MOVZ W1, #0 |
(11) 0x401eb8 ORR X0, XZR, X22 |
(11) 0x401ebc BL 4015d0 |
(11) 0x401ec0 LDR X2, [X29, #176] |
(11) 0x401ec4 ADD X0, SP, #0 |
(11) 0x401ec8 MOVZ W1, #0 |
(11) 0x401ecc BL 4015d0 |
(11) 0x401ed0 CMP W25, #0 |
(11) 0x401ed4 LDR X8, [X29, #168] |
(11) 0x401ed8 B.LE 401f28 |
(11) 0x401edc ORR X4, XZR, X23 |
(11) 0x401ee0 MOVZ X1, #0 |
(10) 0x401ee4 LDR W0, [X27, X1,LSL #2] |
(10) 0x401ee8 ADD X1, X1, #1 |
(10) 0x401eec LDP D28, D30, [X4], #16 |
(10) 0x401ef0 LDR D29, [X22, X0,SXTW #3] |
(10) 0x401ef4 SBFM X2, X0, #62, #31 |
(10) 0x401ef8 LDR D31, [X26, X0,SXTW #3] |
(10) 0x401efc LDR W7, [X8, X2] |
(10) 0x401f00 FADD D28, D29, D28 |
(10) 0x401f04 FADD D30, D31, D30 |
(10) 0x401f08 ADD W7, W7, #1 |
(10) 0x401f0c STR W7, [X8, X2] |
(10) 0x401f10 STR D28, [X22, X0,SXTW #3] |
(10) 0x401f14 STR D30, [X26, X0,SXTW #3] |
(10) 0x401f18 CMP W25, W1 |
(10) 0x401f1c B.GT 401ee4 |
(11) 0x401f20 CMP W24, #0 |
(11) 0x401f24 B.LE 401fb4 |
(11) 0x401f28 LDR X0, [X29, #128] |
(11) 0x401f2c ADD X4, X0, W20,UXTW #4 |
(11) 0x401f30 LDP X1, X0, [X29, #104] |
(11) 0x401f34 SUB X0, X4, X0 |
(11) 0x401f38 ADD X0, X0, #8 |
(11) 0x401f3c CMP X0, X1 |
(11) 0x401f40 B.LS 401fec |
(11) 0x401f44 MOVZ X1, #0 |
(11) 0x401f48 MOVZ X0, #0 |
(11) 0x401f4c WHILELO P5.D, WZR, W24 |
(11) 0x401f50 CNTD X12, ALL |
(11) 0x401f54 CNTW X7, ALL |
(11) 0x401f58 PTRUE P6.B, ALL |
(11) 0x401f5c FDUP Z23.D, #112 |
(14) 0x401f60 LD1W {Z1.D}, P5/Z, [X8, X0,LSL #2] |
(14) 0x401f64 ADD X2, X28, X1,LSL #3 |
(14) 0x401f68 CMPNE P7.S, P5/Z, Z1.S, #0 |
(14) 0x401f6c CMPEQ P4.S, P5/Z, Z1.S, #0 |
(14) 0x401f70 LD1D {Z24.D}, P7/Z, [X22, X0,LSL #3] |
(14) 0x401f74 LD1D {Z25.D}, P7/Z, [X26, X0,LSL #3] |
(14) 0x401f78 MOVPRFX Z0, Z1 |
(14) 0x401f7c SXTW Z0.D, P6/M, Z1.D |
(14) 0x401f80 SCVTF Z0.D, P6/M, Z0.D |
(14) 0x401f84 FDIVR Z0.D, P6/M, Z0.D, Z23.D |
(14) 0x401f88 FMUL Z24.D, Z0.D, Z24.D |
(14) 0x401f8c FMUL Z25.D, Z0.D, Z25.D |
(14) 0x401f90 ST2D {Z24.D, Z25.D}, P7, [X2, MUL VL] |
(14) 0x401f94 LD2D {Z26.D, Z27.D}, P4/Z, [X2, MUL VL] |
(14) 0x401f98 SEL Z26.D, P7, Z24.D, Z26.D |
(14) 0x401f9c SEL Z27.D, P7, Z25.D, Z27.D |
(14) 0x401fa0 ST2D {Z26.D, Z27.D}, P5, [X4, X1,LSL #3] |
(14) 0x401fa4 ADD X0, X0, X12 |
(14) 0x401fa8 ADD X1, X1, X7 |
(14) 0x401fac WHILELO P5.D, W0, W24 |
(14) 0x401fb0 B.NE 401f60 |
(11) 0x401fb4 LDR X0, [X29, #184] |
(11) 0x401fb8 ADD W20, W20, W24 |
(11) 0x401fbc ADD SP, X0, #0 |
(11) 0x401fc0 LDR W0, [X29, #156] |
(11) 0x401fc4 CMP W0, W19 |
(11) 0x401fc8 B.NE 401e4c |
(12) 0x401fcc ADD SP, X29, #0 |
(12) 0x401fd0 LDP X19, X20, [SP, #16] |
(12) 0x401fd4 LDP X21, X22, [SP, #32] |
(12) 0x401fd8 LDP X23, X24, [SP, #48] |
(12) 0x401fdc LDP X25, X26, [SP, #64] |
(12) 0x401fe0 LDP X27, X28, [SP, #80] |
(12) 0x401fe4 LDP X29, X30, [SP], #224 |
(12) 0x401fe8 RET |
(13) 0x401fec FMOV D22, #1.0000000 |
(13) 0x401ff0 MOVZ X0, #0 |
(13) 0x401ff4 ADD X1, X28, #8 |
(13) 0x401ff8 HINT #0 |
(13) 0x401ffc HINT #0 |
(9) 0x402000 LDR W2, [X8, X0,LSL #2] |
(9) 0x402004 CBNZ W2, 40202c |
(15) 0x402008 LDUR D31, [X1, #504] |
(15) 0x40200c ADD X0, X0, #1 |
(15) 0x402010 LDR D30, [X1], #16 |
(15) 0x402014 LDR X2, [X29, #160] |
(15) 0x402018 STP D31, D30, [X4], #16 |
(15) 0x40201c CMP X2, X0 |
(15) 0x402020 B.EQ 401fb4 |
(15) 0x402024 LDR W2, [X8, X0,LSL #2] |
(15) 0x402028 CBZ W2, 402008 |
(9) 0x40202c SCVTF D21, W2 |
(9) 0x402030 LDR D20, [X22, X0,LSL #3] |
(9) 0x402034 LDR X2, [X29, #160] |
(9) 0x402038 FDIV D21, D22, D21 |
(9) 0x40203c FMUL D20, D21, D20 |
(9) 0x402040 STUR D20, [X1, #504] |
(9) 0x402044 LDR D19, [X26, X0,LSL #3] |
(9) 0x402048 ADD X0, X0, #1 |
(9) 0x40204c FMUL D19, D21, D19 |
(9) 0x402050 STR D19, [X1], #16 |
(9) 0x402054 STP D20, D19, [X4], #16 |
(9) 0x402058 CMP X2, X0 |
(9) 0x40205c B.NE 402000 |
(13) 0x402060 LDR X0, [X29, #184] |
(13) 0x402064 ADD W20, W20, W24 |
(13) 0x402068 ADD SP, X0, #0 |
(13) 0x40206c LDR W0, [X29, #156] |
(13) 0x402070 CMP W0, W19 |
(13) 0x402074 B.NE 401e4c |
(12) 0x402078 B 401fcc |
(11) 0x40207c CMP W25, #0 |
(11) 0x402080 B.GT 401edc |
(11) 0x402084 B 401fb4 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:19 | kmeans-gcc-Ofast |
| ○ | __libc_start_main | libc-2.31.so | |
| ○ | _start | new_allocator.h:104 | kmeans-gcc-Ofast |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-gcc-Ofast |
| nb instructions | 40 |
| nb uops | 40 |
| loop length | 160 |
| used w registers | 9 |
| used x registers | 19 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 5.00 cycles |
| front end | 5.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.50 | 0.50 | 6.50 | 6.50 | 6.50 | 6.50 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 7.00 | 0.00 | 7.00 | 7.00 |
| cycles | 0.50 | 0.50 | 6.50 | 6.50 | 6.50 | 6.50 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 7.00 | 0.00 | 7.00 | 7.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.00 |
| Dispatch | 7.00 |
| Overall L1 | 7.00 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 30% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 36% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 22% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #800]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X4, [X29, #128] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR W0, [X29, #156] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (12.5%) |
| CMP W0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 401fcc <_Z7k_meansiP7point_tS0_PiS0_ii+0x220> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| SBFM X0, X6, #62, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X28, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X23, XZR, X1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X2, X0, #15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADRP X1, 401de8 <_Z7k_meansiP7point_tS0_PiS0_ii+0x3c> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CNTB X0, ALL, MUL #2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| AND X2, X2, #7932 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| SUB X0, X0, #16 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB W22, W6, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD X1, X1, #2912 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| SBFM X21, X6, #61, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| STR X0, [X29, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SBFM X0, X6, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X1, X2, [X29, #136] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| UBFM X2, X22, #62, #61 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X21, X21, #15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X27, XZR, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W25, WZR, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W24, WZR, W6 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| MOVZ W19, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| AND X21, X21, #7932 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X2, [X29, #120] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| ORR W20, WZR, W6 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| UBFM X2, X22, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X0, [X29, #160] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| ADD X0, X28, #16 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STR X2, [X29, #176] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR X0, [X29, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-gcc-Ofast |
| nb instructions | 40 |
| nb uops | 40 |
| loop length | 160 |
| used w registers | 9 |
| used x registers | 19 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 5.00 cycles |
| front end | 5.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 0.50 | 0.50 | 6.50 | 6.50 | 6.50 | 6.50 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 7.00 | 0.00 | 7.00 | 7.00 |
| cycles | 0.50 | 0.50 | 6.50 | 6.50 | 6.50 | 6.50 | 0.00 | 0.00 | 0.00 | 0.00 | 7.00 | 7.00 | 0.00 | 7.00 | 7.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.00 |
| Dispatch | 7.00 |
| Overall L1 | 7.00 |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 30% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | 36% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 22% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #800]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| STR X4, [X29, #128] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR W0, [X29, #156] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (12.5%) |
| CMP W0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 401fcc <_Z7k_meansiP7point_tS0_PiS0_ii+0x220> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| SBFM X0, X6, #62, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X28, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X23, XZR, X1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X2, X0, #15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADRP X1, 401de8 <_Z7k_meansiP7point_tS0_PiS0_ii+0x3c> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CNTB X0, ALL, MUL #2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | N/A |
| AND X2, X2, #7932 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| SUB X0, X0, #16 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SUB W22, W6, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD X1, X1, #2912 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X22, X22, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| SBFM X21, X6, #61, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| STR X0, [X29, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| SBFM X0, X6, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STP X1, X2, [X29, #136] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| UBFM X2, X22, #62, #61 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X21, X21, #15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X27, XZR, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR W25, WZR, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W24, WZR, W6 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| MOVZ W19, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| AND X21, X21, #7932 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X2, [X29, #120] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| ORR W20, WZR, W6 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| UBFM X2, X22, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STR X0, [X29, #160] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| ADD X0, X28, #16 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| STR X2, [X29, #176] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| STR X0, [X29, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| Run run_1_thread | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 16 |
| Run run_32_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 32 |
| Run run_48_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 48 |
| Run run_64_threads | Number processes: 1<br>Number nodes: 1<br>Number processes per node: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command: <br>Dataset: <br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: true<br>OMP_NUM_THREADS: 64 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_32_threads) Efficiency | (run_32_threads) Potential Speed-Up (%) | (run_48_threads) Efficiency | (run_48_threads) Potential Speed-Up (%) | (run_64_threads) Efficiency | (run_64_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.95 | 0.3 | 0.86 | 0.79 | 0.71 | 1.62 | 0.55 | 2.49 | 0.38 | 3.21 | 0.31 | 3.32 | 0.26 | 3.29 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 9.3249950408936 | 5.7632880210876 |
| run_2_threads | 1 | 0.95 | 1.89 | 2 | 9.3249998092651 | 5.666006565094 |
| run_4_threads | 1 | 0.86 | 3.43 | 4 | 9.3229932785034 | 5.5153603553772 |
| run_8_threads | 1 | 0.71 | 5.68 | 8 | 9.3279981613159 | 5.5828881263733 |
| run_16_threads | 1 | 0.55 | 8.72 | 16 | 9.3309926986694 | 5.4666028022766 |
| run_32_threads | 1 | 0.38 | 12.23 | 32 | 9.3459949493408 | 5.1994442939758 |
| run_48_threads | 1 | 0.31 | 14.78 | 48 | 9.3529930114746 | 4.7925524711609 |
| run_64_threads | 1 | 0.26 | 16.81 | 64 | 9.3549995422363 | 4.4609441757202 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 5.76 | 9.32 |
| ▼Loop 12 - main.cpp:56-96 - kmeans-gcc-Ofast– | 0.00 | 0.00 |
| ▼Loop 13 - main.cpp:56-96 - kmeans-gcc-Ofast– | 0.00 | 0.00 |
| ○Loop 9 - main.cpp:86-92 - kmeans-gcc-Ofast | 0.00 | 0.00 |
| ○Loop 15 - main.cpp:86-93 - kmeans-gcc-Ofast | 0.00 | 0.00 |
| ▼Loop 11 - main.cpp:56-95 - kmeans-gcc-Ofast– | 0.00 | 0.00 |
| ○Loop 10 - main.cpp:81-84 - kmeans-gcc-Ofast | 5.76 | 9.32 |
| ○Loop 14 - main.cpp:86-93 - kmeans-gcc-Ofast | 0.00 | 0.00 |
