| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-icpx-O3-aggressive | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 7.56% | (excl. loops): 0.00% |
|---|
| Function: k_means(int, point_t*, point_t*, int*, point_t*, int, int) | Module: kmeans-icpx-O3-aggressive | Source: main.cpp:55-96 [...] | Coverage (incl. loops): 7.56% | (excl. loops): 0.00% |
|---|
/home/fmusial/KMEANS_Benchmarks/kmeans/main.cpp: 55 - 96 |
-------------------------------------------------------------------------------- |
55: void k_means(int niters, point_t *points, point_t *centroids, int *assignment, point_t* memory, int n, int k) { |
56: for (int iter = 0; iter < niters; ++iter) { |
57: // determine nearest centroids |
58: #pragma omp parallel for |
59: for (int i = 0; i < n; ++i) { |
[...] |
73: int count[k]; |
74: double sum_x[k]; |
75: double sum_y[k]; |
76: for (int j = 0; j < k; ++j) { |
77: count[j] = 0; |
78: sum_x[j] = 0.; |
79: sum_y[j] = 0.; |
80: } |
81: for (int i = 0; i < n; ++i) { |
82: count[assignment[i]]++; |
83: sum_x[assignment[i]] += points[i].x; |
84: sum_y[assignment[i]] += points[i].y; |
85: } |
86: for (int j = 0; j < k; ++j) { |
87: if (count[j] != 0) { |
88: centroids[j].x = sum_x[j] / count[j]; |
89: centroids[j].y = sum_y[j] / count[j]; |
90: } |
91: // save centroids to memory |
92: memory[(iter + 1) * k + j].x = centroids[j].x; |
93: memory[(iter + 1) * k + j].y = centroids[j].y; |
94: } |
95: } |
96: } |
0x402c70 PUSH %RBP |
0x402c71 MOV %RSP,%RBP |
0x402c74 PUSH %R15 |
0x402c76 PUSH %R14 |
0x402c78 PUSH %R13 |
0x402c7a PUSH %R12 |
0x402c7c PUSH %RBX |
0x402c7d SUB $0xa8,%RSP |
0x402c84 MOV %R8,-0x50(%RBP) |
0x402c88 MOV %RSI,-0x38(%RBP) |
0x402c8c TEST %EDI,%EDI |
0x402c8e JLE 403180 |
0x402c94 MOV %RCX,%R15 |
0x402c97 MOV %RDX,%R12 |
0x402c9a MOV 0x10(%RBP),%ECX |
0x402c9d LEA -0x1(%R9),%EAX |
0x402ca1 MOV %RAX,-0xa0(%RBP) |
0x402ca8 MOV %ECX,%EDX |
0x402caa LEA (,%RDX,4),%RAX |
0x402cb2 MOV %RAX,-0x98(%RBP) |
0x402cb9 LEA (,%RDX,8),%RAX |
0x402cc1 MOV %RAX,-0x90(%RBP) |
0x402cc8 MOV %EDI,%EAX |
0x402cca MOV %R9D,%ESI |
0x402ccd DEC %RAX |
0x402cd0 MOV %RAX,-0xc0(%RBP) |
0x402cd7 LEA -0x1(%RDX),%RAX |
0x402cdb MOV %RAX,-0x70(%RBP) |
0x402cdf SAL $0x4,%RAX |
0x402ce3 ADD %R12,%RAX |
0x402ce6 ADD $0x8,%RAX |
0x402cea MOV %RAX,-0x78(%RBP) |
0x402cee MOV %RSI,-0x80(%RBP) |
0x402cf2 AND $0x7ffffffe,%ESI |
0x402cf8 MOV %RSI,-0xb8(%RBP) |
0x402cff MOV %EDX,%EAX |
0x402d01 AND $0x7ffffffc,%EAX |
0x402d06 MOV %RAX,-0x68(%RBP) |
0x402d0a LEA 0xf(,%RDX,4),%RAX |
0x402d12 AND $-0x10,%RAX |
0x402d16 MOV %RAX,-0xb0(%RBP) |
0x402d1d LEA 0xf(,%RDX,8),%RAX |
0x402d25 VMOVSD 0x72e1(%RIP),%XMM16 |
0x402d2f VBROADCASTSD 0x72d7(%RIP),%YMM17 |
0x402d39 VMOVUPD 0x72dd(%RIP),%YMM18 |
0x402d43 AND $-0x10,%RAX |
0x402d47 MOV %RAX,-0xa8(%RBP) |
0x402d4e VMOVUPD 0x72e8(%RIP),%YMM19 |
0x402d58 MOV -0x38(%RBP),%RAX |
0x402d5c ADD $0x18,%RAX |
0x402d60 MOV %RAX,-0x60(%RBP) |
0x402d64 MOV %RDX,-0x40(%RBP) |
0x402d68 LEA (%RDX,%RDX,1),%RAX |
0x402d6c MOV %RAX,-0xd0(%RBP) |
0x402d73 MOV %ECX,-0x2c(%RBP) |
0x402d76 MOVQ $0,-0x48(%RBP) |
0x402d7e MOV %R9,-0x58(%RBP) |
0x402d82 MOV %R15,-0x88(%RBP) |
0x402d89 JMP 402dc9 |
0x402d8b NOPW %CS:(%RAX,%RAX,1) |
0x402d9a NOPW (%RAX,%RAX,1) |
(5) 0x402da0 MOV -0xc8(%RBP),%RSP |
(5) 0x402da7 MOV -0x2c(%RBP),%EAX |
(5) 0x402daa ADD 0x10(%RBP),%EAX |
(5) 0x402dad MOV %EAX,-0x2c(%RBP) |
(5) 0x402db0 MOV -0x48(%RBP),%RAX |
(5) 0x402db4 CMP -0xc0(%RBP),%RAX |
(5) 0x402dbb LEA 0x1(%RAX),%RAX |
(5) 0x402dbf MOV %RAX,-0x48(%RBP) |
(5) 0x402dc3 JE 403180 |
(5) 0x402dc9 TEST %R9D,%R9D |
(5) 0x402dcc JLE 402e30 |
(5) 0x402dce SUB $0x8,%RSP |
(5) 0x402dd2 MOV $0x40e1a0,%EDI |
(5) 0x402dd7 MOV $0x403780,%EDX |
(5) 0x402ddc MOV $0x6,%ESI |
(5) 0x402de1 MOV -0x38(%RBP),%RCX |
(5) 0x402de5 MOV %R12,%R8 |
(5) 0x402de8 MOV %R15,%R9 |
(5) 0x402deb XOR %EAX,%EAX |
(5) 0x402ded PUSHQ -0xa0(%RBP) |
(5) 0x402df3 PUSH $0 |
(5) 0x402df5 PUSHQ -0x40(%RBP) |
(5) 0x402df8 VZEROUPPER |
(5) 0x402dfb CALL 402290 <__kmpc_fork_call@plt> |
(5) 0x402e00 VMOVUPD 0x7236(%RIP),%YMM19 |
(5) 0x402e0a VMOVUPD 0x720c(%RIP),%YMM18 |
(5) 0x402e14 VBROADCASTSD 0x71f2(%RIP),%YMM17 |
(5) 0x402e1e VMOVSD 0x71e8(%RIP),%XMM16 |
(5) 0x402e28 MOV -0x58(%RBP),%R9 |
(5) 0x402e2c ADD $0x20,%RSP |
(5) 0x402e30 MOV %RSP,-0xc8(%RBP) |
(5) 0x402e37 MOV %RSP,%R13 |
(5) 0x402e3a SUB -0xb0(%RBP),%R13 |
(5) 0x402e41 MOV %R13,%RSP |
(5) 0x402e44 MOV %RSP,%R14 |
(5) 0x402e47 MOV -0xa8(%RBP),%RAX |
(5) 0x402e4e SUB %RAX,%R14 |
(5) 0x402e51 MOV %R14,%RSP |
(5) 0x402e54 MOV %RSP,%RBX |
(5) 0x402e57 SUB %RAX,%RBX |
(5) 0x402e5a MOV %RBX,%RSP |
(5) 0x402e5d CMPL $0,0x10(%RBP) |
(5) 0x402e61 JLE 402ea3 |
(5) 0x402e63 MOV %R13,%RDI |
(5) 0x402e66 XOR %ESI,%ESI |
(5) 0x402e68 MOV -0x98(%RBP),%RDX |
(5) 0x402e6f VZEROUPPER |
(5) 0x402e72 CALL 405840 <__intel_avx_rep_memset> |
(5) 0x402e77 MOV %R14,%RDI |
(5) 0x402e7a XOR %ESI,%ESI |
(5) 0x402e7c MOV -0x90(%RBP),%R15 |
(5) 0x402e83 MOV %R15,%RDX |
(5) 0x402e86 CALL 405840 <__intel_avx_rep_memset> |
(5) 0x402e8b MOV %RBX,%RDI |
(5) 0x402e8e XOR %ESI,%ESI |
(5) 0x402e90 MOV %R15,%RDX |
(5) 0x402e93 MOV -0x88(%RBP),%R15 |
(5) 0x402e9a CALL 405840 <__intel_avx_rep_memset> |
(5) 0x402e9f MOV -0x58(%RBP),%R9 |
(5) 0x402ea3 TEST %R9D,%R9D |
(5) 0x402ea6 MOV -0xb8(%RBP),%RSI |
(5) 0x402ead JLE 402f67 |
(5) 0x402eb3 CMP $0x1,%R9D |
(5) 0x402eb7 JNE 402ec0 |
(5) 0x402eb9 XOR %EAX,%EAX |
(5) 0x402ebb JMP 402f2f |
0x402ebd NOPL (%RAX) |
(5) 0x402ec0 MOV -0x60(%RBP),%RCX |
(5) 0x402ec4 XOR %EAX,%EAX |
(5) 0x402ec6 NOPW %CS:(%RAX,%RAX,1) |
(9) 0x402ed0 MOVSXD (%R15,%RAX,4),%RDX |
(9) 0x402ed4 INCL (%R13,%RDX,4) |
(9) 0x402ed9 VMOVSD (%R14,%RDX,8),%XMM0 |
(9) 0x402edf VADDSD -0x18(%RCX),%XMM0,%XMM0 |
(9) 0x402ee4 VMOVSD %XMM0,(%R14,%RDX,8) |
(9) 0x402eea VMOVSD (%RBX,%RDX,8),%XMM0 |
(9) 0x402eef VADDSD -0x10(%RCX),%XMM0,%XMM0 |
(9) 0x402ef4 VMOVSD %XMM0,(%RBX,%RDX,8) |
(9) 0x402ef9 MOVSXD 0x4(%R15,%RAX,4),%RDX |
(9) 0x402efe INCL (%R13,%RDX,4) |
(9) 0x402f03 VMOVSD (%R14,%RDX,8),%XMM0 |
(9) 0x402f09 VADDSD -0x8(%RCX),%XMM0,%XMM0 |
(9) 0x402f0e VMOVSD %XMM0,(%R14,%RDX,8) |
(9) 0x402f14 VMOVSD (%RBX,%RDX,8),%XMM0 |
(9) 0x402f19 VADDSD (%RCX),%XMM0,%XMM0 |
(9) 0x402f1d VMOVSD %XMM0,(%RBX,%RDX,8) |
(9) 0x402f22 ADD $0x2,%RAX |
(9) 0x402f26 ADD $0x20,%RCX |
(9) 0x402f2a CMP %RAX,%RSI |
(9) 0x402f2d JNE 402ed0 |
(5) 0x402f2f TESTB $0x1,-0x80(%RBP) |
(5) 0x402f33 JE 402f67 |
(5) 0x402f35 MOVSXD (%R15,%RAX,4),%RCX |
(5) 0x402f39 INCL (%R13,%RCX,4) |
(5) 0x402f3e VMOVSD (%R14,%RCX,8),%XMM0 |
(5) 0x402f44 SAL $0x4,%RAX |
(5) 0x402f48 MOV -0x38(%RBP),%RDX |
(5) 0x402f4c VADDSD (%RDX,%RAX,1),%XMM0,%XMM0 |
(5) 0x402f51 VMOVSD %XMM0,(%R14,%RCX,8) |
(5) 0x402f57 VMOVSD (%RBX,%RCX,8),%XMM0 |
(5) 0x402f5c VADDSD 0x8(%RDX,%RAX,1),%XMM0,%XMM0 |
(5) 0x402f62 VMOVSD %XMM0,(%RBX,%RCX,8) |
(5) 0x402f67 CMPL $0,0x10(%RBP) |
(5) 0x402f6b JLE 402da0 |
(5) 0x402f71 MOV -0x2c(%RBP),%EAX |
(5) 0x402f74 SAL $0x4,%RAX |
(5) 0x402f78 MOV -0x50(%RBP),%RDI |
(5) 0x402f7c LEA (%RDI,%RAX,1),%RCX |
(5) 0x402f80 MOV -0x48(%RBP),%RDX |
(5) 0x402f84 INC %EDX |
(5) 0x402f86 IMUL 0x10(%RBP),%EDX |
(5) 0x402f8a MOV %RDX,%RSI |
(5) 0x402f8d SAL $0x4,%RSI |
(5) 0x402f91 ADD %RDI,%RSI |
(5) 0x402f94 CMP %RSI,-0x78(%RBP) |
(5) 0x402f98 JB 403020 |
(5) 0x402f9e ADD -0x70(%RBP),%RDX |
(5) 0x402fa2 SAL $0x4,%RDX |
(5) 0x402fa6 ADD %RDI,%RDX |
(5) 0x402fa9 ADD $0x8,%RDX |
(5) 0x402fad CMP %R12,%RDX |
(5) 0x402fb0 JB 403020 |
(5) 0x402fb2 XOR %EAX,%EAX |
(5) 0x402fb4 JMP 402ff9 |
0x402fb6 NOPW %CS:(%RAX,%RAX,1) |
(8) 0x402fc0 VCVTSI2SD %EDX,%XMM3,%XMM0 |
(8) 0x402fc4 VMOVSD (%R14,%RAX,4),%XMM1 |
(8) 0x402fca VMOVHPD (%RBX,%RAX,4),%XMM1,%XMM1 |
(8) 0x402fcf VDIVSD %XMM0,%XMM16,%XMM0 |
(8) 0x402fd5 VMOVDDUP %XMM0,%XMM0 |
(8) 0x402fd9 VMULPD %XMM0,%XMM1,%XMM0 |
(8) 0x402fdd VMOVUPD %XMM0,(%R12,%RAX,8) |
(8) 0x402fe3 VMOVUPD %XMM0,(%RCX,%RAX,8) |
(8) 0x402fe8 ADD $0x2,%RAX |
(8) 0x402fec CMP %RAX,-0xd0(%RBP) |
(8) 0x402ff3 JE 402da0 |
(8) 0x402ff9 MOV (%R13,%RAX,2),%EDX |
(8) 0x402ffe TEST %EDX,%EDX |
(8) 0x403000 JNE 402fc0 |
(8) 0x403002 VMOVUPD (%R12,%RAX,8),%XMM0 |
(8) 0x403008 VMOVUPD %XMM0,(%RCX,%RAX,8) |
(8) 0x40300d ADD $0x2,%RAX |
(8) 0x403011 CMP %RAX,-0xd0(%RBP) |
(8) 0x403018 JNE 402ff9 |
(5) 0x40301a JMP 402da0 |
0x40301f NOP |
(5) 0x403020 MOV -0x68(%RBP),%RDI |
(5) 0x403024 TEST %RDI,%RDI |
(5) 0x403027 JE 4030e0 |
(5) 0x40302d XOR %EDX,%EDX |
(5) 0x40302f XOR %ESI,%ESI |
(5) 0x403031 NOPW %CS:(%RAX,%RAX,1) |
(7) 0x403040 VMOVDQU (%R13,%RSI,4),%XMM0 |
(7) 0x403047 VCVTDQ2PD %XMM0,%YMM1 |
(7) 0x40304b VPTESTMD %XMM0,%XMM0,%K1 |
(7) 0x403051 VDIVPD %YMM1,%YMM17,%YMM0 |
(7) 0x403057 VMOVUPD (%R14,%RSI,8),%YMM1{%K1}{z} |
(7) 0x40305e VMOVUPD (%RBX,%RSI,8),%YMM2{%K1}{z} |
(7) 0x403065 VMULPD %YMM0,%YMM1,%YMM1 |
(7) 0x403069 VMULPD %YMM0,%YMM2,%YMM0 |
(7) 0x40306d VMOVAPD %YMM1,%YMM2 |
(7) 0x403071 VPERMT2PD %YMM0,%YMM18,%YMM2 |
(7) 0x403077 VPERMT2PD %YMM0,%YMM19,%YMM1 |
(7) 0x40307d VPMOVM2D %K1,%YMM0 |
(7) 0x403083 VPMOVSXDQ %XMM0,%YMM0 |
(7) 0x403088 VPMOVD2M %YMM0,%K1 |
(7) 0x40308e VMOVUPD %YMM1,(%R12,%RDX,1){%K1} |
(7) 0x403095 KSHIFTRB $0x4,%K1,%K1 |
(7) 0x40309b VMOVUPD %YMM2,0x20(%R12,%RDX,1){%K1} |
(7) 0x4030a3 VMOVUPD (%R12,%RDX,1),%YMM0 |
(7) 0x4030a9 VMOVUPD 0x20(%R12,%RDX,1),%YMM1 |
(7) 0x4030b0 VMOVUPD %YMM1,0x20(%RCX,%RDX,1) |
(7) 0x4030b6 VMOVUPD %YMM0,(%RCX,%RDX,1) |
(7) 0x4030bb ADD $0x4,%RSI |
(7) 0x4030bf ADD $0x40,%RDX |
(7) 0x4030c3 CMP %RDI,%RSI |
(7) 0x4030c6 JB 403040 |
(5) 0x4030cc MOV %RDI,%R8 |
(5) 0x4030cf CMP -0x40(%RBP),%RDI |
(5) 0x4030d3 JE 402da0 |
(5) 0x4030d9 JMP 4030e3 |
0x4030db NOPL (%RAX,%RAX,1) |
(5) 0x4030e0 XOR %R8D,%R8D |
(5) 0x4030e3 MOV %R8,%RCX |
(5) 0x4030e6 SAL $0x4,%RCX |
(5) 0x4030ea ADD %RCX,%RAX |
(5) 0x4030ed ADD -0x50(%RBP),%RAX |
(5) 0x4030f1 ADD %R12,%RCX |
(5) 0x4030f4 MOV -0x40(%RBP),%RDX |
(5) 0x4030f8 SUB %R8,%RDX |
(5) 0x4030fb LEA (%RBX,%R8,8),%RSI |
(5) 0x4030ff LEA (%R14,%R8,8),%RDI |
(5) 0x403103 LEA (,%R8,4),%R8 |
(5) 0x40310b ADD %R13,%R8 |
(5) 0x40310e XOR %R11D,%R11D |
(5) 0x403111 JMP 403158 |
0x403113 NOPW %CS:(%RAX,%RAX,1) |
(6) 0x403120 VCVTSI2SD %R10D,%XMM3,%XMM0 |
(6) 0x403125 VMOVSD (%RDI,%R11,2),%XMM1 |
(6) 0x40312b VMOVHPD (%RSI,%R11,2),%XMM1,%XMM1 |
(6) 0x403131 VDIVSD %XMM0,%XMM16,%XMM0 |
(6) 0x403137 VMOVDDUP %XMM0,%XMM0 |
(6) 0x40313b VMULPD %XMM0,%XMM1,%XMM0 |
(6) 0x40313f VMOVUPD %XMM0,(%RCX,%R11,4) |
(6) 0x403145 VMOVUPD %XMM0,(%RAX,%R11,4) |
(6) 0x40314b ADD $0x4,%R11 |
(6) 0x40314f DEC %RDX |
(6) 0x403152 JE 402da0 |
(6) 0x403158 MOV (%R8,%R11,1),%R10D |
(6) 0x40315c TEST %R10D,%R10D |
(6) 0x40315f JNE 403120 |
(6) 0x403161 VMOVUPD (%RCX,%R11,4),%XMM0 |
(6) 0x403167 JMP 403145 |
0x403169 NOPW %CS:(%RAX,%RAX,1) |
0x403178 NOPL (%RAX,%RAX,1) |
0x403180 LEA -0x28(%RBP),%RSP |
0x403184 POP %RBX |
0x403185 POP %R12 |
0x403187 POP %R13 |
0x403189 POP %R14 |
0x40318b POP %R15 |
0x40318d POP %RBP |
0x40318e VZEROUPPER |
0x403191 RET |
0x403192 NOPW %CS:(%RAX,%RAX,1) |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:125 | kmeans-icpx-O3-aggressive |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-icpx-O3-aggressive |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:125 | kmeans-icpx-O3-aggressive |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-icpx-O3-aggressive |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:125 | kmeans-icpx-O3-aggressive |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-icpx-O3-aggressive |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:125 | kmeans-icpx-O3-aggressive |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-icpx-O3-aggressive |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:125 | kmeans-icpx-O3-aggressive |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-icpx-O3-aggressive |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | main | main.cpp:125 | kmeans-icpx-O3-aggressive |
| ○ | __libc_init_first | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | kmeans-icpx-O3-aggressive |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-icpx-O3-aggressive |
| nb instructions | 78 |
| nb uops | 81 |
| loop length | 391 |
| used x86 registers | 14 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 3 |
| used zmm registers | 0 |
| nb stack references | 22 |
| micro-operation queue | 20.25 cycles |
| front end | 20.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 5.50 | 5.50 | 13.00 | 13.00 | 26.00 | 5.50 | 5.50 | 13.00 |
| cycles | 5.50 | 6.50 | 13.00 | 13.00 | 26.00 | 5.50 | 5.50 | 13.00 |
| Cycles executing div or sqrt instructions | NA |
| FE+BE cycles | 26.12 |
| Stall cycles | 6.22 |
| SB full (events) | 9.56 |
| Front-end | 20.25 |
| Dispatch | 26.00 |
| Overall L1 | 26.00 |
| all | 3% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 16% |
| all | 50% |
| load | 50% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 9% |
| load | 40% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 14% |
| all | 11% |
| load | 6% |
| store | 11% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 11% |
| all | 31% |
| load | 31% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 12% |
| all | 14% |
| load | 26% |
| store | 11% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 11% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| PUSH %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| SUB $0xa8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %R8,-0x50(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,-0x38(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| TEST %EDI,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JLE 403180 <_Z7k_meansiP7point_tS0_PiS0_ii+0x510> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RCX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| MOV %RDX,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| MOV 0x10(%RBP),%ECX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| LEA -0x1(%R9),%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0xa0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA (,%RDX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA (,%RDX,8),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0x90(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %EDI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R9D,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| DEC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0xc0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA -0x1(%RDX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| SAL $0x4,%RAX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| ADD %R12,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD $0x8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,-0x80(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV %RSI,-0xb8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x7ffffffc,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA 0xf(,%RDX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| AND $-0x10,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0xb0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA 0xf(,%RDX,8),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVSD 0x72e1(%RIP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VBROADCASTSD 0x72d7(%RIP),%YMM17 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 | scal (12.5%) |
| VMOVUPD 0x72dd(%RIP),%YMM18 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| AND $-0x10,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0xa8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VMOVUPD 0x72e8(%RIP),%YMM19 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| MOV -0x38(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD $0x18,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0x60(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,-0x40(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA (%RDX,%RDX,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0xd0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %ECX,-0x2c(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOVQ $0,-0x48(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 2 | 1 | scal (6.3%) |
| MOV %R9,-0x58(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R15,-0x88(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| JMP 402dc9 <_Z7k_meansiP7point_tS0_PiS0_ii+0x159> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA -0x28(%RBP),%RSP | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.00% of application time for run run_1_thread
| Source file and lines | main.cpp:55-96 |
| Module | kmeans-icpx-O3-aggressive |
| nb instructions | 78 |
| nb uops | 81 |
| loop length | 391 |
| used x86 registers | 14 |
| used mmx registers | 0 |
| used xmm registers | 1 |
| used ymm registers | 3 |
| used zmm registers | 0 |
| nb stack references | 22 |
| micro-operation queue | 20.25 cycles |
| front end | 20.25 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
|---|---|---|---|---|---|---|---|---|
| uops | 5.50 | 5.50 | 13.00 | 13.00 | 26.00 | 5.50 | 5.50 | 13.00 |
| cycles | 5.50 | 6.50 | 13.00 | 13.00 | 26.00 | 5.50 | 5.50 | 13.00 |
| Cycles executing div or sqrt instructions | NA |
| FE+BE cycles | 26.12 |
| Stall cycles | 6.22 |
| SB full (events) | 9.56 |
| Front-end | 20.25 |
| Dispatch | 26.00 |
| Overall L1 | 26.00 |
| all | 3% |
| load | 0% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 16% |
| all | 50% |
| load | 50% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 9% |
| load | 40% |
| store | 0% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 14% |
| all | 11% |
| load | 6% |
| store | 11% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 11% |
| all | 31% |
| load | 31% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 12% |
| all | 14% |
| load | 26% |
| store | 11% |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 12% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 11% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| PUSH %R15 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %R14 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %R13 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %R12 | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
| SUB $0xa8,%RSP | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
| MOV %R8,-0x50(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,-0x38(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| TEST %EDI,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| JLE 403180 <_Z7k_meansiP7point_tS0_PiS0_ii+0x510> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
| MOV %RCX,%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| MOV %RDX,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (12.5%) |
| MOV 0x10(%RBP),%ECX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (6.3%) |
| LEA -0x1(%R9),%EAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0xa0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %ECX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA (,%RDX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0x98(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA (,%RDX,8),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0x90(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %EDI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| MOV %R9D,%ESI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | scal (6.3%) |
| DEC %RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0xc0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA -0x1(%RDX),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0x70(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| SAL $0x4,%RAX | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
| ADD %R12,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| ADD $0x8,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0x78(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RSI,-0x80(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| AND $0x7ffffffe,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (6.3%) |
| MOV %RSI,-0xb8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %EDX,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| AND $0x7ffffffc,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0x68(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA 0xf(,%RDX,4),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| AND $-0x10,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0xb0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA 0xf(,%RDX,8),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| VMOVSD 0x72e1(%RIP),%XMM16 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | scal (12.5%) |
| VBROADCASTSD 0x72d7(%RIP),%YMM17 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 | scal (12.5%) |
| VMOVUPD 0x72dd(%RIP),%YMM18 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| AND $-0x10,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0xa8(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| VMOVUPD 0x72e8(%RIP),%YMM19 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 | vect (50.0%) |
| MOV -0x38(%RBP),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
| ADD $0x18,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
| MOV %RAX,-0x60(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %RDX,-0x40(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| LEA (%RDX,%RDX,1),%RAX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| MOV %RAX,-0xd0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %ECX,-0x2c(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (6.3%) |
| MOVQ $0,-0x48(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 2 | 1 | scal (6.3%) |
| MOV %R9,-0x58(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| MOV %R15,-0x88(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | scal (12.5%) |
| JMP 402dc9 <_Z7k_meansiP7point_tS0_PiS0_ii+0x159> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| LEA -0x28(%RBP),%RSP | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
| POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R12 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %R15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | vect (25.0%) |
| RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
| NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
| Run run_1_thread | Number processes: 1<br>Number nodes: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command:<br>Dataset:<br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: close<br>OMP_NUM_THREADS: 1 |
|---|---|
| Run run_2_threads | Number processes: 1<br>Number nodes: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command:<br>Dataset:<br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: close<br>OMP_NUM_THREADS: 2 |
| Run run_4_threads | Number processes: 1<br>Number nodes: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command:<br>Dataset:<br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: close<br>OMP_NUM_THREADS: 4 |
| Run run_8_threads | Number processes: 1<br>Number nodes: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command:<br>Dataset:<br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: close<br>OMP_NUM_THREADS: 8 |
| Run run_16_threads | Number processes: 1<br>Number nodes: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command:<br>Dataset:<br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: close<br>OMP_NUM_THREADS: 16 |
| Run run_26_threads | Number processes: 1<br>Number nodes: 1<br>Run Command: <executable> input/100000000.in 1000 100000000 50 25<br>MPI Command:<br>Dataset:<br>Run Directory: /home/fmusial/KMEANS_Benchmarks<br>OMP_PROC_BIND: close<br>OMP_NUM_THREADS: 26 |
| (run_1_thread) Efficiency | (run_1_thread) Potential Speed-Up (%) | (run_2_threads) Efficiency | (run_2_threads) Potential Speed-Up (%) | (run_4_threads) Efficiency | (run_4_threads) Potential Speed-Up (%) | (run_8_threads) Efficiency | (run_8_threads) Potential Speed-Up (%) | (run_16_threads) Efficiency | (run_16_threads) Potential Speed-Up (%) | (run_26_threads) Efficiency | (run_26_threads) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0.96 | 0.27 | 0.9 | 0.71 | 0.8 | 1.2 | 0.7 | 1.53 | 0.63 | 1.54 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| run_1_thread | 1 | 1 | 1 | 1 | 11.210000038147 | 7.5592584609985 |
| run_2_threads | 1 | 0.96 | 1.93 | 2 | 11.16999912262 | 7.3025631904602 |
| run_4_threads | 1 | 0.9 | 3.59 | 4 | 11.174998283386 | 6.8756542205811 |
| run_8_threads | 1 | 0.8 | 6.44 | 8 | 11.174999237061 | 6.1521096229553 |
| run_16_threads | 1 | 0.7 | 11.17 | 16 | 11.174999237061 | 5.082085609436 |
| run_26_threads | 1 | 0.63 | 16.36 | 26 | 11.164999961853 | 4.1668996810913 |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼k_means(int, point_t*, point_t*, int*, point_t*, int, int)– | 7.56 | 11.21 |
| ▼Loop 5 - main.cpp:56-93 - kmeans-icpx-O3-aggressive– | 0.00 | 0.00 |
| ○Loop 9 - main.cpp:81-84 - kmeans-icpx-O3-aggressive | 7.56 | 11.21 |
| ○Loop 6 - main.cpp:86-92 - kmeans-icpx-O3-aggressive | 0.00 | 0.00 |
| ○Loop 8 - main.cpp:86-92 - kmeans-icpx-O3-aggressive | 0.00 | 0.00 |
| ○Loop 7 - main.cpp:86-93 - kmeans-icpx-O3-aggressive | 0.00 | 0.00 |
