Function: Step10_orig | Module: exec | Source: Step10_orig.c:10-41 [...] | Coverage: 99.96% |
---|
Function: Step10_orig | Module: exec | Source: Step10_orig.c:10-41 [...] | Coverage: 99.96% |
---|
/home/kcamus/qaas_runs/169-401-3406/intel/HACCmk/build/HACCmk/src/Step10_orig.c: 10 - 41 |
-------------------------------------------------------------------------------- |
10: { |
[...] |
19: for ( j = 0; j < count1; j++ ) |
20: { |
21: dxc = xx1[j] - xxi; |
22: dyc = yy1[j] - yyi; |
23: dzc = zz1[j] - zzi; |
24: |
25: r2 = dxc * dxc + dyc * dyc + dzc * dzc; |
26: |
27: m = ( r2 < fsrrmax2 ) ? mass1[j] : 0.0f; |
28: |
29: f = pow( r2 + mp_rsm2, -1.5 ) - ( ma0 + r2*(ma1 + r2*(ma2 + r2*(ma3 + r2*(ma4 + r2*ma5))))); |
30: |
31: f = ( r2 > 0.0f ) ? m * f : 0.0f; |
32: |
33: xi = xi + f * dxc; |
34: yi = yi + f * dyc; |
35: zi = zi + f * dzc; |
36: } |
37: |
38: *dxi = xi; |
39: *dyi = yi; |
40: *dzi = zi; |
41: } |
0x401b50 PUSH %RBP |
0x401b51 MOV %RSP,%RBP |
0x401b54 PUSH %RBX |
0x401b55 MOV 0x18(%RBP),%R10 |
0x401b59 MOV 0x10(%RBP),%R11 |
0x401b5d TEST %EDI,%EDI |
0x401b5f JLE 401d4f |
0x401b65 MOV %EDI,%EAX |
0x401b67 MOV $-0x8,%EDI |
0x401b6c AND %RAX,%RDI |
0x401b6f JE 401d60 |
0x401b75 VBROADCASTSS %XMM0,%YMM6 |
0x401b7a VBROADCASTSS %XMM1,%YMM5 |
0x401b7f VBROADCASTSS %XMM2,%YMM2 |
0x401b84 VBROADCASTSS %XMM3,%YMM1 |
0x401b89 VBROADCASTSS %XMM4,%YMM27 |
0x401b8f VXORPS %XMM22,%XMM22,%XMM22 |
0x401b95 VBROADCASTSD 0x669a(%RIP),%YMM8 |
0x401b9e VBROADCASTSS 0x6699(%RIP),%YMM9 |
0x401ba7 VBROADCASTSS 0x6694(%RIP),%YMM10 |
0x401bb0 VBROADCASTSS 0x668f(%RIP),%YMM11 |
0x401bb9 VBROADCASTSS 0x668a(%RIP),%YMM13 |
0x401bc2 VBROADCASTSS 0x6685(%RIP),%YMM14 |
0x401bcb XOR %EBX,%EBX |
0x401bcd VBROADCASTSS 0x667e(%RIP),%YMM15 |
0x401bd6 VXORPS %XMM12,%XMM12,%XMM12 |
0x401bdb VXORPS %XMM7,%XMM7,%XMM7 |
0x401bdf VXORPS %XMM4,%XMM4,%XMM4 |
0x401be3 NOPW %CS:(%RAX,%RAX,1) |
(5) 0x401bf0 VMOVUPS (%RSI,%RBX,4),%YMM16 |
(5) 0x401bf7 VSUBPS %YMM6,%YMM16,%YMM16 |
(5) 0x401bfd VMOVUPS (%RDX,%RBX,4),%YMM17 |
(5) 0x401c04 VSUBPS %YMM5,%YMM17,%YMM17 |
(5) 0x401c0a VMOVUPS (%RCX,%RBX,4),%YMM18 |
(5) 0x401c11 VSUBPS %YMM2,%YMM18,%YMM18 |
(5) 0x401c17 VMULPS %YMM16,%YMM16,%YMM19 |
(5) 0x401c1d VFMADD231PS %YMM17,%YMM17,%YMM19 |
(5) 0x401c23 VFMADD231PS %YMM18,%YMM18,%YMM19 |
(5) 0x401c29 VCMPPS $0x1,%YMM1,%YMM19,%K1 |
(5) 0x401c30 VMOVUPS (%R8,%RBX,4),%YMM20{%K1}{z} |
(5) 0x401c37 VADDPS %YMM27,%YMM19,%YMM21 |
(5) 0x401c3d VEXTRACTF32X4 $0x1,%YMM21,%XMM3 |
(5) 0x401c44 VCVTPS2PD %XMM3,%YMM3 |
(5) 0x401c48 VCVTPS2PD %XMM21,%YMM21 |
(5) 0x401c4e VSQRTPD %YMM21,%YMM23 |
(5) 0x401c54 VSQRTPD %YMM3,%YMM24 |
(5) 0x401c5a VMULPD %YMM21,%YMM21,%YMM21 |
(5) 0x401c60 VDIVPD %YMM21,%YMM8,%YMM21 |
(5) 0x401c66 VMULPD %YMM3,%YMM3,%YMM3 |
(5) 0x401c6a VMOVAPS %YMM9,%YMM25 |
(5) 0x401c70 VFMADD213PS %YMM10,%YMM19,%YMM25 |
(5) 0x401c76 VFMADD213PS %YMM11,%YMM19,%YMM25 |
(5) 0x401c7c VFMADD213PS %YMM13,%YMM19,%YMM25 |
(5) 0x401c82 VFMADD213PS %YMM14,%YMM19,%YMM25 |
(5) 0x401c88 VFMADD213PS %YMM15,%YMM19,%YMM25 |
(5) 0x401c8e VCVTPS2PD %XMM25,%YMM26 |
(5) 0x401c94 VDIVPD %YMM3,%YMM8,%YMM3 |
(5) 0x401c98 VFMADD231PD %YMM21,%YMM23,%YMM26 |
(5) 0x401c9e VEXTRACTF32X4 $0x1,%YMM25,%XMM0 |
(5) 0x401ca5 VCVTPS2PD %XMM0,%YMM0 |
(5) 0x401ca9 VFMADD231PD %YMM3,%YMM24,%YMM0 |
(5) 0x401caf VCVTPD2PS %YMM26,%XMM3 |
(5) 0x401cb5 VCVTPD2PS %YMM0,%XMM0 |
(5) 0x401cb9 VINSERTF128 $0x1,%XMM0,%YMM3,%YMM0 |
(5) 0x401cbf VCMPPS $0x1,%YMM19,%YMM22,%K1 |
(5) 0x401cc6 VMULPS %YMM0,%YMM20,%YMM0{%K1}{z} |
(5) 0x401ccc VFMADD231PS %YMM16,%YMM0,%YMM12 |
(5) 0x401cd2 VFMADD231PS %YMM17,%YMM0,%YMM7 |
(5) 0x401cd8 VFMADD231PS %YMM18,%YMM0,%YMM4 |
(5) 0x401cde ADD $0x8,%RBX |
(5) 0x401ce2 CMP %RDI,%RBX |
(5) 0x401ce5 JB 401bf0 |
0x401ceb VEXTRACTF128 $0x1,%YMM12,%XMM0 |
0x401cf1 VADDPS %XMM0,%XMM12,%XMM0 |
0x401cf5 VPERMILPD $0x1,%XMM0,%XMM3 |
0x401cfb VADDPS %XMM3,%XMM0,%XMM0 |
0x401cff VMOVSHDUP %XMM0,%XMM3 |
0x401d03 VADDSS %XMM3,%XMM0,%XMM3 |
0x401d07 VEXTRACTF128 $0x1,%YMM7,%XMM0 |
0x401d0d VADDPS %XMM0,%XMM7,%XMM0 |
0x401d11 VPERMILPD $0x1,%XMM0,%XMM7 |
0x401d17 VADDPS %XMM7,%XMM0,%XMM0 |
0x401d1b VMOVSHDUP %XMM0,%XMM7 |
0x401d1f VADDSS %XMM7,%XMM0,%XMM7 |
0x401d23 VEXTRACTF128 $0x1,%YMM4,%XMM0 |
0x401d29 VADDPS %XMM0,%XMM4,%XMM0 |
0x401d2d VPERMILPD $0x1,%XMM0,%XMM4 |
0x401d33 VADDPS %XMM4,%XMM0,%XMM0 |
0x401d37 VMOVSHDUP %XMM0,%XMM4 |
0x401d3b VADDSS %XMM4,%XMM0,%XMM4 |
0x401d3f VPBROADCASTQ %RAX,%YMM8 |
0x401d45 CMP %RAX,%RDI |
0x401d48 JNE 401d8e |
0x401d4a JMP 401f15 |
0x401d4f VXORPS %XMM3,%XMM3,%XMM3 |
0x401d53 VXORPS %XMM7,%XMM7,%XMM7 |
0x401d57 VXORPS %XMM4,%XMM4,%XMM4 |
0x401d5b JMP 401f15 |
0x401d60 VPBROADCASTQ %RAX,%YMM8 |
0x401d66 VBROADCASTSS %XMM0,%YMM6 |
0x401d6b VBROADCASTSS %XMM1,%YMM5 |
0x401d70 VBROADCASTSS %XMM2,%YMM2 |
0x401d75 VBROADCASTSS %XMM3,%YMM1 |
0x401d7a VBROADCASTSS %XMM4,%YMM27 |
0x401d80 VXORPS %XMM4,%XMM4,%XMM4 |
0x401d84 XOR %EDI,%EDI |
0x401d86 VXORPS %XMM7,%XMM7,%XMM7 |
0x401d8a VXORPS %XMM3,%XMM3,%XMM3 |
0x401d8e VPBROADCASTQ %RDI,%YMM9 |
0x401d94 VPADDQ 0x64c4(%RIP),%YMM9,%YMM10 |
0x401d9c VPADDQ 0x64dc(%RIP),%YMM9,%YMM9 |
0x401da4 VPCMPLTUQ %YMM8,%YMM9,%K0 |
0x401dab VPCMPLTUQ %YMM8,%YMM10,%K1 |
0x401db2 KSHIFTLB $0x4,%K1,%K1 |
0x401db8 KORB %K1,%K0,%K1 |
0x401dbc VMOVUPS (%RSI,%RDI,4),%YMM8{%K1}{z} |
0x401dc3 VSUBPS %YMM6,%YMM8,%YMM6 |
0x401dc7 VMOVUPS (%RDX,%RDI,4),%YMM8{%K1}{z} |
0x401dce VSUBPS %YMM5,%YMM8,%YMM5 |
0x401dd2 VMOVUPS (%RCX,%RDI,4),%YMM8{%K1}{z} |
0x401dd9 VSUBPS %YMM2,%YMM8,%YMM2 |
0x401ddd VMULPS %YMM6,%YMM6,%YMM8 |
0x401de1 VFMADD231PS %YMM5,%YMM5,%YMM8 |
0x401de6 VFMADD231PS %YMM2,%YMM2,%YMM8 |
0x401deb VCMPPS $0x1,%YMM1,%YMM8,%K2{%K1} |
0x401df2 VMOVUPS (%R8,%RDI,4),%YMM10{%K2}{z} |
0x401df9 VPXOR %XMM9,%XMM9,%XMM9 |
0x401dfe VADDPS %YMM27,%YMM8,%YMM0 |
0x401e04 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x401e0a VCVTPS2PD %XMM1,%YMM1 |
0x401e0e VCVTPS2PD %XMM0,%YMM0 |
0x401e12 VSQRTPD %YMM0,%YMM11 |
0x401e16 VSQRTPD %YMM1,%YMM12 |
0x401e1a VBROADCASTSD 0x6415(%RIP),%YMM13 |
0x401e23 VMULPD %YMM0,%YMM0,%YMM0 |
0x401e27 VDIVPD %YMM0,%YMM13,%YMM0 |
0x401e2b VMULPD %YMM1,%YMM1,%YMM1 |
0x401e2f VDIVPD %YMM1,%YMM13,%YMM1 |
0x401e33 VBROADCASTSS 0x6404(%RIP),%YMM13 |
0x401e3c VFMADD213PS 0x63fe(%RIP){1to8},%YMM8,%YMM13 |
0x401e46 VFMADD213PS 0x63f8(%RIP){1to8},%YMM8,%YMM13 |
0x401e50 VFMADD213PS 0x63f2(%RIP){1to8},%YMM8,%YMM13 |
0x401e5a VFMADD213PS 0x63ec(%RIP){1to8},%YMM8,%YMM13 |
0x401e64 VFMADD213PS 0x63e6(%RIP){1to8},%YMM8,%YMM13 |
0x401e6e VCVTPS2PD %XMM13,%YMM14 |
0x401e73 VFMADD231PD %YMM0,%YMM11,%YMM14 |
0x401e78 VEXTRACTF128 $0x1,%YMM13,%XMM0 |
0x401e7e VCVTPS2PD %XMM0,%YMM0 |
0x401e82 VCVTPD2PS %YMM14,%XMM11 |
0x401e87 VFMADD231PD %YMM1,%YMM12,%YMM0 |
0x401e8c VCVTPD2PS %YMM0,%XMM0 |
0x401e90 VINSERTF128 $0x1,%XMM0,%YMM11,%YMM0 |
0x401e96 VCMPPS $0x1,%YMM8,%YMM9,%K2 |
0x401e9d VMULPS %YMM0,%YMM10,%YMM0{%K2}{z} |
0x401ea3 VMULPS %YMM6,%YMM0,%YMM1{%K1}{z} |
0x401ea9 VMULPS %YMM5,%YMM0,%YMM5{%K1}{z} |
0x401eaf VMULPS %YMM2,%YMM0,%YMM0{%K1}{z} |
0x401eb5 VEXTRACTF128 $0x1,%YMM1,%XMM2 |
0x401ebb VADDPS %XMM2,%XMM1,%XMM1 |
0x401ebf VPERMILPD $0x1,%XMM1,%XMM2 |
0x401ec5 VADDPS %XMM2,%XMM1,%XMM1 |
0x401ec9 VMOVSHDUP %XMM1,%XMM2 |
0x401ecd VADDSS %XMM2,%XMM1,%XMM1 |
0x401ed1 VADDSS %XMM1,%XMM3,%XMM3 |
0x401ed5 VEXTRACTF128 $0x1,%YMM5,%XMM1 |
0x401edb VADDPS %XMM1,%XMM5,%XMM1 |
0x401edf VPERMILPD $0x1,%XMM1,%XMM2 |
0x401ee5 VADDPS %XMM2,%XMM1,%XMM1 |
0x401ee9 VMOVSHDUP %XMM1,%XMM2 |
0x401eed VADDSS %XMM2,%XMM1,%XMM1 |
0x401ef1 VADDSS %XMM1,%XMM7,%XMM7 |
0x401ef5 VEXTRACTF128 $0x1,%YMM0,%XMM1 |
0x401efb VADDPS %XMM1,%XMM0,%XMM0 |
0x401eff VPERMILPD $0x1,%XMM0,%XMM1 |
0x401f05 VADDPS %XMM1,%XMM0,%XMM0 |
0x401f09 VMOVSHDUP %XMM0,%XMM1 |
0x401f0d VADDSS %XMM1,%XMM0,%XMM0 |
0x401f11 VADDSS %XMM0,%XMM4,%XMM4 |
0x401f15 VMOVSS %XMM3,(%R9) |
0x401f1a VMOVSS %XMM7,(%R11) |
0x401f1f VMOVSS %XMM4,(%R10) |
0x401f24 POP %RBX |
0x401f25 POP %RBP |
0x401f26 VZEROUPPER |
0x401f29 RET |
0x401f2a NOPW (%RAX,%RAX,1) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | main.extracted.8 | main.c:142 | exec |
○ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_fork_call | libiomp5.so | |
○ | __kmpc_fork_call | libiomp5.so | |
○ | main | main.c:139 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Source file and lines | Step10_orig.c:10-41 |
Module | exec |
nb instructions | 143 |
nb uops | 150 |
loop length | 741 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 12 |
used ymm registers | 16 |
used zmm registers | 0 |
nb stack references | 2 |
ADD-SUB / MUL ratio | 3.57 |
micro-operation queue | 37.50 cycles |
front end | 37.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 27.00 | 27.00 | 12.00 | 12.00 | 5.00 | 43.00 | 10.00 | 6.00 |
cycles | 27.00 | 27.00 | 12.00 | 12.00 | 5.00 | 43.00 | 10.00 | 6.00 |
Cycles executing div or sqrt instructions | 34.00-40.00 |
FE+BE cycles | 142.17-149.17 |
Stall cycles | 104.95-111.95 |
RS full (events) | 128.60-135.60 |
Front-end | 37.50 |
Dispatch | 43.00 |
DIV/SQRT | 34.00-40.00 |
Overall L1 | 43.00 |
all | 54% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 44% |
all | 71% |
load | 50% |
store | 0% |
mul | 100% |
add-sub | 64% |
fma | 100% |
div/sqrt | 100% |
other | 67% |
all | 70% |
load | 55% |
store | 0% |
mul | 100% |
add-sub | 66% |
fma | 100% |
div/sqrt | 100% |
other | 64% |
all | 27% |
load | 50% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
all | 26% |
load | 28% |
store | 6% |
mul | 50% |
add-sub | 22% |
fma | 50% |
div/sqrt | 50% |
other | 19% |
all | 26% |
load | 30% |
store | 6% |
mul | 50% |
add-sub | 24% |
fma | 50% |
div/sqrt | 50% |
other | 19% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x18(%RBP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x10(%RBP),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
TEST %EDI,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JLE 401d4f | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %EDI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV $-0x8,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 401d60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VBROADCASTSS %XMM0,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM1,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM2,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM3,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM4,%YMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VXORPS %XMM22,%XMM22,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD 0x669a(%RIP),%YMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x6699(%RIP),%YMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x6694(%RIP),%YMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x668f(%RIP),%YMM11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x668a(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x6685(%RIP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSS 0x667e(%RIP),%YMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VXORPS %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM7,%XMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VEXTRACTF128 $0x1,%YMM12,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM0,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM3,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM3,%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM7,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM0,%XMM7,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM7,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM7,%XMM0,%XMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM4,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM4,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPBROADCASTQ %RAX,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 401d8e | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
JMP 401f15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
VXORPS %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM7,%XMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 401f15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
VPBROADCASTQ %RAX,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VBROADCASTSS %XMM0,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM1,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM2,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM3,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM4,%YMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VXORPS %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM7,%XMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VPBROADCASTQ %RDI,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPADDQ 0x64c4(%RIP),%YMM9,%YMM10 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPADDQ 0x64dc(%RIP),%YMM9,%YMM9 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPCMPLTUQ %YMM8,%YMM9,%K0 | |||||||||||
VPCMPLTUQ %YMM8,%YMM10,%K1 | |||||||||||
KSHIFTLB $0x4,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 |
KORB %K1,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPS (%RSI,%RDI,4),%YMM8{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPS %YMM6,%YMM8,%YMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPS (%RDX,%RDI,4),%YMM8{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPS %YMM5,%YMM8,%YMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPS (%RCX,%RDI,4),%YMM8{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPS %YMM2,%YMM8,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM6,%YMM6,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PS %YMM5,%YMM5,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PS %YMM2,%YMM2,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPS $0x1,%YMM1,%YMM8,%K2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPS (%R8,%RDI,4),%YMM10{%K2}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VPXOR %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VADDPS %YMM27,%YMM8,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCVTPS2PD %XMM1,%YMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VCVTPS2PD %XMM0,%YMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VSQRTPD %YMM0,%YMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 9-12 |
VSQRTPD %YMM1,%YMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 9-12 |
VBROADCASTSD 0x6415(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VMULPD %YMM0,%YMM0,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %YMM0,%YMM13,%YMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-14 | 8 |
VMULPD %YMM1,%YMM1,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %YMM1,%YMM13,%YMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-14 | 8 |
VBROADCASTSS 0x6404(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VFMADD213PS 0x63fe(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63f8(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63f2(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63ec(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63e6(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCVTPS2PD %XMM13,%YMM14 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VFMADD231PD %YMM0,%YMM11,%YMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM13,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCVTPS2PD %XMM0,%YMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VCVTPD2PS %YMM14,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VFMADD231PD %YMM1,%YMM12,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCVTPD2PS %YMM0,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VINSERTF128 $0x1,%XMM0,%YMM11,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCMPPS $0x1,%YMM8,%YMM9,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMULPS %YMM0,%YMM10,%YMM0{%K2}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM6,%YMM0,%YMM1{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM5,%YMM0,%YMM5{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM2,%YMM0,%YMM0{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSS %XMM1,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM5,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM1,%XMM5,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSS %XMM1,%XMM7,%XMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSS %XMM0,%XMM4,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSS %XMM3,(%R9) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSS %XMM7,(%R11) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSS %XMM4,(%R10) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Source file and lines | Step10_orig.c:10-41 |
Module | exec |
nb instructions | 143 |
nb uops | 150 |
loop length | 741 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 12 |
used ymm registers | 16 |
used zmm registers | 0 |
nb stack references | 2 |
ADD-SUB / MUL ratio | 3.57 |
micro-operation queue | 37.50 cycles |
front end | 37.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 27.00 | 27.00 | 12.00 | 12.00 | 5.00 | 43.00 | 10.00 | 6.00 |
cycles | 27.00 | 27.00 | 12.00 | 12.00 | 5.00 | 43.00 | 10.00 | 6.00 |
Cycles executing div or sqrt instructions | 34.00-40.00 |
FE+BE cycles | 142.17-149.17 |
Stall cycles | 104.95-111.95 |
RS full (events) | 128.60-135.60 |
Front-end | 37.50 |
Dispatch | 43.00 |
DIV/SQRT | 34.00-40.00 |
Overall L1 | 43.00 |
all | 54% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 44% |
all | 71% |
load | 50% |
store | 0% |
mul | 100% |
add-sub | 64% |
fma | 100% |
div/sqrt | 100% |
other | 67% |
all | 70% |
load | 55% |
store | 0% |
mul | 100% |
add-sub | 66% |
fma | 100% |
div/sqrt | 100% |
other | 64% |
all | 27% |
load | 50% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 22% |
all | 26% |
load | 28% |
store | 6% |
mul | 50% |
add-sub | 22% |
fma | 50% |
div/sqrt | 50% |
other | 19% |
all | 26% |
load | 30% |
store | 6% |
mul | 50% |
add-sub | 24% |
fma | 50% |
div/sqrt | 50% |
other | 19% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
PUSH %RBX | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV 0x18(%RBP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x10(%RBP),%R11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
TEST %EDI,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JLE 401d4f | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV %EDI,%EAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV $-0x8,%EDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
AND %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 401d60 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VBROADCASTSS %XMM0,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM1,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM2,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM3,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM4,%YMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VXORPS %XMM22,%XMM22,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSD 0x669a(%RIP),%YMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x6699(%RIP),%YMM9 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x6694(%RIP),%YMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x668f(%RIP),%YMM11 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x668a(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VBROADCASTSS 0x6685(%RIP),%YMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
XOR %EBX,%EBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VBROADCASTSS 0x667e(%RIP),%YMM15 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VXORPS %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM7,%XMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VEXTRACTF128 $0x1,%YMM12,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM0,%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM3,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM3,%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM7,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM0,%XMM7,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM7,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM7,%XMM0,%XMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM4,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM0,%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM4,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM4,%XMM0,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPBROADCASTQ %RAX,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
CMP %RAX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 401d8e | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
JMP 401f15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
VXORPS %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM7,%XMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
JMP 401f15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1-2 |
VPBROADCASTQ %RAX,%YMM8 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VBROADCASTSS %XMM0,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM1,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM2,%YMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM3,%YMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VBROADCASTSS %XMM4,%YMM27 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VXORPS %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
XOR %EDI,%EDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM7,%XMM7,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VXORPS %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VPBROADCASTQ %RDI,%YMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VPADDQ 0x64c4(%RIP),%YMM9,%YMM10 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPADDQ 0x64dc(%RIP),%YMM9,%YMM9 | 1 | 0.33 | 0.33 | 0.50 | 0.50 | 0 | 0.33 | 0 | 0 | 1 | 0.50 |
VPCMPLTUQ %YMM8,%YMM9,%K0 | |||||||||||
VPCMPLTUQ %YMM8,%YMM10,%K1 | |||||||||||
KSHIFTLB $0x4,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | 1 |
KORB %K1,%K0,%K1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VMOVUPS (%RSI,%RDI,4),%YMM8{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPS %YMM6,%YMM8,%YMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPS (%RDX,%RDI,4),%YMM8{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPS %YMM5,%YMM8,%YMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPS (%RCX,%RDI,4),%YMM8{%K1}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VSUBPS %YMM2,%YMM8,%YMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM6,%YMM6,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PS %YMM5,%YMM5,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PS %YMM2,%YMM2,%YMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPS $0x1,%YMM1,%YMM8,%K2{%K1} | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMOVUPS (%R8,%RDI,4),%YMM10{%K2}{z} | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5-6 | 0.50 |
VPXOR %XMM9,%XMM9,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VADDPS %YMM27,%YMM8,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCVTPS2PD %XMM1,%YMM1 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VCVTPS2PD %XMM0,%YMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VSQRTPD %YMM0,%YMM11 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 9-12 |
VSQRTPD %YMM1,%YMM12 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-19 | 9-12 |
VBROADCASTSD 0x6415(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VMULPD %YMM0,%YMM0,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %YMM0,%YMM13,%YMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-14 | 8 |
VMULPD %YMM1,%YMM1,%YMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VDIVPD %YMM1,%YMM13,%YMM1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-14 | 8 |
VBROADCASTSS 0x6404(%RIP),%YMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 5 | 0.50 |
VFMADD213PS 0x63fe(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63f8(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63f2(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63ec(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PS 0x63e6(%RIP){1to8},%YMM8,%YMM13 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCVTPS2PD %XMM13,%YMM14 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VFMADD231PD %YMM0,%YMM11,%YMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM13,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCVTPS2PD %XMM0,%YMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VCVTPD2PS %YMM14,%XMM11 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VFMADD231PD %YMM1,%YMM12,%YMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCVTPD2PS %YMM0,%XMM0 | 2 | 0.50 | 0.50 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | 1 |
VINSERTF128 $0x1,%XMM0,%YMM11,%YMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VCMPPS $0x1,%YMM8,%YMM9,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VMULPS %YMM0,%YMM10,%YMM0{%K2}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM6,%YMM0,%YMM1{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM5,%YMM0,%YMM5{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPS %YMM2,%YMM0,%YMM0{%K1}{z} | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSS %XMM1,%XMM3,%XMM3 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM5,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM1,%XMM5,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM1,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM2,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSS %XMM1,%XMM7,%XMM7 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTF128 $0x1,%YMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 1 |
VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VPERMILPD $0x1,%XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDPS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSHDUP %XMM0,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VADDSS %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSS %XMM0,%XMM4,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSS %XMM3,(%R9) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSS %XMM7,(%R11) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSS %XMM4,(%R10) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
POP %RBX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 |
VZEROUPPER | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼Step10_orig– | 99.96 | 41.75 |
○Loop 5 - Step10_orig.c:19-35 - exec | 99.82 | 41.69 |