Loop Id: 278 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 9.14% |
---|
Loop Id: 278 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 9.14% |
---|
0x43a700 VCMPPD $0x1,%ZMM30,%ZMM15,%K2 |
0x43a707 VCMPPD $0x1,%ZMM29,%ZMM15,%K3 |
0x43a70e VPBLENDMD %ZMM21,%ZMM20,%ZMM2{%K1} |
0x43a714 VMOVSD (%RDI,%R11,8),%XMM6 [10] |
0x43a71a VANDPD %ZMM7,%ZMM12,%ZMM12 |
0x43a720 VANDPD %ZMM7,%ZMM11,%ZMM11 |
0x43a726 VANDPD %ZMM7,%ZMM3,%ZMM5 |
0x43a72c VANDPD %ZMM7,%ZMM31,%ZMM17 |
0x43a732 VBROADCASTSD 0xce6b4(%RIP),%ZMM0 [12] |
0x43a73c VSUBPD %ZMM27,%ZMM0,%ZMM18 |
0x43a742 VSUBPD %ZMM28,%ZMM0,%ZMM1 |
0x43a748 VMULPD %ZMM1,%ZMM17,%ZMM1 |
0x43a74e VMULPD %ZMM18,%ZMM5,%ZMM18 |
0x43a754 VMOVSD 0x1051c4(%RIP),%XMM0 [12] |
0x43a75c VDIVSD %XMM6,%XMM0,%XMM0 |
0x43a760 VBROADCASTSD %XMM0,%ZMM0 |
0x43a766 VMINPD %ZMM5,%ZMM11,%ZMM5 |
0x43a76c VFMADD213PD %ZMM11,%ZMM27,%ZMM11 |
0x43a772 VMINPD %ZMM17,%ZMM12,%ZMM17 |
0x43a778 VFMADD213PD %ZMM12,%ZMM28,%ZMM12 |
0x43a77e VEXTRACTI64X4 $0x1,%ZMM2,%YMM16 |
0x43a785 VPMOVSXDQ %YMM2,%ZMM2 |
0x43a78b VPMOVSXDQ %YMM16,%ZMM16 |
0x43a791 VPSUBQ %ZMM8,%ZMM16,%ZMM16 |
0x43a797 VPSUBQ %ZMM8,%ZMM2,%ZMM2 |
0x43a79d VPXOR %XMM4,%XMM4,%XMM4 |
0x43a7a1 VGATHERQPD (%RDI,%ZMM2,8),%ZMM4{%K3} [13] |
0x43a7a8 VXORPD %XMM2,%XMM2,%XMM2 |
0x43a7ac VGATHERQPD (%RDI,%ZMM16,8),%ZMM2{%K2} [5] |
0x43a7b3 VDIVPD %ZMM4,%ZMM11,%ZMM4 |
0x43a7b9 VDIVPD %ZMM2,%ZMM12,%ZMM2 |
0x43a7bf VFMADD231PD %ZMM18,%ZMM0,%ZMM4 |
0x43a7c5 VFMADD231PD %ZMM1,%ZMM0,%ZMM2 |
0x43a7cb VMULSD 0xce62d(%RIP),%XMM6,%XMM0 [12] |
0x43a7d3 VBROADCASTSD %XMM0,%ZMM0 |
0x43a7d9 VMULPD %ZMM2,%ZMM0,%ZMM1 |
0x43a7df VMULPD %ZMM4,%ZMM0,%ZMM0 |
0x43a7e5 VMINPD %ZMM17,%ZMM1,%ZMM11 |
0x43a7eb VMINPD %ZMM5,%ZMM0,%ZMM6 |
0x43a7f1 VFPCLASSPD $0x56,%ZMM3,%K1 |
0x43a7f8 VFPCLASSPD $0x56,%ZMM31,%K2 |
0x43a7ff VBROADCASTSD 0xce5ef(%RIP),%ZMM0 [12] |
0x43a809 VXORPD %ZMM0,%ZMM11,%ZMM11{%K2} |
0x43a80f VXORPD %ZMM0,%ZMM6,%ZMM6{%K1} |
0x43a815 VCMPPD $0x1,%ZMM29,%ZMM15,%K1 |
0x43a81c VCMPPD $0x1,%ZMM30,%ZMM15,%K2 |
0x43a823 VMOVAPD %ZMM11,%ZMM0{%K2}{z} |
0x43a829 VMOVAPD %ZMM6,%ZMM1{%K1}{z} |
0x43a82f VBROADCASTSD 0x1050e7(%RIP),%ZMM3 [12] |
0x43a839 VSUBPD %ZMM28,%ZMM3,%ZMM2 |
0x43a83f VSUBPD %ZMM27,%ZMM3,%ZMM3 |
0x43a845 VFMADD213PD %ZMM25,%ZMM1,%ZMM3 |
0x43a84b VFMADD213PD %ZMM26,%ZMM0,%ZMM2 |
0x43a851 VMULPD %ZMM24,%ZMM2,%ZMM0 |
0x43a857 VMULPD %ZMM23,%ZMM3,%ZMM1 |
0x43a85d VMOVUPD %ZMM1,(%R9,%RDX,8) [8] |
0x43a864 VMOVUPD %ZMM0,0x40(%R9,%RDX,8) [8] |
0x43a86c ADD $0x10,%RDX |
0x43a870 CMP %RBX,%RDX |
0x43a873 JA 43ab00 |
0x43a879 VMOVUPD (%R8,%RDX,8),%ZMM23 [1] |
0x43a880 VMOVUPD 0x40(%R8,%RDX,8),%ZMM24 [1] |
0x43a888 VFPCLASSPD $0x50,%ZMM23,%K0 |
0x43a88f VFPCLASSPD $0x50,%ZMM24,%K1 |
0x43a896 KUNPCKBW %K0,%K1,%K1 |
0x43a89a VPBLENDMD %ZMM21,%ZMM22,%ZMM3{%K1} |
0x43a8a0 VEXTRACTI64X4 $0x1,%ZMM3,%YMM6 |
0x43a8a7 VPMOVSXDQ %YMM6,%ZMM6 |
0x43a8ad VPMOVSXDQ %YMM3,%ZMM3 |
0x43a8b3 VPSUBQ %ZMM8,%ZMM3,%ZMM11 |
0x43a8b9 VPSUBQ %ZMM8,%ZMM6,%ZMM6 |
0x43a8bf VPXOR %XMM3,%XMM3,%XMM3 |
0x43a8c3 VPMULLQ %ZMM6,%ZMM9,%ZMM3 |
0x43a8c9 VPXOR %XMM12,%XMM12,%XMM12 |
0x43a8ce VPMULLQ %ZMM11,%ZMM9,%ZMM12 |
0x43a8d4 LEA (%R10,%RDX,1),%RSI |
0x43a8d8 VMOVQ %RSI,%XMM25 |
0x43a8de VPSUBQ 0x70(%RSP),%XMM25,%XMM25 [6] |
0x43a8e6 VPSLLQ $0x3,%XMM25,%XMM25 |
0x43a8ed VPBROADCASTQ %XMM25,%ZMM25 |
0x43a8f3 VPADDQ 0xcd443(%RIP),%ZMM25,%ZMM26 [12] |
0x43a8fd VPADDQ %ZMM26,%ZMM10,%ZMM27 |
0x43a903 VPADDQ %ZMM12,%ZMM27,%ZMM12 |
0x43a909 VPXORD %XMM27,%XMM27,%XMM27 |
0x43a90f KXNORW %K0,%K0,%K2 |
0x43a913 VGATHERQPD (,%ZMM12,1),%ZMM27{%K2} [15] |
0x43a91e VPADDQ 0xcd3d8(%RIP),%ZMM25,%ZMM12 [12] |
0x43a928 VPADDQ %ZMM12,%ZMM10,%ZMM25 |
0x43a92e VPADDQ %ZMM3,%ZMM25,%ZMM25 |
0x43a934 VPXOR %XMM3,%XMM3,%XMM3 |
0x43a938 KXNORW %K0,%K0,%K2 |
0x43a93c VGATHERQPD (,%ZMM25,1),%ZMM3{%K2} [14] |
0x43a947 VPMULLQ %ZMM11,%ZMM13,%ZMM11 |
0x43a94d VPADDQ %ZMM26,%ZMM14,%ZMM28 |
0x43a953 VPADDQ %ZMM11,%ZMM28,%ZMM11 |
0x43a959 VXORPD %XMM25,%XMM25,%XMM25 |
0x43a95f KXNORW %K0,%K0,%K2 |
0x43a963 VGATHERQPD (,%ZMM11,1),%ZMM25{%K2} [9] |
0x43a96e VPMULLQ %ZMM6,%ZMM13,%ZMM6 |
0x43a974 VPADDQ %ZMM12,%ZMM14,%ZMM11 |
0x43a97a VPADDQ %ZMM6,%ZMM11,%ZMM6 |
0x43a980 VPXORD %XMM26,%XMM26,%XMM26 |
0x43a986 KXNORW %K0,%K0,%K2 |
0x43a98a VGATHERQPD (,%ZMM6,1),%ZMM26{%K2} [4] |
0x43a995 VPBLENDMD %ZMM19,%ZMM20,%ZMM6{%K1} |
0x43a99b VEXTRACTI64X4 $0x1,%ZMM6,%YMM12 |
0x43a9a2 VPMOVSXDQ %YMM12,%ZMM12 |
0x43a9a8 VPMOVSXDQ %YMM6,%ZMM6 |
0x43a9ae VPSUBQ %ZMM8,%ZMM6,%ZMM6 |
0x43a9b4 VPMULLQ %ZMM6,%ZMM13,%ZMM6 |
0x43a9ba VPADDQ %ZMM6,%ZMM28,%ZMM6 |
0x43a9c0 VXORPD %XMM29,%XMM29,%XMM29 |
0x43a9c6 KXNORW %K0,%K0,%K2 |
0x43a9ca VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} [7] |
0x43a9d5 VPSUBQ %ZMM8,%ZMM12,%ZMM6 |
0x43a9db VPMULLQ %ZMM6,%ZMM13,%ZMM6 |
0x43a9e1 VPADDQ %ZMM6,%ZMM11,%ZMM6 |
0x43a9e7 VPXOR %XMM12,%XMM12,%XMM12 |
0x43a9ec KXNORW %K0,%K0,%K2 |
0x43a9f0 VGATHERQPD (,%ZMM6,1),%ZMM12{%K2} [3] |
0x43a9fb VPBLENDMD %ZMM22,%ZMM21,%ZMM30{%K1} |
0x43aa01 VANDPD %ZMM7,%ZMM23,%ZMM31 |
0x43aa07 VANDPD %ZMM7,%ZMM24,%ZMM2 |
0x43aa0d VXORPD %XMM6,%XMM6,%XMM6 |
0x43aa11 VDIVPD %ZMM27,%ZMM31,%ZMM27 |
0x43aa17 VEXTRACTI64X4 $0x1,%ZMM30,%YMM31 |
0x43aa1e VPMOVSXDQ %YMM31,%ZMM31 |
0x43aa24 VPMOVSXDQ %YMM30,%ZMM30 |
0x43aa2a VPSUBQ %ZMM8,%ZMM30,%ZMM30 |
0x43aa30 VPMULLQ %ZMM30,%ZMM13,%ZMM30 |
0x43aa36 VPADDQ %ZMM30,%ZMM28,%ZMM28 |
0x43aa3c VPXORD %XMM30,%XMM30,%XMM30 |
0x43aa42 KXNORW %K0,%K0,%K2 |
0x43aa46 VGATHERQPD (,%ZMM28,1),%ZMM30{%K2} [2] |
0x43aa51 VPSUBQ %ZMM8,%ZMM31,%ZMM28 |
0x43aa57 VPMULLQ %ZMM28,%ZMM13,%ZMM28 |
0x43aa5d VPADDQ %ZMM28,%ZMM11,%ZMM11 |
0x43aa63 VPXORD %XMM31,%XMM31,%XMM31 |
0x43aa69 KXNORW %K0,%K0,%K2 |
0x43aa6d VGATHERQPD (,%ZMM11,1),%ZMM31{%K2} [11] |
0x43aa78 VDIVPD %ZMM3,%ZMM2,%ZMM28 |
0x43aa7e VSUBPD %ZMM12,%ZMM26,%ZMM12 |
0x43aa84 VSUBPD %ZMM29,%ZMM25,%ZMM11 |
0x43aa8a VSUBPD %ZMM26,%ZMM31,%ZMM31 |
0x43aa90 VSUBPD %ZMM25,%ZMM30,%ZMM3 |
0x43aa96 VMULPD %ZMM11,%ZMM3,%ZMM29 |
0x43aa9c VMULPD %ZMM12,%ZMM31,%ZMM30 |
0x43aaa2 VCMPPD $0x1,%ZMM30,%ZMM6,%K0 |
0x43aaa9 VCMPPD $0x1,%ZMM29,%ZMM6,%K2 |
0x43aab0 KORTESTB %K0,%K2 |
0x43aab4 JNE 43a700 |
0x43aaba VXORPD %XMM11,%XMM11,%XMM11 |
0x43aabf JMP 43a815 |
/scratch_na/users/xoserete/qaas_runs/171-415-7190/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 81 - 241 |
-------------------------------------------------------------------------------- |
81: IF(mom_sweep.EQ.1)THEN ! x 1 |
[...] |
215: IF(node_flux(j,k).LT.0.0)THEN |
[...] |
227: sigma=ABS(node_flux(j,k))/(node_mass_pre(j,donor)) |
228: width=celldy(k) |
229: vdiffuw=vel1(j,donor)-vel1(j,upwind) |
230: vdiffdw=vel1(j,downwind)-vel1(j,donor) |
231: limiter=0.0 |
232: IF(vdiffuw*vdiffdw.GT.0.0)THEN |
233: auw=ABS(vdiffuw) |
234: adw=ABS(vdiffdw) |
235: wind=1.0_8 |
236: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
237: limiter=wind*MIN(width*((2.0_8-sigma)*adw/width+(1.0_8+sigma)*auw/celldy(dif))/6.0_8,auw,adw) |
238: ENDIF |
239: advec_vel_s=vel1(j,donor)+(1.0_8-sigma)*limiter |
240: mom_flux(j,k)=advec_vel_s*node_flux(j,k) |
241: ENDDO |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | __kmp_invoke_microtask | libiomp5.so | |
○ | __kmp_invoke_task_func | libiomp5.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.08 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.03 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:227-241 |
Source loop unroll info | unrolled by 16 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 16 |
CQA cycles | 70.25 |
CQA cycles if no scalar integer | 65.25 |
CQA cycles if FP arith vectorized | 70.06 |
CQA cycles if fully vectorized | 68.44 |
Front-end cycles | 34.00 |
P0 cycles | 70.25 |
P1 cycles | 8.75 |
P2 cycles | 26.83 |
P3 cycles | 26.83 |
P4 cycles | 1.00 |
P5 cycles | 70.25 |
P6 cycles | 2.00 |
P7 cycles | 1.00 |
P8 cycles | 1.00 |
P9 cycles | 1.00 |
P10 cycles | 1.00 |
P11 cycles | 26.83 |
DIV/SQRT cycles | 50.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 71.56 - 110.63 |
Stall cycles (UFS) | 42.68 - 81.62 |
Nb insns | 130.00 |
Nb uops | 204.00 |
Nb loads | 17.50 |
Nb stores | 2.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.75 |
Nb FLOP add-sub | 56.00 |
Nb FLOP mul | 48.50 |
Nb FLOP fma | 32.00 |
Nb FLOP div | 24.50 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 14.47 |
Bytes prefetched | 0.00 |
Bytes loaded | 876.00 |
Bytes stored | 128.00 |
Stride 0 | 2.50 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 6.00 |
Vectorization ratio all | 94.34 |
Vectorization ratio load | 82.14 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 97.06 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 90.00 |
Vectorization ratio other | 91.81 |
Vector-efficiency ratio all | 80.64 |
Vector-efficiency ratio load | 79.91 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 97.43 |
Vector-efficiency ratio add_sub | 97.40 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 91.25 |
Vector-efficiency ratio other | 65.72 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.09 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.02 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:227-241 |
Source loop unroll info | unrolled by 16 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 16 |
CQA cycles | 59.00 |
CQA cycles if no scalar integer | 54.00 |
CQA cycles if FP arith vectorized | 59.00 |
CQA cycles if fully vectorized | 58.00 |
Front-end cycles | 29.50 |
P0 cycles | 59.00 |
P1 cycles | 6.00 |
P2 cycles | 23.33 |
P3 cycles | 23.33 |
P4 cycles | 1.00 |
P5 cycles | 59.00 |
P6 cycles | 2.00 |
P7 cycles | 1.00 |
P8 cycles | 1.00 |
P9 cycles | 1.00 |
P10 cycles | 1.00 |
P11 cycles | 23.33 |
DIV/SQRT cycles | 32.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 59.50 - 95.27 |
Stall cycles (UFS) | 34.78 - 70.39 |
Nb insns | 109.00 |
Nb uops | 177.00 |
Nb loads | 14.00 |
Nb stores | 2.00 |
Nb stack references | 1.00 |
FLOP/cycle | 2.17 |
Nb FLOP add-sub | 48.00 |
Nb FLOP mul | 32.00 |
Nb FLOP fma | 16.00 |
Nb FLOP div | 16.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 15.59 |
Bytes prefetched | 0.00 |
Bytes loaded | 792.00 |
Bytes stored | 128.00 |
Stride 0 | 2.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 5.00 |
Vectorization ratio all | 96.77 |
Vectorization ratio load | 92.86 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 93.48 |
Vector-efficiency ratio all | 81.05 |
Vector-efficiency ratio load | 88.39 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 97.22 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 63.32 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.07 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.03 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | NA |
Bottlenecks | P0, P5, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:227-241 |
Source loop unroll info | unrolled by 16 |
Source loop unroll confidence level | high |
Unroll/vectorization loop type | main |
Unroll factor | 16 |
CQA cycles | 81.50 |
CQA cycles if no scalar integer | 76.50 |
CQA cycles if FP arith vectorized | 81.13 |
CQA cycles if fully vectorized | 78.88 |
Front-end cycles | 38.50 |
P0 cycles | 81.50 |
P1 cycles | 11.50 |
P2 cycles | 30.33 |
P3 cycles | 30.33 |
P4 cycles | 1.00 |
P5 cycles | 81.50 |
P6 cycles | 2.00 |
P7 cycles | 1.00 |
P8 cycles | 1.00 |
P9 cycles | 1.00 |
P10 cycles | 1.00 |
P11 cycles | 30.33 |
DIV/SQRT cycles | 68.00 |
Inter-iter dependencies cycles | 0 |
FE+BE cycles (UFS) | 83.63 - 125.99 |
Stall cycles (UFS) | 50.57 - 92.86 |
Nb insns | 151.00 |
Nb uops | 231.00 |
Nb loads | 21.00 |
Nb stores | 2.00 |
Nb stack references | 1.00 |
FLOP/cycle | 3.17 |
Nb FLOP add-sub | 64.00 |
Nb FLOP mul | 65.00 |
Nb FLOP fma | 48.00 |
Nb FLOP div | 33.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 13.35 |
Bytes prefetched | 0.00 |
Bytes loaded | 960.00 |
Bytes stored | 128.00 |
Stride 0 | 3.00 |
Stride 1 | 2.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 7.00 |
Vectorization ratio all | 91.91 |
Vectorization ratio load | 71.43 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 94.12 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 80.00 |
Vectorization ratio other | 90.14 |
Vector-efficiency ratio all | 80.24 |
Vector-efficiency ratio load | 71.43 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 94.85 |
Vector-efficiency ratio add_sub | 97.58 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 82.50 |
Vector-efficiency ratio other | 68.13 |
Path / |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 130 |
nb uops | 204 |
loop length | 820.50 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 11 |
used ymm registers | 6 |
used zmm registers | 29.50 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.19 |
micro-operation queue | 34.00 cycles |
front end | 34.00 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 70.25 | 6.50 | 26.83 | 26.83 | 1.00 | 70.25 | 2.00 | 1.00 | 1.00 | 1.00 | 1.00 | 26.83 |
cycles | 70.25 | 8.75 | 26.83 | 26.83 | 1.00 | 70.25 | 2.00 | 1.00 | 1.00 | 1.00 | 1.00 | 26.83 |
Cycles executing div or sqrt instructions | 50.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 71.56-110.63 |
Stall cycles | 42.68-81.62 |
RS full (events) | 70.19-0.58 |
PRF_FLOAT full (events) | 0.03-84.60 |
Front-end | 34.00 |
Dispatch | 70.25 |
DIV/SQRT | 50.00 |
Data deps. | 0.00 |
Overall L1 | 70.25 |
all | 96% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 92% |
all | 92% |
load | 78% |
store | 100% |
mul | 94% |
add-sub | 100% |
fma | 100% |
div/sqrt | 90% |
other | 92% |
all | 94% |
load | 82% |
store | 100% |
mul | 97% |
add-sub | 100% |
fma | 100% |
div/sqrt | 90% |
other | 91% |
all | 73% |
load | 75% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 96% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 44% |
all | 88% |
load | 81% |
store | 100% |
mul | 95% |
add-sub | 100% |
fma | 100% |
div/sqrt | 91% |
other | 83% |
all | 80% |
load | 79% |
store | 100% |
mul | 97% |
add-sub | 97% |
fma | 100% |
div/sqrt | 91% |
other | 65% |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 109 |
nb uops | 177 |
loop length | 687 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 10 |
used ymm registers | 5 |
used zmm registers | 27 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 1.50 |
micro-operation queue | 29.50 cycles |
front end | 29.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 59.00 | 6.00 | 23.33 | 23.33 | 1.00 | 59.00 | 2.00 | 1.00 | 1.00 | 1.00 | 1.00 | 23.33 |
cycles | 59.00 | 6.00 | 23.33 | 23.33 | 1.00 | 59.00 | 2.00 | 1.00 | 1.00 | 1.00 | 1.00 | 23.33 |
Cycles executing div or sqrt instructions | 32.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 59.50-95.27 |
Stall cycles | 34.78-70.39 |
RS full (events) | 58.06-0.57 |
Front-end | 29.50 |
Dispatch | 59.00 |
DIV/SQRT | 32.00 |
Data deps. | 0.00 |
Overall L1 | 59.00 |
all | 96% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 91% |
all | 97% |
load | 90% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 95% |
all | 96% |
load | 92% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 93% |
all | 73% |
load | 75% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 96% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 43% |
all | 90% |
load | 92% |
store | 100% |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 83% |
all | 81% |
load | 88% |
store | 100% |
mul | 100% |
add-sub | 97% |
fma | 100% |
div/sqrt | 100% |
other | 63% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VCMPPD $0x1,%ZMM29,%ZMM15,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x1,%ZMM30,%ZMM15,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM11,%ZMM0{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM6,%ZMM1{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VBROADCASTSD 0x1050e7(%RIP),%ZMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 |
VSUBPD %ZMM28,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM27,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM25,%ZMM1,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM26,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM24,%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM23,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R9,%RDX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VMOVUPD %ZMM0,0x40(%R9,%RDX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD $0x10,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %RBX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JA 43ab00 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2d00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVUPD (%R8,%RDX,8),%ZMM23 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVUPD 0x40(%R8,%RDX,8),%ZMM24 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VFPCLASSPD $0x50,%ZMM23,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFPCLASSPD $0x50,%ZMM24,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KUNPCKBW %K0,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VPBLENDMD %ZMM21,%ZMM22,%ZMM3{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VEXTRACTI64X4 $0x1,%ZMM3,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM3,%ZMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM3,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSUBQ %ZMM8,%ZMM6,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM6,%ZMM9,%ZMM3 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM11,%ZMM9,%ZMM12 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
LEA (%R10,%RDX,1),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVQ %RSI,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPSUBQ 0x70(%RSP),%XMM25,%XMM25 | 1 | 0.33 | 0.33 | 0.33 | 0.33 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VPSLLQ $0x3,%XMM25,%XMM25 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 0.50 |
VPBROADCASTQ %XMM25,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPADDQ 0xcd443(%RIP),%ZMM25,%ZMM26 | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.67 |
VPADDQ %ZMM26,%ZMM10,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM12,%ZMM27,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM27,%XMM27,%XMM27 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM12,1),%ZMM27{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPADDQ 0xcd3d8(%RIP),%ZMM25,%ZMM12 | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.67 |
VPADDQ %ZMM12,%ZMM10,%ZMM25 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM3,%ZMM25,%ZMM25 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM25,1),%ZMM3{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMULLQ %ZMM11,%ZMM13,%ZMM11 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM26,%ZMM14,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM11,%ZMM28,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM25,%XMM25,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM11,1),%ZMM25{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMULLQ %ZMM6,%ZMM13,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM12,%ZMM14,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM6,%ZMM11,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM26,%XMM26,%XMM26 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM26{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %ZMM19,%ZMM20,%ZMM6{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VEXTRACTI64X4 $0x1,%ZMM6,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM6,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM6,%ZMM13,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM6,%ZMM28,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM29,%XMM29,%XMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPSUBQ %ZMM8,%ZMM12,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM6,%ZMM13,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM6,%ZMM11,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM12{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %ZMM22,%ZMM21,%ZMM30{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM23,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM24,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VDIVPD %ZMM27,%ZMM31,%ZMM27 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VEXTRACTI64X4 $0x1,%ZMM30,%YMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM31,%ZMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM30,%ZMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM30,%ZMM13,%ZMM30 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM30,%ZMM28,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM30,%XMM30,%XMM30 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM28,1),%ZMM30{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPSUBQ %ZMM8,%ZMM31,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM28,%ZMM13,%ZMM28 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM28,%ZMM11,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM31,%XMM31,%XMM31 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM11,1),%ZMM31{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VDIVPD %ZMM3,%ZMM2,%ZMM28 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VSUBPD %ZMM12,%ZMM26,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM29,%ZMM25,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM26,%ZMM31,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM25,%ZMM30,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM11,%ZMM3,%ZMM29 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM12,%ZMM31,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM30,%ZMM6,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x1,%ZMM29,%ZMM6,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JNE 43a700 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2900> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VXORPD %XMM11,%XMM11,%XMM11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 43a815 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2a15> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 151 |
nb uops | 231 |
loop length | 954 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 12 |
used ymm registers | 7 |
used zmm registers | 32 |
nb stack references | 1 |
ADD-SUB / MUL ratio | 0.89 |
micro-operation queue | 38.50 cycles |
front end | 38.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 81.50 | 7.00 | 30.33 | 30.33 | 1.00 | 81.50 | 2.00 | 1.00 | 1.00 | 1.00 | 1.00 | 30.33 |
cycles | 81.50 | 11.50 | 30.33 | 30.33 | 1.00 | 81.50 | 2.00 | 1.00 | 1.00 | 1.00 | 1.00 | 30.33 |
Cycles executing div or sqrt instructions | 68.00 |
Longest recurrence chain latency (RecMII) | 0.00 |
FE+BE cycles | 83.63-125.99 |
Stall cycles | 50.57-92.86 |
RS full (events) | 82.32-0.60 |
PRF_FLOAT full (events) | 0.06-96.99 |
Front-end | 38.50 |
Dispatch | 81.50 |
DIV/SQRT | 68.00 |
Data deps. | 0.00 |
Overall L1 | 81.50 |
all | 96% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 92% |
all | 88% |
load | 66% |
store | 100% |
mul | 88% |
add-sub | 100% |
fma | 100% |
div/sqrt | 80% |
other | 88% |
all | 91% |
load | 71% |
store | 100% |
mul | 94% |
add-sub | 100% |
fma | 100% |
div/sqrt | 80% |
other | 90% |
all | 72% |
load | 75% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 96% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 45% |
all | 85% |
load | 70% |
store | 100% |
mul | 90% |
add-sub | 100% |
fma | 100% |
div/sqrt | 82% |
other | 82% |
all | 80% |
load | 71% |
store | 100% |
mul | 94% |
add-sub | 97% |
fma | 100% |
div/sqrt | 82% |
other | 68% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
VCMPPD $0x1,%ZMM30,%ZMM15,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x1,%ZMM29,%ZMM15,%K3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBLENDMD %ZMM21,%ZMM20,%ZMM2{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VMOVSD (%RDI,%R11,8),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VANDPD %ZMM7,%ZMM12,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM11,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM3,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM31,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VBROADCASTSD 0xce6b4(%RIP),%ZMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 |
VSUBPD %ZMM27,%ZMM0,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM28,%ZMM0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM1,%ZMM17,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM18,%ZMM5,%ZMM18 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD 0x1051c4(%RIP),%XMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VDIVSD %XMM6,%XMM0,%XMM0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 13-15 | 4 |
VBROADCASTSD %XMM0,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMINPD %ZMM5,%ZMM11,%ZMM5 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM11,%ZMM27,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM17,%ZMM12,%ZMM17 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM12,%ZMM28,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VEXTRACTI64X4 $0x1,%ZMM2,%YMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM2,%ZMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM16,%ZMM16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM16,%ZMM16 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSUBQ %ZMM8,%ZMM2,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPXOR %XMM4,%XMM4,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (%RDI,%ZMM2,8),%ZMM4{%K3} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VXORPD %XMM2,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VGATHERQPD (%RDI,%ZMM16,8),%ZMM2{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VDIVPD %ZMM4,%ZMM11,%ZMM4 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VDIVPD %ZMM2,%ZMM12,%ZMM2 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VFMADD231PD %ZMM18,%ZMM0,%ZMM4 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231PD %ZMM1,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xce62d(%RIP),%XMM6,%XMM0 | 1 | 0.50 | 0.50 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4 | 0.50 |
VBROADCASTSD %XMM0,%ZMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMULPD %ZMM2,%ZMM0,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM4,%ZMM0,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM17,%ZMM1,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMINPD %ZMM5,%ZMM0,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFPCLASSPD $0x56,%ZMM3,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFPCLASSPD $0x56,%ZMM31,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VBROADCASTSD 0xce5ef(%RIP),%ZMM0 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 |
VXORPD %ZMM0,%ZMM11,%ZMM11{%K2} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VXORPD %ZMM0,%ZMM6,%ZMM6{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VCMPPD $0x1,%ZMM29,%ZMM15,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x1,%ZMM30,%ZMM15,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VMOVAPD %ZMM11,%ZMM0{%K2}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VMOVAPD %ZMM6,%ZMM1{%K1}{z} | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.17 |
VBROADCASTSD 0x1050e7(%RIP),%ZMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 3 | 0.33 |
VSUBPD %ZMM28,%ZMM3,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM27,%ZMM3,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VFMADD213PD %ZMM25,%ZMM1,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD213PD %ZMM26,%ZMM0,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM24,%ZMM2,%ZMM0 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM23,%ZMM3,%ZMM1 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVUPD %ZMM1,(%R9,%RDX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
VMOVUPD %ZMM0,0x40(%R9,%RDX,8) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0-1 | 1 |
ADD $0x10,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %RBX,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JA 43ab00 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2d00> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
VMOVUPD (%R8,%RDX,8),%ZMM23 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VMOVUPD 0x40(%R8,%RDX,8),%ZMM24 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.50 |
VFPCLASSPD $0x50,%ZMM23,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VFPCLASSPD $0x50,%ZMM24,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KUNPCKBW %K0,%K1,%K1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
VPBLENDMD %ZMM21,%ZMM22,%ZMM3{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VEXTRACTI64X4 $0x1,%ZMM3,%YMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM3,%ZMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM3,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPSUBQ %ZMM8,%ZMM6,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM6,%ZMM9,%ZMM3 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VPMULLQ %ZMM11,%ZMM9,%ZMM12 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
LEA (%R10,%RDX,1),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVQ %RSI,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VPSUBQ 0x70(%RSP),%XMM25,%XMM25 | 1 | 0.33 | 0.33 | 0.33 | 0.33 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
VPSLLQ $0x3,%XMM25,%XMM25 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2-4 | 0.50 |
VPBROADCASTQ %XMM25,%ZMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPADDQ 0xcd443(%RIP),%ZMM25,%ZMM26 | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.67 |
VPADDQ %ZMM26,%ZMM10,%ZMM27 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM12,%ZMM27,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM27,%XMM27,%XMM27 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM12,1),%ZMM27{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPADDQ 0xcd3d8(%RIP),%ZMM25,%ZMM12 | 1 | 0.50 | 0 | 0.33 | 0.33 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.67 |
VPADDQ %ZMM12,%ZMM10,%ZMM25 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM3,%ZMM25,%ZMM25 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM3,%XMM3,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM25,1),%ZMM3{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMULLQ %ZMM11,%ZMM13,%ZMM11 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM26,%ZMM14,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM11,%ZMM28,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM25,%XMM25,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM11,1),%ZMM25{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPMULLQ %ZMM6,%ZMM13,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM12,%ZMM14,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPADDQ %ZMM6,%ZMM11,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM26,%XMM26,%XMM26 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM26{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %ZMM19,%ZMM20,%ZMM6{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VEXTRACTI64X4 $0x1,%ZMM6,%YMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM12,%ZMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM6,%ZMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM6,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM6,%ZMM13,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM6,%ZMM28,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM29,%XMM29,%XMM29 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPSUBQ %ZMM8,%ZMM12,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM6,%ZMM13,%ZMM6 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM6,%ZMM11,%ZMM6 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXOR %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM6,1),%ZMM12{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPBLENDMD %ZMM22,%ZMM21,%ZMM30{%K1} | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM23,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VANDPD %ZMM7,%ZMM24,%ZMM2 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VXORPD %XMM6,%XMM6,%XMM6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VDIVPD %ZMM27,%ZMM31,%ZMM27 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VEXTRACTI64X4 $0x1,%ZMM30,%YMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM31,%ZMM31 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPMOVSXDQ %YMM30,%ZMM30 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPSUBQ %ZMM8,%ZMM30,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM30,%ZMM13,%ZMM30 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM30,%ZMM28,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM30,%XMM30,%XMM30 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM28,1),%ZMM30{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VPSUBQ %ZMM8,%ZMM31,%ZMM28 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.50 |
VPMULLQ %ZMM28,%ZMM13,%ZMM28 | 5 | 1.50 | 0 | 0 | 0 | 0 | 1.50 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | 1.50 |
VPADDQ %ZMM28,%ZMM11,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
VPXORD %XMM31,%XMM31,%XMM31 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0-1 | 0.33 |
KXNORW %K0,%K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
VGATHERQPD (,%ZMM11,1),%ZMM31{%K2} | 5 | 1 | 0 | 2.67 | 2.67 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2.67 | 0-29 | 2.67 |
VDIVPD %ZMM3,%ZMM2,%ZMM28 | 3 | 2.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 22-24 | 16 |
VSUBPD %ZMM12,%ZMM26,%ZMM12 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM29,%ZMM25,%ZMM11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM26,%ZMM31,%ZMM31 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VSUBPD %ZMM25,%ZMM30,%ZMM3 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
VMULPD %ZMM11,%ZMM3,%ZMM29 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULPD %ZMM12,%ZMM31,%ZMM30 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCMPPD $0x1,%ZMM30,%ZMM6,%K0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VCMPPD $0x1,%ZMM29,%ZMM6,%K2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
KORTESTB %K0,%K2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
JNE 43a700 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2900> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |