Loop Id: 276 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 0.06% |
---|
Loop Id: 276 | Module: exec | Source: advec_mom_kernel.f90:81-241 [...] | Coverage: 0.06% |
---|
0x43a600 MOV 0x90(%RSP),%RCX |
0x43a608 MOV 0x30(%RSP),%R9 |
0x43a60d ADD %RCX,%R9 |
0x43a610 MOV 0x98(%RSP),%RAX |
0x43a618 MOV 0x38(%RSP),%R8 |
0x43a61d ADD %RAX,%R8 |
0x43a620 ADD %RCX,0x50(%RSP) |
0x43a625 ADD %RAX,0x40(%RSP) |
0x43a62a MOV 0x120(%RSP),%RSI |
0x43a632 CMP 0x58(%RSP),%RSI |
0x43a637 LEA 0x1(%RSI),%RSI |
0x43a63b MOV 0x68(%RSP),%R10 |
0x43a640 JE 43a4c0 |
0x43a646 MOV 0x60(%RSP),%R15 |
0x43a64b AND $-0x10,%R15 |
0x43a64f MOV %R8,0x38(%RSP) |
0x43a654 MOV %R9,0x30(%RSP) |
0x43a659 MOV %RSI,0x120(%RSP) |
0x43a661 JE 43ab40 |
0x43a667 LEA -0x1(%R15),%RBX |
0x43a66b MOV 0x88(%RSP),%RAX |
0x43a673 LEA -0x1(%RSI,%RAX,1),%RAX |
0x43a678 MOV %RAX,%R11 |
0x43a67b SUB %R13,%R11 |
0x43a67e MOV 0xa8(%RBP),%RCX |
0x43a685 MOV 0xb0(%RSP),%RDX |
0x43a68d ADD %ECX,%EDX |
0x43a68f LEA (%RDX,%RSI,1),%ECX |
0x43a692 LEA 0x1(%RSI,%RDX,1),%EDI |
0x43a696 MOV %EDI,0x18(%RSP) |
0x43a69a VPBROADCASTD %EDI,%ZMM19 |
0x43a6a0 LEA -0x2(%RSI,%RDX,1),%EDX |
0x43a6a4 MOV %EDX,0x10(%RSP) |
0x43a6a8 VPBROADCASTD %EDX,%ZMM20 |
0x43a6ae VPBROADCASTD %ECX,%ZMM21 |
0x43a6b4 VPBROADCASTD %EAX,%ZMM22 |
0x43a6ba XOR %EDX,%EDX |
0x43a6bc MOV 0x28(%RBP),%RDI |
0x43a6c0 JMP 43a879 |
(278) 0x43a700 VCMPPD $0x1,%ZMM30,%ZMM15,%K2 |
(278) 0x43a707 VCMPPD $0x1,%ZMM29,%ZMM15,%K3 |
(278) 0x43a70e VPBLENDMD %ZMM21,%ZMM20,%ZMM2{%K1} |
(278) 0x43a714 VMOVSD (%RDI,%R11,8),%XMM6 |
(278) 0x43a71a VANDPD %ZMM7,%ZMM12,%ZMM12 |
(278) 0x43a720 VANDPD %ZMM7,%ZMM11,%ZMM11 |
(278) 0x43a726 VANDPD %ZMM7,%ZMM3,%ZMM5 |
(278) 0x43a72c VANDPD %ZMM7,%ZMM31,%ZMM17 |
(278) 0x43a732 VBROADCASTSD 0xce6b4(%RIP),%ZMM0 |
(278) 0x43a73c VSUBPD %ZMM27,%ZMM0,%ZMM18 |
(278) 0x43a742 VSUBPD %ZMM28,%ZMM0,%ZMM1 |
(278) 0x43a748 VMULPD %ZMM1,%ZMM17,%ZMM1 |
(278) 0x43a74e VMULPD %ZMM18,%ZMM5,%ZMM18 |
(278) 0x43a754 VMOVSD 0x1051c4(%RIP),%XMM0 |
(278) 0x43a75c VDIVSD %XMM6,%XMM0,%XMM0 |
(278) 0x43a760 VBROADCASTSD %XMM0,%ZMM0 |
(278) 0x43a766 VMINPD %ZMM5,%ZMM11,%ZMM5 |
(278) 0x43a76c VFMADD213PD %ZMM11,%ZMM27,%ZMM11 |
(278) 0x43a772 VMINPD %ZMM17,%ZMM12,%ZMM17 |
(278) 0x43a778 VFMADD213PD %ZMM12,%ZMM28,%ZMM12 |
(278) 0x43a77e VEXTRACTI64X4 $0x1,%ZMM2,%YMM16 |
(278) 0x43a785 VPMOVSXDQ %YMM2,%ZMM2 |
(278) 0x43a78b VPMOVSXDQ %YMM16,%ZMM16 |
(278) 0x43a791 VPSUBQ %ZMM8,%ZMM16,%ZMM16 |
(278) 0x43a797 VPSUBQ %ZMM8,%ZMM2,%ZMM2 |
(278) 0x43a79d VPXOR %XMM4,%XMM4,%XMM4 |
(278) 0x43a7a1 VGATHERQPD (%RDI,%ZMM2,8),%ZMM4{%K3} |
(278) 0x43a7a8 VXORPD %XMM2,%XMM2,%XMM2 |
(278) 0x43a7ac VGATHERQPD (%RDI,%ZMM16,8),%ZMM2{%K2} |
(278) 0x43a7b3 VDIVPD %ZMM4,%ZMM11,%ZMM4 |
(278) 0x43a7b9 VDIVPD %ZMM2,%ZMM12,%ZMM2 |
(278) 0x43a7bf VFMADD231PD %ZMM18,%ZMM0,%ZMM4 |
(278) 0x43a7c5 VFMADD231PD %ZMM1,%ZMM0,%ZMM2 |
(278) 0x43a7cb VMULSD 0xce62d(%RIP),%XMM6,%XMM0 |
(278) 0x43a7d3 VBROADCASTSD %XMM0,%ZMM0 |
(278) 0x43a7d9 VMULPD %ZMM2,%ZMM0,%ZMM1 |
(278) 0x43a7df VMULPD %ZMM4,%ZMM0,%ZMM0 |
(278) 0x43a7e5 VMINPD %ZMM17,%ZMM1,%ZMM11 |
(278) 0x43a7eb VMINPD %ZMM5,%ZMM0,%ZMM6 |
(278) 0x43a7f1 VFPCLASSPD $0x56,%ZMM3,%K1 |
(278) 0x43a7f8 VFPCLASSPD $0x56,%ZMM31,%K2 |
(278) 0x43a7ff VBROADCASTSD 0xce5ef(%RIP),%ZMM0 |
(278) 0x43a809 VXORPD %ZMM0,%ZMM11,%ZMM11{%K2} |
(278) 0x43a80f VXORPD %ZMM0,%ZMM6,%ZMM6{%K1} |
(278) 0x43a815 VCMPPD $0x1,%ZMM29,%ZMM15,%K1 |
(278) 0x43a81c VCMPPD $0x1,%ZMM30,%ZMM15,%K2 |
(278) 0x43a823 VMOVAPD %ZMM11,%ZMM0{%K2}{z} |
(278) 0x43a829 VMOVAPD %ZMM6,%ZMM1{%K1}{z} |
(278) 0x43a82f VBROADCASTSD 0x1050e7(%RIP),%ZMM3 |
(278) 0x43a839 VSUBPD %ZMM28,%ZMM3,%ZMM2 |
(278) 0x43a83f VSUBPD %ZMM27,%ZMM3,%ZMM3 |
(278) 0x43a845 VFMADD213PD %ZMM25,%ZMM1,%ZMM3 |
(278) 0x43a84b VFMADD213PD %ZMM26,%ZMM0,%ZMM2 |
(278) 0x43a851 VMULPD %ZMM24,%ZMM2,%ZMM0 |
(278) 0x43a857 VMULPD %ZMM23,%ZMM3,%ZMM1 |
(278) 0x43a85d VMOVUPD %ZMM1,(%R9,%RDX,8) |
(278) 0x43a864 VMOVUPD %ZMM0,0x40(%R9,%RDX,8) |
(278) 0x43a86c ADD $0x10,%RDX |
(278) 0x43a870 CMP %RBX,%RDX |
(278) 0x43a873 JA 43ab00 |
(278) 0x43a879 VMOVUPD (%R8,%RDX,8),%ZMM23 |
(278) 0x43a880 VMOVUPD 0x40(%R8,%RDX,8),%ZMM24 |
(278) 0x43a888 VFPCLASSPD $0x50,%ZMM23,%K0 |
(278) 0x43a88f VFPCLASSPD $0x50,%ZMM24,%K1 |
(278) 0x43a896 KUNPCKBW %K0,%K1,%K1 |
(278) 0x43a89a VPBLENDMD %ZMM21,%ZMM22,%ZMM3{%K1} |
(278) 0x43a8a0 VEXTRACTI64X4 $0x1,%ZMM3,%YMM6 |
(278) 0x43a8a7 VPMOVSXDQ %YMM6,%ZMM6 |
(278) 0x43a8ad VPMOVSXDQ %YMM3,%ZMM3 |
(278) 0x43a8b3 VPSUBQ %ZMM8,%ZMM3,%ZMM11 |
(278) 0x43a8b9 VPSUBQ %ZMM8,%ZMM6,%ZMM6 |
(278) 0x43a8bf VPXOR %XMM3,%XMM3,%XMM3 |
(278) 0x43a8c3 VPMULLQ %ZMM6,%ZMM9,%ZMM3 |
(278) 0x43a8c9 VPXOR %XMM12,%XMM12,%XMM12 |
(278) 0x43a8ce VPMULLQ %ZMM11,%ZMM9,%ZMM12 |
(278) 0x43a8d4 LEA (%R10,%RDX,1),%RSI |
(278) 0x43a8d8 VMOVQ %RSI,%XMM25 |
(278) 0x43a8de VPSUBQ 0x70(%RSP),%XMM25,%XMM25 |
(278) 0x43a8e6 VPSLLQ $0x3,%XMM25,%XMM25 |
(278) 0x43a8ed VPBROADCASTQ %XMM25,%ZMM25 |
(278) 0x43a8f3 VPADDQ 0xcd443(%RIP),%ZMM25,%ZMM26 |
(278) 0x43a8fd VPADDQ %ZMM26,%ZMM10,%ZMM27 |
(278) 0x43a903 VPADDQ %ZMM12,%ZMM27,%ZMM12 |
(278) 0x43a909 VPXORD %XMM27,%XMM27,%XMM27 |
(278) 0x43a90f KXNORW %K0,%K0,%K2 |
(278) 0x43a913 VGATHERQPD (,%ZMM12,1),%ZMM27{%K2} |
(278) 0x43a91e VPADDQ 0xcd3d8(%RIP),%ZMM25,%ZMM12 |
(278) 0x43a928 VPADDQ %ZMM12,%ZMM10,%ZMM25 |
(278) 0x43a92e VPADDQ %ZMM3,%ZMM25,%ZMM25 |
(278) 0x43a934 VPXOR %XMM3,%XMM3,%XMM3 |
(278) 0x43a938 KXNORW %K0,%K0,%K2 |
(278) 0x43a93c VGATHERQPD (,%ZMM25,1),%ZMM3{%K2} |
(278) 0x43a947 VPMULLQ %ZMM11,%ZMM13,%ZMM11 |
(278) 0x43a94d VPADDQ %ZMM26,%ZMM14,%ZMM28 |
(278) 0x43a953 VPADDQ %ZMM11,%ZMM28,%ZMM11 |
(278) 0x43a959 VXORPD %XMM25,%XMM25,%XMM25 |
(278) 0x43a95f KXNORW %K0,%K0,%K2 |
(278) 0x43a963 VGATHERQPD (,%ZMM11,1),%ZMM25{%K2} |
(278) 0x43a96e VPMULLQ %ZMM6,%ZMM13,%ZMM6 |
(278) 0x43a974 VPADDQ %ZMM12,%ZMM14,%ZMM11 |
(278) 0x43a97a VPADDQ %ZMM6,%ZMM11,%ZMM6 |
(278) 0x43a980 VPXORD %XMM26,%XMM26,%XMM26 |
(278) 0x43a986 KXNORW %K0,%K0,%K2 |
(278) 0x43a98a VGATHERQPD (,%ZMM6,1),%ZMM26{%K2} |
(278) 0x43a995 VPBLENDMD %ZMM19,%ZMM20,%ZMM6{%K1} |
(278) 0x43a99b VEXTRACTI64X4 $0x1,%ZMM6,%YMM12 |
(278) 0x43a9a2 VPMOVSXDQ %YMM12,%ZMM12 |
(278) 0x43a9a8 VPMOVSXDQ %YMM6,%ZMM6 |
(278) 0x43a9ae VPSUBQ %ZMM8,%ZMM6,%ZMM6 |
(278) 0x43a9b4 VPMULLQ %ZMM6,%ZMM13,%ZMM6 |
(278) 0x43a9ba VPADDQ %ZMM6,%ZMM28,%ZMM6 |
(278) 0x43a9c0 VXORPD %XMM29,%XMM29,%XMM29 |
(278) 0x43a9c6 KXNORW %K0,%K0,%K2 |
(278) 0x43a9ca VGATHERQPD (,%ZMM6,1),%ZMM29{%K2} |
(278) 0x43a9d5 VPSUBQ %ZMM8,%ZMM12,%ZMM6 |
(278) 0x43a9db VPMULLQ %ZMM6,%ZMM13,%ZMM6 |
(278) 0x43a9e1 VPADDQ %ZMM6,%ZMM11,%ZMM6 |
(278) 0x43a9e7 VPXOR %XMM12,%XMM12,%XMM12 |
(278) 0x43a9ec KXNORW %K0,%K0,%K2 |
(278) 0x43a9f0 VGATHERQPD (,%ZMM6,1),%ZMM12{%K2} |
(278) 0x43a9fb VPBLENDMD %ZMM22,%ZMM21,%ZMM30{%K1} |
(278) 0x43aa01 VANDPD %ZMM7,%ZMM23,%ZMM31 |
(278) 0x43aa07 VANDPD %ZMM7,%ZMM24,%ZMM2 |
(278) 0x43aa0d VXORPD %XMM6,%XMM6,%XMM6 |
(278) 0x43aa11 VDIVPD %ZMM27,%ZMM31,%ZMM27 |
(278) 0x43aa17 VEXTRACTI64X4 $0x1,%ZMM30,%YMM31 |
(278) 0x43aa1e VPMOVSXDQ %YMM31,%ZMM31 |
(278) 0x43aa24 VPMOVSXDQ %YMM30,%ZMM30 |
(278) 0x43aa2a VPSUBQ %ZMM8,%ZMM30,%ZMM30 |
(278) 0x43aa30 VPMULLQ %ZMM30,%ZMM13,%ZMM30 |
(278) 0x43aa36 VPADDQ %ZMM30,%ZMM28,%ZMM28 |
(278) 0x43aa3c VPXORD %XMM30,%XMM30,%XMM30 |
(278) 0x43aa42 KXNORW %K0,%K0,%K2 |
(278) 0x43aa46 VGATHERQPD (,%ZMM28,1),%ZMM30{%K2} |
(278) 0x43aa51 VPSUBQ %ZMM8,%ZMM31,%ZMM28 |
(278) 0x43aa57 VPMULLQ %ZMM28,%ZMM13,%ZMM28 |
(278) 0x43aa5d VPADDQ %ZMM28,%ZMM11,%ZMM11 |
(278) 0x43aa63 VPXORD %XMM31,%XMM31,%XMM31 |
(278) 0x43aa69 KXNORW %K0,%K0,%K2 |
(278) 0x43aa6d VGATHERQPD (,%ZMM11,1),%ZMM31{%K2} |
(278) 0x43aa78 VDIVPD %ZMM3,%ZMM2,%ZMM28 |
(278) 0x43aa7e VSUBPD %ZMM12,%ZMM26,%ZMM12 |
(278) 0x43aa84 VSUBPD %ZMM29,%ZMM25,%ZMM11 |
(278) 0x43aa8a VSUBPD %ZMM26,%ZMM31,%ZMM31 |
(278) 0x43aa90 VSUBPD %ZMM25,%ZMM30,%ZMM3 |
(278) 0x43aa96 VMULPD %ZMM11,%ZMM3,%ZMM29 |
(278) 0x43aa9c VMULPD %ZMM12,%ZMM31,%ZMM30 |
(278) 0x43aaa2 VCMPPD $0x1,%ZMM30,%ZMM6,%K0 |
(278) 0x43aaa9 VCMPPD $0x1,%ZMM29,%ZMM6,%K2 |
(278) 0x43aab0 KORTESTB %K0,%K2 |
(278) 0x43aab4 JNE 43a700 |
(278) 0x43aaba VXORPD %XMM11,%XMM11,%XMM11 |
(278) 0x43aabf JMP 43a815 |
0x43ab00 MOV %R11,0x20(%RSP) |
0x43ab05 CMP %R15,0x60(%RSP) |
0x43ab0a VMOVDDUP 0xce26e(%RIP),%XMM5 |
0x43ab12 VXORPD %XMM12,%XMM12,%XMM12 |
0x43ab17 VMOVSD 0xce2cf(%RIP),%XMM16 |
0x43ab21 VMOVSD 0x104df5(%RIP),%XMM17 |
0x43ab2b VMOVDDUP 0xce2c3(%RIP),%XMM18 |
0x43ab35 JE 43a600 |
0x43ab3b JMP 43ab76 |
0x43ab40 MOV 0x88(%RSP),%RAX |
0x43ab48 LEA -0x1(%RSI,%RAX,1),%RAX |
0x43ab4d MOV %RAX,%RCX |
0x43ab50 SUB %R13,%RCX |
0x43ab53 MOV %RCX,0x20(%RSP) |
0x43ab58 MOV 0x158(%RSP),%RDX |
0x43ab60 LEA (%RDX,%RSI,1),%ECX |
0x43ab63 LEA 0x1(%RSI,%RDX,1),%EDI |
0x43ab67 MOV %EDI,0x18(%RSP) |
0x43ab6b LEA -0x2(%RSI,%RDX,1),%EDX |
0x43ab6f MOV %EDX,0x10(%RSP) |
0x43ab73 XOR %R15D,%R15D |
0x43ab76 MOV 0xb8(%RSP),%RDX |
0x43ab7e SUB %R15,%RDX |
0x43ab81 MOV 0x150(%RSP),%RSI |
0x43ab89 ADD %R15,%RSI |
0x43ab8c MOV 0x50(%RSP),%RDI |
0x43ab91 LEA (%RDI,%RSI,8),%RBX |
0x43ab95 ADD 0xa0(%RSP),%R15 |
0x43ab9d MOV 0x98(%RBP),%RDI |
0x43aba4 LEA (%RDI,%R15,8),%R9 |
0x43aba8 MOV 0x50(%RBP),%RDI |
0x43abac LEA (%RDI,%R15,8),%R15 |
0x43abb0 MOV 0x40(%RSP),%RDI |
0x43abb5 LEA (%RDI,%RSI,8),%RDI |
0x43abb9 XOR %R12D,%R12D |
0x43abbc VMOVSD 0xce23a(%RIP),%XMM22 |
0x43abc6 JMP 43ac24 |
(277) 0x43ac00 VSUBSD %XMM3,%XMM17,%XMM0 |
(277) 0x43ac06 VFMADD213SD %XMM20,%XMM21,%XMM0 |
(277) 0x43ac0c VMULSD %XMM19,%XMM0,%XMM0 |
(277) 0x43ac12 VMOVSD %XMM0,(%RBX,%R12,8) |
(277) 0x43ac18 INC %R12 |
(277) 0x43ac1b CMP %R12,%RDX |
(277) 0x43ac1e JE 43a600 |
(277) 0x43ac24 VMOVSD (%RDI,%R12,8),%XMM19 |
(277) 0x43ac2b VPXORD %XMM21,%XMM21,%XMM21 |
(277) 0x43ac31 VUCOMISD %XMM19,%XMM21 |
(277) 0x43ac37 MOV 0x10(%RSP),%ESI |
(277) 0x43ac3b CMOVA 0x18(%RSP),%ESI |
(277) 0x43ac40 MOV %EAX,%R8D |
(277) 0x43ac43 CMOVA %ECX,%R8D |
(277) 0x43ac47 MOV %ECX,%R10D |
(277) 0x43ac4a CMOVA %EAX,%R10D |
(277) 0x43ac4e VANDPD %XMM5,%XMM19,%XMM0 |
(277) 0x43ac54 MOVSXD %R8D,%R8 |
(277) 0x43ac57 SUB %R13,%R8 |
(277) 0x43ac5a MOV 0x28(%RSP),%R11 |
(277) 0x43ac5f IMUL %R8,%R11 |
(277) 0x43ac63 ADD %R15,%R11 |
(277) 0x43ac66 VDIVSD (%R11,%R12,8),%XMM0,%XMM3 |
(277) 0x43ac6c IMUL %R14,%R8 |
(277) 0x43ac70 ADD %R9,%R8 |
(277) 0x43ac73 VMOVSD (%R8,%R12,8),%XMM20 |
(277) 0x43ac7a MOVSXD %R10D,%R8 |
(277) 0x43ac7d SUB %R13,%R8 |
(277) 0x43ac80 IMUL %R14,%R8 |
(277) 0x43ac84 ADD %R9,%R8 |
(277) 0x43ac87 VMOVHPD (%R8,%R12,8),%XMM20,%XMM0 |
(277) 0x43ac8e MOVSXD %ESI,%RSI |
(277) 0x43ac91 SUB %R13,%RSI |
(277) 0x43ac94 IMUL %R14,%RSI |
(277) 0x43ac98 ADD %R9,%RSI |
(277) 0x43ac9b VMOVSD (%RSI,%R12,8),%XMM1 |
(277) 0x43aca1 VPUNPCKLQDQ %XMM20,%XMM1,%XMM1 |
(277) 0x43aca7 VSUBPD %XMM1,%XMM0,%XMM11 |
(277) 0x43acab VSHUFPD $0x1,%XMM11,%XMM11,%XMM6 |
(277) 0x43acb1 VMULSD %XMM6,%XMM11,%XMM0 |
(277) 0x43acb5 VUCOMISD %XMM21,%XMM0 |
(277) 0x43acbb JBE 43ac00 |
(277) 0x43acc1 VUCOMISD %XMM19,%XMM12 |
(277) 0x43acc7 MOV 0x10(%RSP),%ESI |
(277) 0x43accb CMOVA %ECX,%ESI |
(277) 0x43acce MOV 0x28(%RBP),%R8 |
(277) 0x43acd2 MOV 0x20(%RSP),%R10 |
(277) 0x43acd7 VMOVSD (%R8,%R10,8),%XMM0 |
(277) 0x43acdd VANDPD %XMM5,%XMM11,%XMM1 |
(277) 0x43ace1 VSUBSD %XMM3,%XMM16,%XMM2 |
(277) 0x43ace7 VADDSD %XMM17,%XMM3,%XMM4 |
(277) 0x43aced VPUNPCKLQDQ %XMM2,%XMM4,%XMM2 |
(277) 0x43acf1 MOVSXD %ESI,%RSI |
(277) 0x43acf4 SUB %R13,%RSI |
(277) 0x43acf7 VMULPD %XMM2,%XMM1,%XMM2 |
(277) 0x43acfb VMOVSD (%R8,%RSI,8),%XMM4 |
(277) 0x43ad01 VPUNPCKLQDQ %XMM0,%XMM4,%XMM4 |
(277) 0x43ad05 VDIVPD %XMM4,%XMM2,%XMM2 |
(277) 0x43ad09 VSHUFPD $0x1,%XMM2,%XMM2,%XMM4 |
(277) 0x43ad0e VADDSD %XMM4,%XMM2,%XMM2 |
(277) 0x43ad12 VMULSD %XMM22,%XMM0,%XMM0 |
(277) 0x43ad18 VMULSD %XMM2,%XMM0,%XMM0 |
(277) 0x43ad1c VSHUFPS $0x4e,%XMM1,%XMM1,%XMM2 |
(277) 0x43ad21 VMINSD %XMM2,%XMM1,%XMM1 |
(277) 0x43ad25 VMINSD %XMM1,%XMM0,%XMM21 |
(277) 0x43ad2b VXORPD %XMM18,%XMM21,%XMM0 |
(277) 0x43ad31 VCMPSD $0x2,%XMM12,%XMM6,%K1 |
(277) 0x43ad38 VMOVSD %XMM0,%XMM21,%XMM21{%K1} |
(277) 0x43ad3e JMP 43ac00 |
/scratch_na/users/xoserete/qaas_runs/171-322-0339/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/advec_mom_kernel.f90: 81 - 241 |
-------------------------------------------------------------------------------- |
81: IF(mom_sweep.EQ.1)THEN ! x 1 |
[...] |
215: IF(node_flux(j,k).LT.0.0)THEN |
[...] |
227: sigma=ABS(node_flux(j,k))/(node_mass_pre(j,donor)) |
228: width=celldy(k) |
229: vdiffuw=vel1(j,donor)-vel1(j,upwind) |
230: vdiffdw=vel1(j,downwind)-vel1(j,donor) |
231: limiter=0.0 |
232: IF(vdiffuw*vdiffdw.GT.0.0)THEN |
233: auw=ABS(vdiffuw) |
234: adw=ABS(vdiffdw) |
235: wind=1.0_8 |
236: IF(vdiffdw.LE.0.0) wind=-1.0_8 |
237: limiter=wind*MIN(width*((2.0_8-sigma)*adw/width+(1.0_8+sigma)*auw/celldy(dif))/6.0_8,auw,adw) |
238: ENDIF |
239: advec_vel_s=vel1(j,donor)+(1.0_8-sigma)*limiter |
240: mom_flux(j,k)=advec_vel_s*node_flux(j,k) |
241: ENDDO |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.25 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 12.28 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.34 |
Bottlenecks | micro-operation queue, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:241-241 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 13.00 |
CQA cycles if no scalar integer | 4.00 |
CQA cycles if FP arith vectorized | 13.00 |
CQA cycles if fully vectorized | 1.06 |
Front-end cycles | 13.00 |
DIV/SQRT cycles | 5.50 |
P0 cycles | 9.00 |
P1 cycles | 9.67 |
P2 cycles | 9.67 |
P3 cycles | 5.50 |
P4 cycles | 5.60 |
P5 cycles | 5.50 |
P6 cycles | 5.50 |
P7 cycles | 5.50 |
P8 cycles | 5.50 |
P9 cycles | 5.40 |
P10 cycles | 9.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 13.48 |
Stall cycles (UFS) | 0.00 |
Nb insns | 76.00 |
Nb uops | 78.00 |
Nb loads | 29.00 |
Nb stores | 11.00 |
Nb stack references | 23.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 23.38 |
Bytes prefetched | 0.00 |
Bytes loaded | 232.00 |
Bytes stored | 72.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 2.86 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 10.00 |
Vector-efficiency ratio all | 11.25 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 10.23 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 10.63 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.25 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 12.28 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.34 |
Bottlenecks | micro-operation queue, |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source | advec_mom_kernel.f90:81-81,advec_mom_kernel.f90:215-215,advec_mom_kernel.f90:241-241 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 13.00 |
CQA cycles if no scalar integer | 4.00 |
CQA cycles if FP arith vectorized | 13.00 |
CQA cycles if fully vectorized | 1.06 |
Front-end cycles | 13.00 |
DIV/SQRT cycles | 5.50 |
P0 cycles | 9.00 |
P1 cycles | 9.67 |
P2 cycles | 9.67 |
P3 cycles | 5.50 |
P4 cycles | 5.60 |
P5 cycles | 5.50 |
P6 cycles | 5.50 |
P7 cycles | 5.50 |
P8 cycles | 5.50 |
P9 cycles | 5.40 |
P10 cycles | 9.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 13.48 |
Stall cycles (UFS) | 0.00 |
Nb insns | 76.00 |
Nb uops | 78.00 |
Nb loads | 29.00 |
Nb stores | 11.00 |
Nb stack references | 23.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 23.38 |
Bytes prefetched | 0.00 |
Bytes loaded | 232.00 |
Bytes stored | 72.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 2.86 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 10.00 |
Vector-efficiency ratio all | 11.25 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 10.23 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 10.63 |
Path / |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 76 |
nb uops | 78 |
loop length | 394 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 4 |
nb stack references | 23 |
micro-operation queue | 13.00 cycles |
front end | 13.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.50 | 6.00 | 9.67 | 9.67 | 5.50 | 5.60 | 5.50 | 5.50 | 5.50 | 5.50 | 5.40 | 9.67 |
cycles | 5.50 | 9.00 | 9.67 | 9.67 | 5.50 | 5.60 | 5.50 | 5.50 | 5.50 | 5.50 | 5.40 | 9.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.48 |
Stall cycles | 0.00 |
Front-end | 13.00 |
Dispatch | 9.67 |
Overall L1 | 13.00 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 16% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 2% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
all | 10% |
load | 12% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 14% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 12% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x90(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RSP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RCX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x98(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x38(%RSP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %RCX,0x50(%RSP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
ADD %RAX,0x40(%RSP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
MOV 0x120(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP 0x58(%RSP),%RSI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x1(%RSI),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x68(%RSP),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 43a4c0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x26c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x60(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
AND $-0x10,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV %R8,0x38(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RSI,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JE 43ab40 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2d40> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA -0x1(%R15),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x88(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RSI,%RAX,1),%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RAX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R13,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0xa8(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xb0(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA (%RDX,%RSI,1),%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA 0x1(%RSI,%RDX,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDI,0x18(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EDI,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA -0x2(%RSI,%RDX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDX,0x10(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EDX,%ZMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %ECX,%ZMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EAX,%ZMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x28(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 43a879 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2a79> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV %R11,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %R15,0x60(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVDDUP 0xce26e(%RIP),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0xce2cf(%RIP),%XMM16 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x104df5(%RIP),%XMM17 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVDDUP 0xce2c3(%RIP),%XMM18 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 43a600 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2800> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 43ab76 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2d76> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV 0x88(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RSI,%RAX,1),%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R13,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x158(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDX,%RSI,1),%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA 0x1(%RSI,%RDX,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDI,0x18(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA -0x2(%RSI,%RDX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDX,0x10(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xb8(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %R15,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x150(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %R15,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x50(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%RSI,8),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD 0xa0(%RSP),%R15 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV 0x98(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%R15,8),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x50(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%R15,8),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x40(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%RSI,8),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0xce23a(%RIP),%XMM22 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 43ac24 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2e24> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
Function | advec_mom_kernel_.DIR.OMP.PARALLEL.2 |
Source file and lines | advec_mom_kernel.f90:81-241 |
Module | exec |
nb instructions | 76 |
nb uops | 78 |
loop length | 394 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 6 |
used ymm registers | 0 |
used zmm registers | 4 |
nb stack references | 23 |
micro-operation queue | 13.00 cycles |
front end | 13.00 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.50 | 6.00 | 9.67 | 9.67 | 5.50 | 5.60 | 5.50 | 5.50 | 5.50 | 5.50 | 5.40 | 9.67 |
cycles | 5.50 | 9.00 | 9.67 | 9.67 | 5.50 | 5.60 | 5.50 | 5.50 | 5.50 | 5.50 | 5.40 | 9.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 13.48 |
Stall cycles | 0.00 |
Front-end | 13.00 |
Dispatch | 9.67 |
Overall L1 | 13.00 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 16% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 2% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
all | 10% |
load | 12% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 14% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 11% |
load | 12% |
store | 10% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x90(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x30(%RSP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RCX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x98(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x38(%RSP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %RAX,%R8 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %RCX,0x50(%RSP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
ADD %RAX,0x40(%RSP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 |
MOV 0x120(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
CMP 0x58(%RSP),%RSI | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
LEA 0x1(%RSI),%RSI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x68(%RSP),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 43a4c0 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x26c0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x60(%RSP),%R15 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
AND $-0x10,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOV %R8,0x38(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R9,0x30(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %RSI,0x120(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
JE 43ab40 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2d40> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA -0x1(%R15),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x88(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RSI,%RAX,1),%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RAX,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R13,%R11 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0xa8(%RBP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0xb0(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %ECX,%EDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA (%RDX,%RSI,1),%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA 0x1(%RSI,%RDX,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDI,0x18(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EDI,%ZMM19 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LEA -0x2(%RSI,%RDX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDX,0x10(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
VPBROADCASTD %EDX,%ZMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %ECX,%ZMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VPBROADCASTD %EAX,%ZMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
XOR %EDX,%EDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x28(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 43a879 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2a79> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
MOV %R11,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
CMP %R15,0x60(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
VMOVDDUP 0xce26e(%RIP),%XMM5 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VXORPD %XMM12,%XMM12,%XMM12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0xce2cf(%RIP),%XMM16 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x104df5(%RIP),%XMM17 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVDDUP 0xce2c3(%RIP),%XMM18 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JE 43a600 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2800> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 43ab76 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2d76> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
MOV 0x88(%RSP),%RAX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%RSI,%RAX,1),%RAX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %RAX,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %R13,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV %RCX,0x20(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV 0x158(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDX,%RSI,1),%ECX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
LEA 0x1(%RSI,%RDX,1),%EDI | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDI,0x18(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA -0x2(%RSI,%RDX,1),%EDX | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOV %EDX,0x10(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0xb8(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
SUB %R15,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x150(%RSP),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD %R15,%RSI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOV 0x50(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%RSI,8),%RBX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD 0xa0(%RSP),%R15 | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 |
MOV 0x98(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%R15,8),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x50(%RBP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%R15,8),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV 0x40(%RSP),%RDI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%RDI,%RSI,8),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
XOR %R12D,%R12D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
VMOVSD 0xce23a(%RIP),%XMM22 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
JMP 43ac24 <advec_mom_kernel_mod_mp_advec_mom_kernel_.DIR.OMP.PARALLEL.2+0x2e24> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |