Loop Id: 142 | Module: libqmcparticle.so | Source: ParticleIOUtility.h:70-91 [...] | Coverage: 0.37% |
---|
Loop Id: 142 | Module: libqmcparticle.so | Source: ParticleIOUtility.h:70-91 [...] | Coverage: 0.37% |
---|
0x174f8 INC %R13 |
0x174fb CMP %R13,%RBX |
0x174fe JE 17758 |
0x17504 MOV -0x7a0(%RBP),%R10 [9] |
0x1750b CMP %R14D,(%R10,%R13,4) [2] |
0x1750f JNE 174f8 |
0x17511 MOV -0x5b8(%RBP),%RDX [9] |
0x17518 LEA (%R13,%R13,2),%R8 |
0x1751d LEA (%RDX,%R8,8),%R9 |
0x17521 VMOVSD 0x8(%R9),%XMM10 [10] |
0x17527 VMOVUPD (%R9),%XMM8 [10] |
0x1752c VMOVSD (%R9),%XMM7 [10] |
0x17531 VMOVSD 0x10(%R9),%XMM1 [10] |
0x17537 VRNDSCALESD $0x9,%XMM10,%XMM10,%XMM13 |
0x1753e VRNDSCALEPD $0x9,%XMM8,%XMM6 |
0x17545 VSUBSD %XMM13,%XMM10,%XMM14 |
0x1754a VRNDSCALESD $0x9,%XMM7,%XMM7,%XMM11 |
0x17551 VSUBSD %XMM11,%XMM7,%XMM12 |
0x17556 VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 |
0x1755d VMOVDDUP %XMM14,%XMM3 |
0x17562 VMULPD -0x720(%RBP),%XMM3,%XMM15 [9] |
0x1756a VMOVDDUP %XMM12,%XMM9 |
0x1756f VSUBPD %XMM6,%XMM8,%XMM5 |
0x17573 VSUBSD %XMM0,%XMM1,%XMM8 |
0x17577 VMOVAPD -0x730(%RBP),%XMM6 [9] |
0x1757f VMOVSD -0x610(%RBP),%XMM1 [9] |
0x17587 VFMADD132PD -0x740(%RBP),%XMM15,%XMM9 [9] |
0x17590 VMOVDDUP %XMM8,%XMM2 |
0x17595 VMOVSD %XMM5,%XMM5,%XMM7 |
0x17599 VFMADD132PD -0x7c0(%RBP),%XMM6,%XMM2 [9] |
0x175a2 VFMADD132SD -0x628(%RBP),%XMM1,%XMM8 [9] |
0x175ab VUNPCKHPD %XMM5,%XMM5,%XMM5 |
0x175af VMULSD -0x620(%RBP),%XMM5,%XMM10 [9] |
0x175b7 VADDPD -0x770(%RBP),%XMM9,%XMM4 [9] |
0x175bf VADDSD -0x758(%RBP),%XMM8,%XMM11 [9] |
0x175c7 VFMADD132SD -0x618(%RBP),%XMM10,%XMM7 [9] |
0x175d0 VADDPD %XMM2,%XMM4,%XMM6 |
0x175d4 VUNPCKHPD %XMM6,%XMM6,%XMM4 |
0x175d8 VMOVAPD %XMM6,%XMM3 |
0x175dc VMOVAPD %XMM6,-0x700(%RBP) [9] |
0x175e4 VMULSD 0xc8(%R15),%XMM4,%XMM0 [1] |
0x175ed VADDSD %XMM7,%XMM11,%XMM12 |
0x175f1 VMULSD 0xd0(%R15),%XMM4,%XMM1 [1] |
0x175fa VMULSD 0xd8(%R15),%XMM4,%XMM2 [1] |
0x17603 VMOVSD %XMM12,-0x6f0(%RBP) [9] |
0x1760b VFMADD231SD 0xb0(%R15),%XMM6,%XMM0 [1] |
0x17614 VFMADD231SD 0xb8(%R15),%XMM6,%XMM1 [1] |
0x1761d VFMADD231SD 0xc0(%R15),%XMM6,%XMM2 [1] |
0x17626 VFMADD231SD 0xe0(%R15),%XMM12,%XMM0 [1] |
0x1762f VFMADD231SD 0xe8(%R15),%XMM12,%XMM1 [1] |
0x17638 VFMADD231SD 0xf0(%R15),%XMM12,%XMM2 [1] |
0x17641 VCOMISD 0x7127(%RIP),%XMM0 [6] |
0x17649 SETAE %R11B |
0x1764d VCOMISD 0x7123(%RIP),%XMM0 [6] |
0x17655 SETB %SIL |
0x17659 AND %ESI,%R11D |
0x1765c VCOMISD 0x710c(%RIP),%XMM1 [6] |
0x17664 SETAE %CL |
0x17667 AND %ECX,%R11D |
0x1766a VCOMISD 0x7106(%RIP),%XMM1 [6] |
0x17672 SETB %DIL |
0x17676 AND %EDI,%R11D |
0x17679 VCOMISD 0x70ef(%RIP),%XMM2 [6] |
0x17681 SETAE %AL |
0x17684 TEST %AL,%R11B |
0x17687 JE 174f8 |
0x1768d VCOMISD 0x70e3(%RIP),%XMM2 [6] |
0x17695 JAE 174f8 |
0x1769b LEA -0x4f0(%RBP),%RDI |
0x176a2 LEA 0x6297(%RIP),%RDX |
0x176a9 MOV $0x1f4,%ESI |
0x176ae MOV %R14D,%ECX |
0x176b1 VMOVSD %XMM12,%XMM12,%XMM5 |
0x176b5 MOV $0x6,%EAX |
0x176ba MOV %RDI,-0x600(%RBP) [9] |
0x176c1 CALL 5450 <snprintf@plt> |
0x176c6 MOV -0x600(%RBP),%RDI [9] |
0x176cd CALL 5150 <strlen@plt> |
0x176d2 MOV 0xa8cf(%RIP),%R10 [6] |
0x176d9 MOV -0x600(%RBP),%RSI [9] |
0x176e0 MOV %RAX,%RDX |
0x176e3 MOV (%R10),%RDI [3] |
0x176e6 CALL 5310 <_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l@plt> |
0x176eb MOVSXD -0x6b0(%RBP),%R8 [9] |
0x176f2 MOV 0x5e8(%R15),%RDX [1] |
0x176f9 MOV 0x5c0(%R15),%RCX [1] |
0x17700 MOV 0x570(%R15),%RAX [1] |
0x17707 VMOVAPD -0x700(%RBP),%XMM13 [9] |
0x1770f VMOVSD -0x6f0(%RBP),%XMM14 [9] |
0x17717 LEA (%R8,%R8,2),%R11 |
0x1771b MOV %R8,%R9 |
0x1771e MOV 0x598(%R15),%RDI [1] |
0x17725 LEA (%RDX,%R11,8),%RSI |
0x17729 INC %R9D |
0x1772c VMOVUPD %XMM13,(%RSI) [7] |
0x17730 VMOVSD %XMM14,0x10(%RSI) [7] |
0x17735 MOV %R14D,(%RCX,%R8,4) [8] |
0x17739 MOV %R8D,(%RAX,%R8,4) [4] |
0x1773d MOV %R13D,(%RDI,%R8,4) [5] |
0x17741 INC %R13 |
0x17744 MOV %R9D,-0x6b0(%RBP) [9] |
0x1774b CMP %R13,%RBX |
0x1774e JNE 17504 |
/usr/include/c++/13.1.1/ostream: 667 - 667 |
-------------------------------------------------------------------------------- |
667: __ostream_insert(__out, __s, |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/PETE/OperatorTags.h: 43 - 183 |
-------------------------------------------------------------------------------- |
43: return (a + b); |
[...] |
183: return (const_cast<T1&>(a) = b); |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Particle/ParticleIOUtility.h: 70 - 91 |
-------------------------------------------------------------------------------- |
70: for (int iat = 0; iat < primPos.size(); iat++) |
71: { |
72: if (primTypes[iat] != ns) |
73: continue; |
74: SingleParticlePos_t uPrim = primPos[iat]; |
75: for (int i = 0; i < 3; i++) |
76: uPrim[i] -= std::floor(uPrim[i]); |
77: SingleParticlePos_t r = PrimCell.toCart(uPrim) + (double)i0 * PrimCell.a(0) + (double)i1 * PrimCell.a(1) + |
78: (double)i2 * PrimCell.a(2); |
79: SingleParticlePos_t uSuper = ref_.Lattice.toUnit(r); |
80: if ((uSuper[0] >= -1.0e-6) && (uSuper[0] < 0.9999) && (uSuper[1] >= -1.0e-6) && (uSuper[1] < 0.9999) && |
81: (uSuper[2] >= -1.0e-6) && (uSuper[2] < 0.9999)) |
82: { |
83: char buff[500]; |
84: snprintf(buff, 500, " %10.4f %10.4f %10.4f %12.6f %12.6f %12.6f %d\n", uSuper[0], uSuper[1], |
85: uSuper[2], r[0], r[1], r[2], ns); |
86: app_log() << buff; |
87: ref_.R[index] = r; |
88: ref_.GroupID[index] = ns; // primTypes[iat]; |
89: ref_.ID[index] = index; |
90: ref_.PCID[index] = iat; |
91: index++; |
/usr/include/c++/13.1.1/bits/char_traits.h: 409 - 409 |
-------------------------------------------------------------------------------- |
409: return __builtin_strlen(__s); |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/OhmmsVector.h: 223 - 223 |
-------------------------------------------------------------------------------- |
223: return X[i]; |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/Numerics/OhmmsPETE/TinyVectorTensorOps.h: 150 - 152 |
-------------------------------------------------------------------------------- |
150: return TinyVector<Type_t, 3>(lhs[0] * rhs[0] + lhs[1] * rhs[3] + lhs[2] * rhs[6], |
151: lhs[0] * rhs[1] + lhs[1] * rhs[4] + lhs[2] * rhs[7], |
152: lhs[0] * rhs[2] + lhs[1] * rhs[5] + lhs[2] * rhs[8]); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►100.00+ | main | nio.hpp:19 | exec |
○ | __libc_init_first | libc.so.6 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.21 |
CQA speedup if FP arith vectorized | 1.42 |
CQA speedup if fully vectorized | 7.69 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.14 |
Bottlenecks | |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source | ostream:667-667,OperatorTags.h:43-183,ParticleIOUtility.h:70-91,char_traits.h:409-409,OhmmsVector.h:223-223,TinyVectorTensorOps.h:150-152 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 17.50 |
CQA cycles if no scalar integer | 14.50 |
CQA cycles if FP arith vectorized | 12.36 |
CQA cycles if fully vectorized | 2.27 |
Front-end cycles | 17.06 |
DIV/SQRT cycles | 14.75 |
P0 cycles | 14.75 |
P1 cycles | 13.50 |
P2 cycles | 13.50 |
P3 cycles | 4.00 |
P4 cycles | 9.50 |
P5 cycles | 9.50 |
P6 cycles | 4.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | 21.72 |
Stall cycles (UFS) | 4.48 |
Nb insns | 60.00 |
Nb uops | 67.25 |
Nb loads | 27.00 |
Nb stores | 3.25 |
Nb stack references | 11.50 |
FLOP/cycle | 1.76 |
Nb FLOP add-sub | 8.25 |
Nb FLOP mul | 4.50 |
Nb FLOP fma | 9.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 13.37 |
Bytes prefetched | 0.00 |
Bytes loaded | 249.00 |
Bytes stored | 30.00 |
Stride 0 | 2.50 |
Stride 1 | 1.00 |
Stride n | 0.50 |
Stride unknown | 1.25 |
Stride indirect | 0.25 |
Vectorization ratio all | 16.62 |
Vectorization ratio load | 20.85 |
Vectorization ratio store | 40.74 |
Vectorization ratio mul | 20.00 |
Vectorization ratio add_sub | 37.50 |
Vectorization ratio fma | 20.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 8.04 |
Vector-efficiency ratio all | 14.38 |
Vector-efficiency ratio load | 14.90 |
Vector-efficiency ratio store | 16.67 |
Vector-efficiency ratio mul | 15.00 |
Vector-efficiency ratio add_sub | 17.19 |
Vector-efficiency ratio fma | 15.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 13.25 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 2.00 |
Bottlenecks | |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source | ostream:667-667,OperatorTags.h:43-183,ParticleIOUtility.h:70-91,char_traits.h:409-409,OhmmsVector.h:223-223,TinyVectorTensorOps.h:150-152 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 3.00 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 3.00 |
CQA cycles if fully vectorized | 0.38 |
Front-end cycles | 1.50 |
DIV/SQRT cycles | 1.00 |
P0 cycles | 1.00 |
P1 cycles | 1.00 |
P2 cycles | 1.00 |
P3 cycles | 0.00 |
P4 cycles | 1.00 |
P5 cycles | 1.00 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | 1.63 |
Stall cycles (UFS) | 0.00 |
Nb insns | 6.00 |
Nb uops | 5.00 |
Nb loads | 2.00 |
Nb stores | 0.00 |
Nb stack references | 1.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 4.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 12.00 |
Bytes stored | 0.00 |
Stride 0 | 1.00 |
Stride 1 | 1.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.06 |
CQA speedup if FP arith vectorized | 1.30 |
CQA speedup if fully vectorized | 6.76 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.01 |
Bottlenecks | P0, P1, |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source | ostream:667-667,OperatorTags.h:43-183,ParticleIOUtility.h:70-91,char_traits.h:409-409,OhmmsVector.h:223-223,TinyVectorTensorOps.h:150-152 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 19.00 |
CQA cycles if no scalar integer | 18.00 |
CQA cycles if FP arith vectorized | 14.63 |
CQA cycles if fully vectorized | 2.81 |
Front-end cycles | 18.75 |
DIV/SQRT cycles | 19.00 |
P0 cycles | 19.00 |
P1 cycles | 15.50 |
P2 cycles | 15.50 |
P3 cycles | 2.00 |
P4 cycles | 10.00 |
P5 cycles | 10.00 |
P6 cycles | 2.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | 25.32 |
Stall cycles (UFS) | 6.12 |
Nb insns | 66.00 |
Nb uops | 74.00 |
Nb loads | 31.00 |
Nb stores | 2.00 |
Nb stack references | 14.00 |
FLOP/cycle | 2.16 |
Nb FLOP add-sub | 11.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 12.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.63 |
Bytes prefetched | 0.00 |
Bytes loaded | 292.00 |
Bytes stored | 24.00 |
Stride 0 | 3.00 |
Stride 1 | 1.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 22.45 |
Vectorization ratio load | 20.69 |
Vectorization ratio store | 50.00 |
Vectorization ratio mul | 20.00 |
Vectorization ratio add_sub | 37.50 |
Vectorization ratio fma | 20.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 11.11 |
Vector-efficiency ratio all | 15.18 |
Vector-efficiency ratio load | 14.87 |
Vector-efficiency ratio store | 18.75 |
Vector-efficiency ratio mul | 15.00 |
Vector-efficiency ratio add_sub | 17.19 |
Vector-efficiency ratio fma | 15.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 13.54 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.53 |
CQA speedup if FP arith vectorized | 1.69 |
CQA speedup if fully vectorized | 9.30 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.31 |
Bottlenecks | micro-operation queue, |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source | ostream:667-667,OperatorTags.h:43-183,ParticleIOUtility.h:70-91,char_traits.h:409-409,OhmmsVector.h:223-223,TinyVectorTensorOps.h:150-152 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 28.25 |
CQA cycles if no scalar integer | 18.50 |
CQA cycles if FP arith vectorized | 16.69 |
CQA cycles if fully vectorized | 3.04 |
Front-end cycles | 28.25 |
DIV/SQRT cycles | 19.50 |
P0 cycles | 19.50 |
P1 cycles | 21.50 |
P2 cycles | 21.50 |
P3 cycles | 12.00 |
P4 cycles | 16.00 |
P5 cycles | 16.00 |
P6 cycles | 12.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | 33.51 |
Stall cycles (UFS) | 5.58 |
Nb insns | 100.00 |
Nb uops | 112.00 |
Nb loads | 43.00 |
Nb stores | 9.00 |
Nb stack references | 17.00 |
FLOP/cycle | 1.45 |
Nb FLOP add-sub | 11.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 12.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.42 |
Bytes prefetched | 0.00 |
Bytes loaded | 392.00 |
Bytes stored | 72.00 |
Stride 0 | 3.00 |
Stride 1 | 1.00 |
Stride n | 2.00 |
Stride unknown | 3.00 |
Stride indirect | 1.00 |
Vectorization ratio all | 22.03 |
Vectorization ratio load | 21.88 |
Vectorization ratio store | 22.22 |
Vectorization ratio mul | 20.00 |
Vectorization ratio add_sub | 37.50 |
Vectorization ratio fma | 20.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 10.53 |
Vector-efficiency ratio all | 14.72 |
Vector-efficiency ratio load | 15.04 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | 15.00 |
Vector-efficiency ratio add_sub | 17.19 |
Vector-efficiency ratio fma | 15.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 13.49 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.07 |
CQA speedup if FP arith vectorized | 1.31 |
CQA speedup if fully vectorized | 6.87 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.01 |
Bottlenecks | micro-operation queue, |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source | ostream:667-667,OperatorTags.h:43-183,ParticleIOUtility.h:70-91,char_traits.h:409-409,OhmmsVector.h:223-223,TinyVectorTensorOps.h:150-152 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 19.75 |
CQA cycles if no scalar integer | 18.50 |
CQA cycles if FP arith vectorized | 15.13 |
CQA cycles if fully vectorized | 2.88 |
Front-end cycles | 19.75 |
DIV/SQRT cycles | 19.50 |
P0 cycles | 19.50 |
P1 cycles | 16.00 |
P2 cycles | 16.00 |
P3 cycles | 2.00 |
P4 cycles | 11.00 |
P5 cycles | 11.00 |
P6 cycles | 2.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 3 |
FE+BE cycles (UFS) | 26.41 |
Stall cycles (UFS) | 6.21 |
Nb insns | 68.00 |
Nb uops | 78.00 |
Nb loads | 32.00 |
Nb stores | 2.00 |
Nb stack references | 14.00 |
FLOP/cycle | 2.08 |
Nb FLOP add-sub | 11.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 12.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.41 |
Bytes prefetched | 0.00 |
Bytes loaded | 300.00 |
Bytes stored | 24.00 |
Stride 0 | 3.00 |
Stride 1 | 1.00 |
Stride n | 0.00 |
Stride unknown | 1.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 22.00 |
Vectorization ratio load | 20.00 |
Vectorization ratio store | 50.00 |
Vectorization ratio mul | 20.00 |
Vectorization ratio add_sub | 37.50 |
Vectorization ratio fma | 20.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 10.53 |
Vector-efficiency ratio all | 15.13 |
Vector-efficiency ratio load | 14.79 |
Vector-efficiency ratio store | 18.75 |
Vector-efficiency ratio mul | 15.00 |
Vector-efficiency ratio add_sub | 17.19 |
Vector-efficiency ratio fma | 15.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 13.49 |
Path / |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source file and lines | ParticleIOUtility.h:70-91 |
Module | libqmcparticle.so |
nb instructions | 60 |
nb uops | 67.25 |
loop length | 360.25 |
used x86 registers | 11.75 |
used mmx registers | 0 |
used xmm registers | 12 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 11.50 |
micro-operation queue | 17.06 cycles |
front end | 17.06 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 14.75 | 14.75 | 13.50 | 13.50 | 4.00 | 9.50 | 9.50 | 4.00 |
cycles | 14.75 | 14.75 | 13.50 | 13.50 | 4.00 | 9.50 | 9.50 | 4.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 3.00 |
FE+BE cycles | 21.72 |
Stall cycles | 4.48 |
RS full (events) | 0.25 |
LB full (events) | 7.23 |
Front-end | 17.06 |
Dispatch | 15.25 |
Data deps. | 3.00 |
Overall L1 | 17.50 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 23% |
load | 21% |
store | 50% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
all | 16% |
load | 20% |
store | 40% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 8% |
all | 9% |
load | 6% |
store | 7% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 15% |
load | 15% |
store | 18% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
all | 14% |
load | 14% |
store | 16% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source file and lines | ParticleIOUtility.h:70-91 |
Module | libqmcparticle.so |
nb instructions | 6 |
nb uops | 5 |
loop length | 25 |
used x86 registers | 5 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 1.50 cycles |
front end | 1.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 | 1.00 | 1.00 | 0.00 |
cycles | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 | 1.00 | 1.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 3.00 |
FE+BE cycles | 1.63 |
Stall cycles | 0.00 |
Front-end | 1.50 |
Dispatch | 1.00 |
Data deps. | 3.00 |
Overall L1 | 3.00 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
INC %R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R13,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 17758 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x7a0(%RBP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMP %R14D,(%R10,%R13,4) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JNE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source file and lines | ParticleIOUtility.h:70-91 |
Module | libqmcparticle.so |
nb instructions | 66 |
nb uops | 74 |
loop length | 405 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 14 |
ADD-SUB / MUL ratio | 1.60 |
micro-operation queue | 18.75 cycles |
front end | 18.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 19.00 | 19.00 | 15.50 | 15.50 | 2.00 | 10.00 | 10.00 | 2.00 |
cycles | 19.00 | 19.00 | 15.50 | 15.50 | 2.00 | 10.00 | 10.00 | 2.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 3.00 |
FE+BE cycles | 25.32 |
Stall cycles | 6.12 |
RS full (events) | 0.22 |
LB full (events) | 10.54 |
Front-end | 18.75 |
Dispatch | 19.00 |
Data deps. | 3.00 |
Overall L1 | 19.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 23% |
load | 21% |
store | 50% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
all | 22% |
load | 20% |
store | 50% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
all | 9% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 15% |
load | 15% |
store | 18% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 14% |
all | 15% |
load | 14% |
store | 18% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
INC %R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R13,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 17758 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x7a0(%RBP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMP %R14D,(%R10,%R13,4) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JNE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x5b8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R13,%R13,2),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%RDX,%R8,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x8(%R9),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%R9),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R9),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%R9),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VRNDSCALESD $0x9,%XMM10,%XMM10,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%XMM8,%XMM6 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM13,%XMM10,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM7,%XMM7,%XMM11 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM11,%XMM7,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMOVDDUP %XMM14,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULPD -0x720(%RBP),%XMM3,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM12,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBPD %XMM6,%XMM8,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM0,%XMM1,%XMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD -0x730(%RBP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD -0x610(%RBP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD132PD -0x740(%RBP),%XMM15,%XMM9 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM8,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM5,%XMM5,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132PD -0x7c0(%RBP),%XMM6,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD -0x628(%RBP),%XMM1,%XMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKHPD %XMM5,%XMM5,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULSD -0x620(%RBP),%XMM5,%XMM10 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD -0x770(%RBP),%XMM9,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD -0x758(%RBP),%XMM8,%XMM11 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD -0x618(%RBP),%XMM10,%XMM7 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD %XMM2,%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKHPD %XMM6,%XMM6,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVAPD %XMM6,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %XMM6,-0x700(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD 0xc8(%R15),%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM7,%XMM11,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xd0(%R15),%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xd8(%R15),%XMM4,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,-0x6f0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231SD 0xb0(%R15),%XMM6,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xb8(%R15),%XMM6,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xc0(%R15),%XMM6,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xe0(%R15),%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xe8(%R15),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xf0(%R15),%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD 0x7127(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %R11B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VCOMISD 0x7123(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETB %SIL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %ESI,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x710c(%RIP),%XMM1 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %ECX,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x7106(%RIP),%XMM1 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETB %DIL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %EDI,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x70ef(%RIP),%XMM2 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
TEST %AL,%R11B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source file and lines | ParticleIOUtility.h:70-91 |
Module | libqmcparticle.so |
nb instructions | 100 |
nb uops | 112 |
loop length | 592 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 17 |
ADD-SUB / MUL ratio | 1.60 |
micro-operation queue | 28.25 cycles |
front end | 28.25 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 19.50 | 19.50 | 21.50 | 21.50 | 12.00 | 16.00 | 16.00 | 12.00 |
cycles | 19.50 | 19.50 | 21.50 | 21.50 | 12.00 | 16.00 | 16.00 | 12.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 3.00 |
FE+BE cycles | 33.51 |
Stall cycles | 5.58 |
LB full (events) | 9.91 |
Front-end | 28.25 |
Dispatch | 21.50 |
Data deps. | 3.00 |
Overall L1 | 28.25 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 24% |
load | 22% |
store | 50% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
all | 22% |
load | 21% |
store | 22% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
all | 7% |
load | 6% |
store | 7% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 6% |
all | 15% |
load | 15% |
store | 18% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
all | 14% |
load | 15% |
store | 12% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
MOV -0x7a0(%RBP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMP %R14D,(%R10,%R13,4) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JNE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x5b8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R13,%R13,2),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%RDX,%R8,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x8(%R9),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%R9),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R9),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%R9),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VRNDSCALESD $0x9,%XMM10,%XMM10,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%XMM8,%XMM6 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM13,%XMM10,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM7,%XMM7,%XMM11 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM11,%XMM7,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMOVDDUP %XMM14,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULPD -0x720(%RBP),%XMM3,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM12,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBPD %XMM6,%XMM8,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM0,%XMM1,%XMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD -0x730(%RBP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD -0x610(%RBP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD132PD -0x740(%RBP),%XMM15,%XMM9 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM8,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM5,%XMM5,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132PD -0x7c0(%RBP),%XMM6,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD -0x628(%RBP),%XMM1,%XMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKHPD %XMM5,%XMM5,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULSD -0x620(%RBP),%XMM5,%XMM10 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD -0x770(%RBP),%XMM9,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD -0x758(%RBP),%XMM8,%XMM11 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD -0x618(%RBP),%XMM10,%XMM7 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD %XMM2,%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKHPD %XMM6,%XMM6,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVAPD %XMM6,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %XMM6,-0x700(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD 0xc8(%R15),%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM7,%XMM11,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xd0(%R15),%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xd8(%R15),%XMM4,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,-0x6f0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231SD 0xb0(%R15),%XMM6,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xb8(%R15),%XMM6,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xc0(%R15),%XMM6,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xe0(%R15),%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xe8(%R15),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xf0(%R15),%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD 0x7127(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %R11B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VCOMISD 0x7123(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETB %SIL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %ESI,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x710c(%RIP),%XMM1 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %ECX,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x7106(%RIP),%XMM1 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETB %DIL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %EDI,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x70ef(%RIP),%XMM2 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
TEST %AL,%R11B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VCOMISD 0x70e3(%RIP),%XMM2 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
JAE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
LEA -0x4f0(%RBP),%RDI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA 0x6297(%RIP),%RDX | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV $0x1f4,%ESI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R14D,%ECX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVSD %XMM12,%XMM12,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOV $0x6,%EAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %RDI,-0x600(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
CALL 5450 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV -0x600(%RBP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CALL 5150 | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOV 0xa8cf(%RIP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV -0x600(%RBP),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV %RAX,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV (%R10),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CALL 5310 <_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l@plt> | 2 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 1 | 0.33 | 0 | 1 |
MOVSXD -0x6b0(%RBP),%R8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x5e8(%R15),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x5c0(%R15),%RCX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
MOV 0x570(%R15),%RAX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVAPD -0x700(%RBP),%XMM13 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD -0x6f0(%RBP),%XMM14 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R8,%R8,2),%R11 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
MOV %R8,%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
MOV 0x598(%R15),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%RDX,%R11,8),%RSI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
INC %R9D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VMOVUPD %XMM13,(%RSI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMOVSD %XMM14,0x10(%RSI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R14D,(%RCX,%R8,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R8D,(%RAX,%R8,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
MOV %R13D,(%RDI,%R8,4) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
INC %R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
MOV %R9D,-0x6b0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
CMP %R13,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 17504 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | qmcplusplus::build_ions(qmcplusplus::ParticleSet&, qmcplusplus::Tensor |
Source file and lines | ParticleIOUtility.h:70-91 |
Module | libqmcparticle.so |
nb instructions | 68 |
nb uops | 78 |
loop length | 419 |
used x86 registers | 14 |
used mmx registers | 0 |
used xmm registers | 16 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 14 |
ADD-SUB / MUL ratio | 1.60 |
micro-operation queue | 19.75 cycles |
front end | 19.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 19.50 | 19.50 | 16.00 | 16.00 | 2.00 | 11.00 | 11.00 | 2.00 |
cycles | 19.50 | 19.50 | 16.00 | 16.00 | 2.00 | 11.00 | 11.00 | 2.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 3.00 |
FE+BE cycles | 26.41 |
Stall cycles | 6.21 |
RS full (events) | 0.77 |
LB full (events) | 8.48 |
Front-end | 19.75 |
Dispatch | 19.50 |
Data deps. | 3.00 |
Overall L1 | 19.75 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 22% |
load | 20% |
store | 50% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 11% |
all | 22% |
load | 20% |
store | 50% |
mul | 20% |
add-sub | 37% |
fma | 20% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 10% |
all | 9% |
load | 6% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 9% |
all | 15% |
load | 15% |
store | 18% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
all | 15% |
load | 14% |
store | 18% |
mul | 15% |
add-sub | 17% |
fma | 15% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 13% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
INC %R13 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
CMP %R13,%RBX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 17758 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x7a0(%RBP),%R10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
CMP %R14D,(%R10,%R13,4) | 1 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0.25 | 0.25 | 0 | 1 | 0.50 |
JNE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
MOV -0x5b8(%RBP),%RDX | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
LEA (%R13,%R13,2),%R8 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
LEA (%RDX,%R8,8),%R9 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 |
VMOVSD 0x8(%R9),%XMM10 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVUPD (%R9),%XMM8 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD (%R9),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD 0x10(%R9),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VRNDSCALESD $0x9,%XMM10,%XMM10,%XMM13 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VRNDSCALEPD $0x9,%XMM8,%XMM6 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM13,%XMM10,%XMM14 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM7,%XMM7,%XMM11 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VSUBSD %XMM11,%XMM7,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0x9,%XMM1,%XMM1,%XMM0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VMOVDDUP %XMM14,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULPD -0x720(%RBP),%XMM3,%XMM15 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM12,%XMM9 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VSUBPD %XMM6,%XMM8,%XMM5 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VSUBSD %XMM0,%XMM1,%XMM8 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVAPD -0x730(%RBP),%XMM6 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VMOVSD -0x610(%RBP),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 |
VFMADD132PD -0x740(%RBP),%XMM15,%XMM9 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVDDUP %XMM8,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM5,%XMM5,%XMM7 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132PD -0x7c0(%RBP),%XMM6,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD -0x628(%RBP),%XMM1,%XMM8 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKHPD %XMM5,%XMM5,%XMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMULSD -0x620(%RBP),%XMM5,%XMM10 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD -0x770(%RBP),%XMM9,%XMM4 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD -0x758(%RBP),%XMM8,%XMM11 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD -0x618(%RBP),%XMM10,%XMM7 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDPD %XMM2,%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VUNPCKHPD %XMM6,%XMM6,%XMM4 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVAPD %XMM6,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 |
VMOVAPD %XMM6,-0x700(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VMULSD 0xc8(%R15),%XMM4,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM7,%XMM11,%XMM12 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xd0(%R15),%XMM4,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0xd8(%R15),%XMM4,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM12,-0x6f0(%RBP) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 |
VFMADD231SD 0xb0(%R15),%XMM6,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xb8(%R15),%XMM6,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xc0(%R15),%XMM6,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xe0(%R15),%XMM12,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xe8(%R15),%XMM12,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD 0xf0(%R15),%XMM12,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VCOMISD 0x7127(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %R11B | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
VCOMISD 0x7123(%RIP),%XMM0 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETB %SIL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %ESI,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x710c(%RIP),%XMM1 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %CL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %ECX,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x7106(%RIP),%XMM1 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETB %DIL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
AND %EDI,%R11D | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VCOMISD 0x70ef(%RIP),%XMM2 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
SETAE %AL | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 |
TEST %AL,%R11B | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
VCOMISD 0x70e3(%RIP),%XMM2 | 2 | 1 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 3 | 1 |
JAE 174f8 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |