Loop Id: 340 | Module: libqmcwfs.so | Source: BsplineFunctor.h:246-260 | Coverage: 1.1% |
---|
Loop Id: 340 | Module: libqmcwfs.so | Source: BsplineFunctor.h:246-260 | Coverage: 1.1% |
---|
0x36178 VMULSD (%R11),%XMM19,%XMM29 [3] |
0x3617e ADD $0x10,%R11 |
0x36182 VRNDSCALESD $0xb,%XMM29,%XMM29,%XMM1 |
0x36189 VCVTTSD2SI %XMM29,%ESI |
0x3618f VSUBSD %XMM1,%XMM29,%XMM1 |
0x36195 VMULSD %XMM1,%XMM1,%XMM30 |
0x3619b VMOVSD %XMM1,%XMM1,%XMM25 |
0x361a1 VMOVSD %XMM1,%XMM1,%XMM21 |
0x361a7 VFMADD132SD %XMM8,%XMM7,%XMM25 |
0x361ad VFMADD132SD %XMM16,%XMM15,%XMM21 |
0x361b3 VMOVSD %XMM1,%XMM1,%XMM22 |
0x361b9 MOVSXD %ESI,%RCX |
0x361bc VFMADD132SD %XMM12,%XMM11,%XMM22 |
0x361c2 VMULSD %XMM30,%XMM1,%XMM2 |
0x361c8 VFMADD132SD %XMM4,%XMM3,%XMM1 |
0x361cd VMULSD %XMM18,%XMM30,%XMM31 |
0x361d3 VMULSD %XMM14,%XMM30,%XMM20 |
0x361d9 VMULSD %XMM10,%XMM30,%XMM27 |
0x361df VMULSD %XMM6,%XMM30,%XMM30 |
0x361e5 VFMADD231SD %XMM17,%XMM2,%XMM31 |
0x361eb VFMADD231SD %XMM13,%XMM2,%XMM20 |
0x361f1 VFMADD231SD %XMM9,%XMM2,%XMM27 |
0x361f7 VFMADD132SD %XMM5,%XMM30,%XMM2 |
0x361fd VADDSD %XMM31,%XMM21,%XMM24 |
0x36203 VADDSD %XMM22,%XMM20,%XMM26 |
0x36209 VADDSD %XMM25,%XMM27,%XMM28 |
0x3620f VADDSD %XMM1,%XMM2,%XMM1 |
0x36213 VMULSD 0x10(%RBX,%RCX,8),%XMM28,%XMM29 [1] |
0x3621b VMULSD 0x18(%RBX,%RCX,8),%XMM1,%XMM2 [1] |
0x36221 VFMADD132SD 0x8(%RBX,%RCX,8),%XMM29,%XMM26 [1] |
0x36229 VFMADD231SD (%RBX,%RCX,8),%XMM24,%XMM2 [1] |
0x36230 VADDSD %XMM2,%XMM26,%XMM1 |
0x36236 VADDSD %XMM1,%XMM0,%XMM2 |
0x3623a VMULSD -0x8(%R11),%XMM19,%XMM0 [2] |
0x36241 VRNDSCALESD $0xb,%XMM0,%XMM0,%XMM31 |
0x36248 VCVTTSD2SI %XMM0,%EDX |
0x3624c VSUBSD %XMM31,%XMM0,%XMM21 |
0x36252 VMULSD %XMM21,%XMM21,%XMM22 |
0x36258 VMOVSD %XMM21,%XMM21,%XMM24 |
0x3625e VMOVSD %XMM21,%XMM21,%XMM25 |
0x36264 VFMADD132SD %XMM16,%XMM15,%XMM24 |
0x3626a VFMADD132SD %XMM8,%XMM7,%XMM25 |
0x36270 VMOVSD %XMM21,%XMM21,%XMM20 |
0x36276 MOVSXD %EDX,%RDI |
0x36279 VFMADD132SD %XMM12,%XMM11,%XMM20 |
0x3627f VMULSD %XMM22,%XMM21,%XMM1 |
0x36285 VFMADD132SD %XMM4,%XMM3,%XMM21 |
0x3628b VMULSD %XMM18,%XMM22,%XMM0 |
0x36291 VMULSD %XMM10,%XMM22,%XMM27 |
0x36297 VMULSD %XMM6,%XMM22,%XMM30 |
0x3629d VFMADD231SD %XMM17,%XMM1,%XMM0 |
0x362a3 VFMADD231SD %XMM9,%XMM1,%XMM27 |
0x362a9 VADDSD %XMM0,%XMM24,%XMM26 |
0x362af VMULSD %XMM14,%XMM22,%XMM0 |
0x362b5 VADDSD %XMM25,%XMM27,%XMM28 |
0x362bb VMULSD 0x10(%RBX,%RDI,8),%XMM28,%XMM29 [4] |
0x362c3 VFMADD231SD %XMM13,%XMM1,%XMM0 |
0x362c8 VFMADD132SD %XMM5,%XMM30,%XMM1 |
0x362ce VADDSD %XMM21,%XMM1,%XMM1 |
0x362d4 VADDSD %XMM20,%XMM0,%XMM0 |
0x362da VMULSD 0x18(%RBX,%RDI,8),%XMM1,%XMM1 [4] |
0x362e0 VFMADD132SD 0x8(%RBX,%RDI,8),%XMM29,%XMM0 [4] |
0x362e8 VFMADD132SD (%RBX,%RDI,8),%XMM1,%XMM26 [4] |
0x362ef VADDSD %XMM26,%XMM0,%XMM0 |
0x362f5 VADDSD %XMM0,%XMM2,%XMM0 |
0x362f9 CMP %RAX,%R11 |
0x362fc JNE 36178 |
/home/kcamus/qaas_runs/169-390-4082/intel/miniqmc/build/miniqmc/src/QMCWaveFunctions/Jastrow/BsplineFunctor.h: 246 - 260 |
-------------------------------------------------------------------------------- |
246: for (int jat = 0; jat < iCount; jat++) |
247: { |
248: real_type r = distArrayCompressed[jat]; |
249: r *= DeltaRInv; |
250: int i = (int)r; |
251: real_type t = r - real_type(i); |
252: real_type tp0 = t * t * t; |
253: real_type tp1 = t * t; |
254: real_type tp2 = t; |
255: |
256: real_type d1 = SplineCoefs[i + 0] * (A[0] * tp0 + A[1] * tp1 + A[2] * tp2 + A[3]); |
257: real_type d2 = SplineCoefs[i + 1] * (A[4] * tp0 + A[5] * tp1 + A[6] * tp2 + A[7]); |
258: real_type d3 = SplineCoefs[i + 2] * (A[8] * tp0 + A[9] * tp1 + A[10] * tp2 + A[11]); |
259: real_type d4 = SplineCoefs[i + 3] * (A[12] * tp0 + A[13] * tp1 + A[14] * tp2 + A[15]); |
260: d += (d1 + d2 + d3 + d4); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►66.67+ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:219 | libqmcwfs.so |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
►33.33+ | qmcplusplus::WaveFunction::rat[...] | WaveFunction.cpp:219 | libqmcwfs.so |
○ | main._omp_fn.1 | stl_vector.h:1123 | exec |
○ | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 4.14 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.54 |
Bottlenecks | P0, P1, |
Function | miniqmcreference::TwoBodyJastrowRef |
Source | BsplineFunctor.h:246-260 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 30.00 |
CQA cycles if no scalar integer | 30.00 |
CQA cycles if FP arith vectorized | 7.25 |
CQA cycles if fully vectorized | 3.75 |
Front-end cycles | 19.50 |
DIV/SQRT cycles | 30.00 |
P0 cycles | 30.00 |
P1 cycles | 5.00 |
P2 cycles | 5.00 |
P3 cycles | 0.00 |
P4 cycles | 6.00 |
P5 cycles | 4.00 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 8 |
FE+BE cycles (UFS) | 35.33 |
Stall cycles (UFS) | 15.44 |
Nb insns | 67.00 |
Nb uops | 70.00 |
Nb loads | 10.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 2.40 |
Nb FLOP add-sub | 14.00 |
Nb FLOP mul | 18.00 |
Nb FLOP fma | 20.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 2.67 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 0.00 |
Stride 0 | 0.00 |
Stride 1 | 1.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 2.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 4.14 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.54 |
Bottlenecks | P0, P1, |
Function | miniqmcreference::TwoBodyJastrowRef |
Source | BsplineFunctor.h:246-260 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 30.00 |
CQA cycles if no scalar integer | 30.00 |
CQA cycles if FP arith vectorized | 7.25 |
CQA cycles if fully vectorized | 3.75 |
Front-end cycles | 19.50 |
DIV/SQRT cycles | 30.00 |
P0 cycles | 30.00 |
P1 cycles | 5.00 |
P2 cycles | 5.00 |
P3 cycles | 0.00 |
P4 cycles | 6.00 |
P5 cycles | 4.00 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
Inter-iter dependencies cycles | 8 |
FE+BE cycles (UFS) | 35.33 |
Stall cycles (UFS) | 15.44 |
Nb insns | 67.00 |
Nb uops | 70.00 |
Nb loads | 10.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 2.40 |
Nb FLOP add-sub | 14.00 |
Nb FLOP mul | 18.00 |
Nb FLOP fma | 20.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 2.67 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 0.00 |
Stride 0 | 0.00 |
Stride 1 | 1.00 |
Stride n | 0.00 |
Stride unknown | 0.00 |
Stride indirect | 2.00 |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | 0.00 |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | 12.50 |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | 12.50 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 12.50 |
Path / |
Function | miniqmcreference::TwoBodyJastrowRef |
Source file and lines | BsplineFunctor.h:246-260 |
Module | libqmcwfs.so |
nb instructions | 67 |
nb uops | 70 |
loop length | 394 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 31 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 0.78 |
micro-operation queue | 19.50 cycles |
front end | 19.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 30.00 | 30.00 | 5.00 | 5.00 | 0.00 | 6.00 | 4.00 | 0.00 |
cycles | 30.00 | 30.00 | 5.00 | 5.00 | 0.00 | 6.00 | 4.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 8.00 |
FE+BE cycles | 35.33 |
Stall cycles | 15.44 |
RS full (events) | 30.71 |
Front-end | 19.50 |
Dispatch | 30.00 |
Data deps. | 8.00 |
Overall L1 | 30.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMULSD (%R11),%XMM19,%XMM29 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD $0x10,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VRNDSCALESD $0xb,%XMM29,%XMM29,%XMM1 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VCVTTSD2SI %XMM29,%ESI | 2 | 1.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 |
VSUBSD %XMM1,%XMM29,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM30 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM1,%XMM1,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM1,%XMM1,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132SD %XMM8,%XMM7,%XMM25 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM16,%XMM15,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM1,%XMM1,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOVSXD %ESI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM12,%XMM11,%XMM22 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM30,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM3,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM18,%XMM30,%XMM31 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM14,%XMM30,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM10,%XMM30,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM6,%XMM30,%XMM30 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM17,%XMM2,%XMM31 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM2,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM9,%XMM2,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM5,%XMM30,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM31,%XMM21,%XMM24 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM22,%XMM20,%XMM26 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM25,%XMM27,%XMM28 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x10(%RBX,%RCX,8),%XMM28,%XMM29 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x18(%RBX,%RCX,8),%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD 0x8(%RBX,%RCX,8),%XMM29,%XMM26 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RBX,%RCX,8),%XMM24,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM2,%XMM26,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD -0x8(%R11),%XMM19,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0xb,%XMM0,%XMM0,%XMM31 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VCVTTSD2SI %XMM0,%EDX | 2 | 1.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 |
VSUBSD %XMM31,%XMM0,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM21,%XMM21,%XMM22 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM21,%XMM21,%XMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM21,%XMM21,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132SD %XMM16,%XMM15,%XMM24 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM8,%XMM7,%XMM25 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM21,%XMM21,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOVSXD %EDX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM12,%XMM11,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM22,%XMM21,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM3,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM18,%XMM22,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM10,%XMM22,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM6,%XMM22,%XMM30 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM17,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM9,%XMM1,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM0,%XMM24,%XMM26 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM14,%XMM22,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM25,%XMM27,%XMM28 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x10(%RBX,%RDI,8),%XMM28,%XMM29 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM5,%XMM30,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM21,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM20,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x18(%RBX,%RDI,8),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD 0x8(%RBX,%RDI,8),%XMM29,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD (%RBX,%RDI,8),%XMM1,%XMM26 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM26,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMP %RAX,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 36178 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |
Function | miniqmcreference::TwoBodyJastrowRef |
Source file and lines | BsplineFunctor.h:246-260 |
Module | libqmcwfs.so |
nb instructions | 67 |
nb uops | 70 |
loop length | 394 |
used x86 registers | 7 |
used mmx registers | 0 |
used xmm registers | 31 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 0.78 |
micro-operation queue | 19.50 cycles |
front end | 19.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 30.00 | 30.00 | 5.00 | 5.00 | 0.00 | 6.00 | 4.00 | 0.00 |
cycles | 30.00 | 30.00 | 5.00 | 5.00 | 0.00 | 6.00 | 4.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 8.00 |
FE+BE cycles | 35.33 |
Stall cycles | 15.44 |
RS full (events) | 30.71 |
Front-end | 19.50 |
Dispatch | 30.00 |
Data deps. | 8.00 |
Overall L1 | 30.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 0% |
add-sub | 0% |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | 12% |
add-sub | 12% |
fma | 12% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|
VMULSD (%R11),%XMM19,%XMM29 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
ADD $0x10,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VRNDSCALESD $0xb,%XMM29,%XMM29,%XMM1 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VCVTTSD2SI %XMM29,%ESI | 2 | 1.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 |
VSUBSD %XMM1,%XMM29,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM1,%XMM1,%XMM30 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM1,%XMM1,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM1,%XMM1,%XMM21 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132SD %XMM8,%XMM7,%XMM25 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM16,%XMM15,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM1,%XMM1,%XMM22 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOVSXD %ESI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM12,%XMM11,%XMM22 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM30,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM3,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM18,%XMM30,%XMM31 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM14,%XMM30,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM10,%XMM30,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM6,%XMM30,%XMM30 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM17,%XMM2,%XMM31 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM2,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM9,%XMM2,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM5,%XMM30,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM31,%XMM21,%XMM24 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM22,%XMM20,%XMM26 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM25,%XMM27,%XMM28 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM1,%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x10(%RBX,%RCX,8),%XMM28,%XMM29 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x18(%RBX,%RCX,8),%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD 0x8(%RBX,%RCX,8),%XMM29,%XMM26 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD (%RBX,%RCX,8),%XMM24,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM2,%XMM26,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD -0x8(%R11),%XMM19,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VRNDSCALESD $0xb,%XMM0,%XMM0,%XMM31 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 1 |
VCVTTSD2SI %XMM0,%EDX | 2 | 1.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 1 |
VSUBSD %XMM31,%XMM0,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM21,%XMM21,%XMM22 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM21,%XMM21,%XMM24 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VMOVSD %XMM21,%XMM21,%XMM25 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
VFMADD132SD %XMM16,%XMM15,%XMM24 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM8,%XMM7,%XMM25 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMOVSD %XMM21,%XMM21,%XMM20 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
MOVSXD %EDX,%RDI | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
VFMADD132SD %XMM12,%XMM11,%XMM20 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM22,%XMM21,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM4,%XMM3,%XMM21 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM18,%XMM22,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM10,%XMM22,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM6,%XMM22,%XMM30 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM17,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM9,%XMM1,%XMM27 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM0,%XMM24,%XMM26 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD %XMM14,%XMM22,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM25,%XMM27,%XMM28 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x10(%RBX,%RDI,8),%XMM28,%XMM29 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD231SD %XMM13,%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD %XMM5,%XMM30,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM21,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM20,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VMULSD 0x18(%RBX,%RDI,8),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD 0x8(%RBX,%RDI,8),%XMM29,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VFMADD132SD (%RBX,%RDI,8),%XMM1,%XMM26 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM26,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
VADDSD %XMM0,%XMM2,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
CMP %RAX,%R11 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 |
JNE 36178 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 |