Loop Id: 51 | Module: exec | Source: initAtoms.c:221-228 | Coverage: 0.01% |
---|
Loop Id: 51 | Module: exec | Source: initAtoms.c:221-228 | Coverage: 0.01% |
---|
0x408dd8 MOVSXD (%R11,%R8,4),%R9 |
0x408ddc TEST %R9D,%R9D |
0x408ddf JLE 408f81 |
0x408de5 MOV 0x20(%R12),%RCX |
0x408dea MOVSXD %EDI,%R14 |
0x408ded MOV 0x28(%R12),%RSI |
0x408df2 MOV 0x10(%RCX),%R13 |
0x408df6 MOV 0x20(%RCX),%RDX |
0x408dfa LEA (%R13,%R14,4),%RCX |
0x408dff MOV %R8,%R14 |
0x408e02 ADD %R10,%RDX |
0x408e05 SAL $0x6,%R14 |
0x408e09 ADD %R14,%R9 |
0x408e0c LEA (%R13,%R9,4),%R9 |
0x408e11 MOV %R9,%R13 |
0x408e14 SUB %RCX,%R13 |
0x408e17 SUB $0x4,%R13 |
0x408e1b SHR $0x2,%R13 |
0x408e1f INC %R13 |
0x408e22 AND $0x3,%R13D |
0x408e26 JE 408ed0 |
0x408e2c CMP $0x1,%R13 |
0x408e30 JE 408e97 |
0x408e32 CMP $0x2,%R13 |
0x408e36 JE 408e67 |
0x408e38 MOVSXD (%RCX),%R14 |
0x408e3b VMOVSD 0x8(%RDX),%XMM2 |
0x408e40 ADD $0x4,%RCX |
0x408e44 ADD $0x18,%RDX |
0x408e48 SAL $0x4,%R14 |
0x408e4c VMOVHPD -0x18(%RDX),%XMM2,%XMM3 |
0x408e51 VMOVSD 0x8(%RSI,%R14,1),%XMM1 |
0x408e58 VMOVHPD -0x8(%RDX),%XMM1,%XMM4 |
0x408e5d VINSERTF128 $0x1,%XMM3,%YMM4,%YMM5 |
0x408e63 VADDPD %YMM5,%YMM0,%YMM0 |
0x408e67 MOVSXD (%RCX),%R13 |
0x408e6a VMOVSD 0x8(%RDX),%XMM6 |
0x408e6f ADD $0x4,%RCX |
0x408e73 ADD $0x18,%RDX |
0x408e77 SAL $0x4,%R13 |
0x408e7b VMOVHPD -0x18(%RDX),%XMM6,%XMM7 |
0x408e80 VMOVSD 0x8(%RSI,%R13,1),%XMM8 |
0x408e87 VMOVHPD -0x8(%RDX),%XMM8,%XMM9 |
0x408e8c VINSERTF128 $0x1,%XMM7,%YMM9,%YMM10 |
0x408e92 VADDPD %YMM10,%YMM0,%YMM0 |
0x408e97 MOVSXD (%RCX),%R14 |
0x408e9a VMOVSD 0x8(%RDX),%XMM11 |
0x408e9f ADD $0x4,%RCX |
0x408ea3 ADD $0x18,%RDX |
0x408ea7 SAL $0x4,%R14 |
0x408eab VMOVHPD -0x18(%RDX),%XMM11,%XMM12 |
0x408eb0 VMOVSD 0x8(%RSI,%R14,1),%XMM13 |
0x408eb7 VMOVHPD -0x8(%RDX),%XMM13,%XMM14 |
0x408ebc VINSERTF128 $0x1,%XMM12,%YMM14,%YMM15 |
0x408ec2 VADDPD %YMM15,%YMM0,%YMM0 |
0x408ec7 CMP %R9,%RCX |
0x408eca JE 408f81 |
(52) 0x408ed0 MOVSXD (%RCX),%R13 |
(52) 0x408ed3 VMOVSD 0x8(%RDX),%XMM2 |
(52) 0x408ed8 ADD $0x10,%RCX |
(52) 0x408edc ADD $0x60,%RDX |
(52) 0x408ee0 MOVSXD -0xc(%RCX),%R14 |
(52) 0x408ee4 VMOVSD -0x40(%RDX),%XMM7 |
(52) 0x408ee9 SAL $0x4,%R13 |
(52) 0x408eed VMOVHPD -0x60(%RDX),%XMM2,%XMM3 |
(52) 0x408ef2 VMOVSD -0x28(%RDX),%XMM13 |
(52) 0x408ef7 VMOVSD 0x8(%RSI,%R13,1),%XMM1 |
(52) 0x408efe SAL $0x4,%R14 |
(52) 0x408f02 MOVSXD -0x8(%RCX),%R13 |
(52) 0x408f06 VMOVHPD -0x48(%RDX),%XMM7,%XMM8 |
(52) 0x408f0b VMOVSD 0x8(%RSI,%R14,1),%XMM9 |
(52) 0x408f12 MOVSXD -0x4(%RCX),%R14 |
(52) 0x408f16 VMOVHPD -0x30(%RDX),%XMM13,%XMM14 |
(52) 0x408f1b VMOVHPD -0x50(%RDX),%XMM1,%XMM4 |
(52) 0x408f20 SAL $0x4,%R13 |
(52) 0x408f24 VMOVSD -0x10(%RDX),%XMM1 |
(52) 0x408f29 VINSERTF128 $0x1,%XMM3,%YMM4,%YMM5 |
(52) 0x408f2f VMOVHPD -0x38(%RDX),%XMM9,%XMM10 |
(52) 0x408f34 VMOVSD 0x8(%RSI,%R13,1),%XMM15 |
(52) 0x408f3b SAL $0x4,%R14 |
(52) 0x408f3f VADDPD %YMM5,%YMM0,%YMM6 |
(52) 0x408f43 VINSERTF128 $0x1,%XMM8,%YMM10,%YMM11 |
(52) 0x408f49 VMOVSD 0x8(%RSI,%R14,1),%XMM5 |
(52) 0x408f50 VMOVHPD -0x18(%RDX),%XMM1,%XMM4 |
(52) 0x408f55 VMOVHPD -0x20(%RDX),%XMM15,%XMM0 |
(52) 0x408f5a VINSERTF128 $0x1,%XMM14,%YMM0,%YMM2 |
(52) 0x408f60 VADDPD %YMM11,%YMM6,%YMM12 |
(52) 0x408f65 VMOVHPD -0x8(%RDX),%XMM5,%XMM6 |
(52) 0x408f6a VINSERTF128 $0x1,%XMM4,%YMM6,%YMM7 |
(52) 0x408f70 VADDPD %YMM2,%YMM12,%YMM3 |
(52) 0x408f74 VADDPD %YMM7,%YMM3,%YMM0 |
(52) 0x408f78 CMP %R9,%RCX |
(52) 0x408f7b JNE 408ed0 |
0x408f81 INC %R8 |
0x408f84 ADD $0x40,%EDI |
0x408f87 ADD $0x600,%R10 |
0x408f8e CMP %R8D,%EAX |
0x408f91 JG 408dd8 |
/scratch_na/users/xoserete/qaas_runs/171-322-9862/intel/CoMD/build/CoMD/CoMD/src-openmp/initAtoms.c: 221 - 228 |
-------------------------------------------------------------------------------- |
221: for (int iOff=MAXATOMS*iBox, ii=0; ii<s->boxes->nAtoms[iBox]; ++ii, ++iOff) |
222: { |
223: v0 += s->atoms->p[iOff][0]; |
224: v1 += s->atoms->p[iOff][1]; |
225: v2 += s->atoms->p[iOff][2]; |
226: |
227: int iSpecies = s->atoms->iSpecies[iOff]; |
228: v3 += s->species[iSpecies].mass; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.13 |
CQA speedup if FP arith vectorized | 1.13 |
CQA speedup if fully vectorized | 6.78 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.13 |
Bottlenecks | micro-operation queue |
Function | computeVcm._omp_fn.0 |
Source | initAtoms.c:221-223,initAtoms.c:227-228 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 10.17 |
CQA cycles if no scalar integer | 9.00 |
CQA cycles if FP arith vectorized | 9.00 |
CQA cycles if fully vectorized | 1.50 |
Front-end cycles | 10.17 |
DIV/SQRT cycles | 6.30 |
P0 cycles | 6.20 |
P1 cycles | 6.67 |
P2 cycles | 6.67 |
P3 cycles | 0.00 |
P4 cycles | 9.00 |
P5 cycles | 6.30 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 6.20 |
P10 cycles | 6.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 10.69 - 10.93 |
Stall cycles (UFS) | 0.23 - 0.38 |
Nb insns | 62.00 |
Nb uops | 61.00 |
Nb loads | 20.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 1.18 |
Nb FLOP add-sub | 12.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 14.16 |
Bytes prefetched | 0.00 |
Bytes loaded | 144.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 25.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 60.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 42.86 |
Vector-efficiency ratio all | 18.23 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 33.75 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 16.96 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.13 |
CQA speedup if FP arith vectorized | 1.13 |
CQA speedup if fully vectorized | 6.78 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.13 |
Bottlenecks | micro-operation queue |
Function | computeVcm._omp_fn.0 |
Source | initAtoms.c:221-223,initAtoms.c:227-228 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 10.17 |
CQA cycles if no scalar integer | 9.00 |
CQA cycles if FP arith vectorized | 9.00 |
CQA cycles if fully vectorized | 1.50 |
Front-end cycles | 10.17 |
DIV/SQRT cycles | 6.30 |
P0 cycles | 6.20 |
P1 cycles | 6.67 |
P2 cycles | 6.67 |
P3 cycles | 0.00 |
P4 cycles | 9.00 |
P5 cycles | 6.30 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 6.20 |
P10 cycles | 6.67 |
P11 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 10.69 - 10.93 |
Stall cycles (UFS) | 0.23 - 0.38 |
Nb insns | 62.00 |
Nb uops | 61.00 |
Nb loads | 20.00 |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 1.18 |
Nb FLOP add-sub | 12.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 14.16 |
Bytes prefetched | 0.00 |
Bytes loaded | 144.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 25.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 60.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 42.86 |
Vector-efficiency ratio all | 18.23 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 33.75 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 16.96 |
Path / |
Function | computeVcm._omp_fn.0 |
Source file and lines | initAtoms.c:221-228 |
Module | exec |
nb instructions | 62 |
nb uops | 61 |
loop length | 270 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 12 |
used ymm registers | 7 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 6.30 | 6.20 | 6.67 | 6.67 | 0.00 | 9.00 | 6.30 | 0.00 | 0.00 | 0.00 | 6.20 | 6.67 |
cycles | 6.30 | 6.20 | 6.67 | 6.67 | 0.00 | 9.00 | 6.30 | 0.00 | 0.00 | 0.00 | 6.20 | 6.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 10.69-10.93 |
Stall cycles | 0.23-0.38 |
LM full (events) | 0.31-0.53 |
Front-end | 10.17 |
Dispatch | 9.00 |
Overall L1 | 10.17 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 33% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 25% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 60% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 42% |
all | 10% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 9% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 20% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 18% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 16% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOVSXD (%R11,%R8,4),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %R9D,%R9D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 408f81 <computeVcm._omp_fn.0+0x211> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x20(%R12),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EDI,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV 0x28(%R12),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x10(%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%RCX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R13,%R14,4),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD %R10,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x6,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R14,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA (%R13,%R9,4),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R9,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RCX,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB $0x4,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x2,%R13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
INC %R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x3,%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 408ed0 <computeVcm._omp_fn.0+0x160> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x1,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 408e97 <computeVcm._omp_fn.0+0x127> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x2,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 408e67 <computeVcm._omp_fn.0+0xf7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOVSXD (%RCX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x8(%RDX),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x4,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x18,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x4,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVHPD -0x18(%RDX),%XMM2,%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD 0x8(%RSI,%R14,1),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD -0x8(%RDX),%XMM1,%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM3,%YMM4,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %YMM5,%YMM0,%YMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
MOVSXD (%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x8(%RDX),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x4,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x18,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x4,%R13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVHPD -0x18(%RDX),%XMM6,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD 0x8(%RSI,%R13,1),%XMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD -0x8(%RDX),%XMM8,%XMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM7,%YMM9,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %YMM10,%YMM0,%YMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
MOVSXD (%RCX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x8(%RDX),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x4,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x18,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x4,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVHPD -0x18(%RDX),%XMM11,%XMM12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD 0x8(%RSI,%R14,1),%XMM13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD -0x8(%RDX),%XMM13,%XMM14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM12,%YMM14,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %YMM15,%YMM0,%YMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
CMP %R9,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 408f81 <computeVcm._omp_fn.0+0x211> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
INC %R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x40,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x600,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R8D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JG 408dd8 <computeVcm._omp_fn.0+0x68> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
Function | computeVcm._omp_fn.0 |
Source file and lines | initAtoms.c:221-228 |
Module | exec |
nb instructions | 62 |
nb uops | 61 |
loop length | 270 |
used x86 registers | 12 |
used mmx registers | 0 |
used xmm registers | 12 |
used ymm registers | 7 |
used zmm registers | 0 |
nb stack references | 0 |
micro-operation queue | 10.17 cycles |
front end | 10.17 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 6.30 | 6.20 | 6.67 | 6.67 | 0.00 | 9.00 | 6.30 | 0.00 | 0.00 | 0.00 | 6.20 | 6.67 |
cycles | 6.30 | 6.20 | 6.67 | 6.67 | 0.00 | 9.00 | 6.30 | 0.00 | 0.00 | 0.00 | 6.20 | 6.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 10.69-10.93 |
Stall cycles | 0.23-0.38 |
LM full (events) | 0.31-0.53 |
Front-end | 10.17 |
Dispatch | 9.00 |
Overall L1 | 10.17 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 33% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 100% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 100% |
all | 25% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 60% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 42% |
all | 10% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 9% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 10% |
all | 20% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 50% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 25% |
all | 18% |
load | 12% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 33% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 16% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOVSXD (%R11,%R8,4),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %R9D,%R9D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 408f81 <computeVcm._omp_fn.0+0x211> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV 0x20(%R12),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOVSXD %EDI,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
MOV 0x28(%R12),%RSI | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x10(%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%RCX),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA (%R13,%R14,4),%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD %R10,%RDX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x6,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
ADD %R14,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA (%R13,%R9,4),%R9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R9,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SUB %RCX,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SUB $0x4,%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SHR $0x2,%R13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
INC %R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $0x3,%R13D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 408ed0 <computeVcm._omp_fn.0+0x160> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x1,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 408e97 <computeVcm._omp_fn.0+0x127> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
CMP $0x2,%R13 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 408e67 <computeVcm._omp_fn.0+0xf7> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOVSXD (%RCX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x8(%RDX),%XMM2 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x4,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x18,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x4,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVHPD -0x18(%RDX),%XMM2,%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD 0x8(%RSI,%R14,1),%XMM1 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD -0x8(%RDX),%XMM1,%XMM4 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM3,%YMM4,%YMM5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %YMM5,%YMM0,%YMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
MOVSXD (%RCX),%R13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x8(%RDX),%XMM6 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x4,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x18,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x4,%R13 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVHPD -0x18(%RDX),%XMM6,%XMM7 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD 0x8(%RSI,%R13,1),%XMM8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD -0x8(%RDX),%XMM8,%XMM9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM7,%YMM9,%YMM10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %YMM10,%YMM0,%YMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
MOVSXD (%RCX),%R14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVSD 0x8(%RDX),%XMM11 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
ADD $0x4,%RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x18,%RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x4,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
VMOVHPD -0x18(%RDX),%XMM11,%XMM12 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VMOVSD 0x8(%RSI,%R14,1),%XMM13 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
VMOVHPD -0x8(%RDX),%XMM13,%XMM14 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0.33 | 4-12 | 1 |
VINSERTF128 $0x1,%XMM12,%YMM14,%YMM15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
VADDPD %YMM15,%YMM0,%YMM0 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
CMP %R9,%RCX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JE 408f81 <computeVcm._omp_fn.0+0x211> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
INC %R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
ADD $0x40,%EDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x600,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %R8D,%EAX | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JG 408dd8 <computeVcm._omp_fn.0+0x68> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |