Loop Id: 102 | Module: exec | Source: timestep.c:71-78 | Coverage: 0.03% |
---|
Loop Id: 102 | Module: exec | Source: timestep.c:71-78 | Coverage: 0.03% |
---|
0x410f60 ADD $0x40,%ESI |
0x410f63 CMP %RCX,%RDI |
0x410f66 LEA 0x1(%RDI),%RDI |
0x410f6a JE 410f0b |
0x410f6c LEA (%RDI,%RAX,1),%R8 |
0x410f70 MOV (%RDX,%R8,4),%R8D |
0x410f74 TEST %R8D,%R8D |
0x410f77 JLE 410f60 |
0x410f79 MOV %ESI,%R11D |
0x410f7c SAL $0x3,%R11 |
0x410f80 LEA (%RDI,%RAX,1),%R15D |
0x410f84 SAL $0x6,%R15D |
0x410f88 MOV 0x20(%RBX),%R10 |
0x410f8c MOV 0x20(%R10),%R9 |
0x410f90 MOV 0x28(%R10),%R10 |
0x410f94 LEA -0x1(%R8),%R14D |
0x410f98 MOVSXD %R14D,%R14 |
0x410f9b ADD %R15,%R14 |
0x410f9e SAL $0x3,%R14 |
0x410fa2 LEA (%R14,%R14,2),%R14 |
0x410fa6 LEA (%R10,%R14,1),%R12 |
0x410faa ADD $0x10,%R12 |
0x410fae SAL $0x3,%R15 |
0x410fb2 LEA (%R15,%R15,2),%R15 |
0x410fb6 LEA (%R9,%R15,1),%R13 |
0x410fba CMP %R13,%R12 |
0x410fbd JB 411030 |
0x410fbf ADD %R10,%R15 |
0x410fc2 ADD %R9,%R14 |
0x410fc5 ADD $0x10,%R14 |
0x410fc9 CMP %R15,%R14 |
0x410fcc JB 411030 |
0x410fce LEA (%R11,%R11,2),%R11 |
0x410fd2 ADD $0x10,%R11 |
0x410fd6 NOPW %CS:(%RAX,%RAX,1) |
(105) 0x410fe0 VMOVSD -0x10(%R10,%R11,1),%XMM1 |
(105) 0x410fe7 VFMADD213SD -0x10(%R9,%R11,1),%XMM0,%XMM1 |
(105) 0x410fee VMOVSD %XMM1,-0x10(%R9,%R11,1) |
(105) 0x410ff5 VMOVSD -0x8(%R10,%R11,1),%XMM1 |
(105) 0x410ffc VFMADD213SD -0x8(%R9,%R11,1),%XMM0,%XMM1 |
(105) 0x411003 VMOVSD %XMM1,-0x8(%R9,%R11,1) |
(105) 0x41100a VMOVSD (%R10,%R11,1),%XMM1 |
(105) 0x411010 VFMADD213SD (%R9,%R11,1),%XMM0,%XMM1 |
(105) 0x411016 VMOVSD %XMM1,(%R9,%R11,1) |
(105) 0x41101c ADD $0x18,%R11 |
(105) 0x411020 DEC %R8D |
(105) 0x411023 JNE 410fe0 |
0x411025 JMP 410f60 |
0x411030 LEA (%R11,%R11,2),%R14 |
0x411034 MOV %R8D,%R15D |
0x411037 AND $-0x8,%R15D |
0x41103b JE 411285 |
0x411041 LEA -0x1(%R15),%R12D |
0x411045 XOR %R13D,%R13D |
0x411048 MOV %R14,%R11 |
0x41104b NOPL (%RAX,%RAX,1) |
(104) 0x411050 VMOVUPD 0x20(%R10,%R11,1),%YMM3 |
(104) 0x411057 VMOVUPD 0x80(%R10,%R11,1),%YMM5 |
(104) 0x411061 VMOVUPD 0x10(%R10,%R11,1),%XMM6 |
(104) 0x411068 VMOVUPD 0x70(%R10,%R11,1),%XMM4 |
(104) 0x41106f VBLENDPD $0x3,0x60(%R10,%R11,1),%YMM5,%YMM7 |
(104) 0x411077 VINSERTF128 $0x1,0xa0(%R10,%R11,1),%YMM4,%YMM8 |
(104) 0x411082 VMOVUPD 0x20(%R10,%R11,1),%XMM12 |
(104) 0x411089 VBLENDPD $0xa,%YMM8,%YMM7,%YMM4 |
(104) 0x41108f VBLENDPD $0x3,(%R10,%R11,1),%YMM3,%YMM10 |
(104) 0x411096 VINSERTF128 $0x1,0x40(%R10,%R11,1),%YMM6,%YMM6 |
(104) 0x41109e VBLENDPD $0xa,%YMM6,%YMM10,%YMM9 |
(104) 0x4110a4 VSHUFPD $0x5,%YMM5,%YMM7,%YMM5 |
(104) 0x4110a9 VBROADCASTSD 0xb0(%R10,%R11,1),%YMM7 |
(104) 0x4110b3 VBLENDPD $0x8,%YMM7,%YMM5,%YMM5 |
(104) 0x4110b9 VSHUFPD $0x5,%YMM3,%YMM10,%YMM3 |
(104) 0x4110be VBROADCASTSD 0x50(%R10,%R11,1),%YMM7 |
(104) 0x4110c5 VBLENDPD $0x8,%YMM7,%YMM3,%YMM11 |
(104) 0x4110cb VMOVUPD 0x80(%R10,%R11,1),%XMM3 |
(104) 0x4110d5 VBLENDPD $0xc,0xa0(%R10,%R11,1),%YMM3,%YMM3 |
(104) 0x4110e0 VBLENDPD $0xa,%YMM3,%YMM8,%YMM10 |
(104) 0x4110e6 VBLENDPD $0xc,0x40(%R10,%R11,1),%YMM12,%YMM3 |
(104) 0x4110ee VBLENDPD $0xa,%YMM3,%YMM6,%YMM14 |
(104) 0x4110f4 VMOVUPD 0x20(%R9,%R11,1),%YMM6 |
(104) 0x4110fb VMOVUPD 0x80(%R9,%R11,1),%YMM7 |
(104) 0x411105 VMOVUPD 0x10(%R9,%R11,1),%XMM12 |
(104) 0x41110c VMOVUPD 0x70(%R9,%R11,1),%XMM3 |
(104) 0x411113 VBLENDPD $0x3,0x60(%R9,%R11,1),%YMM7,%YMM8 |
(104) 0x41111b VINSERTF128 $0x1,0xa0(%R9,%R11,1),%YMM3,%YMM13 |
(104) 0x411126 VBLENDPD $0xa,%YMM13,%YMM8,%YMM3 |
(104) 0x41112c VBLENDPD $0x3,(%R9,%R11,1),%YMM6,%YMM15 |
(104) 0x411133 VSHUFPD $0x5,%YMM7,%YMM8,%YMM7 |
(104) 0x411138 VBROADCASTSD 0xb0(%R9,%R11,1),%YMM8 |
(104) 0x411142 VBLENDPD $0x8,%YMM8,%YMM7,%YMM7 |
(104) 0x411148 VSHUFPD $0x5,%YMM6,%YMM15,%YMM6 |
(104) 0x41114d VBROADCASTSD 0x50(%R9,%R11,1),%YMM8 |
(104) 0x411154 VBLENDPD $0x8,%YMM8,%YMM6,%YMM6 |
(104) 0x41115a VMOVUPD 0x80(%R9,%R11,1),%XMM8 |
(104) 0x411164 VBLENDPD $0xc,0xa0(%R9,%R11,1),%YMM8,%YMM8 |
(104) 0x41116f VBLENDPD $0xa,%YMM8,%YMM13,%YMM8 |
(104) 0x411175 VMOVUPD 0x20(%R9,%R11,1),%XMM1 |
(104) 0x41117c VINSERTF128 $0x1,0x40(%R9,%R11,1),%YMM12,%YMM12 |
(104) 0x411184 VBLENDPD $0xa,%YMM12,%YMM15,%YMM13 |
(104) 0x41118a VBLENDPD $0xc,0x40(%R9,%R11,1),%YMM1,%YMM1 |
(104) 0x411192 VBLENDPD $0xa,%YMM1,%YMM12,%YMM12 |
(104) 0x411198 VFMADD231PD %YMM9,%YMM2,%YMM13 |
(104) 0x41119d VFMADD231PD %YMM4,%YMM2,%YMM3 |
(104) 0x4111a2 VFMADD231PD %YMM11,%YMM2,%YMM6 |
(104) 0x4111a7 VFMADD231PD %YMM5,%YMM2,%YMM7 |
(104) 0x4111ac VFMADD231PD %YMM14,%YMM2,%YMM12 |
(104) 0x4111b1 VFMADD231PD %YMM10,%YMM2,%YMM8 |
(104) 0x4111b6 VMOVDDUP %XMM6,%XMM1 |
(104) 0x4111ba VPERM2F128 $0x20,%YMM13,%YMM1,%YMM1 |
(104) 0x4111c0 VMOVDDUP %XMM7,%XMM4 |
(104) 0x4111c4 VPERM2F128 $0x20,%YMM3,%YMM4,%YMM4 |
(104) 0x4111ca VINSERTF128 $0x1,%XMM8,%YMM3,%YMM5 |
(104) 0x4111d0 VBLENDPD $0xa,%YMM4,%YMM5,%YMM4 |
(104) 0x4111d6 VINSERTF128 $0x1,%XMM12,%YMM13,%YMM5 |
(104) 0x4111dc VBLENDPD $0xa,%YMM1,%YMM5,%YMM1 |
(104) 0x4111e2 VSHUFPD $0x1,%YMM7,%YMM7,%YMM5 |
(104) 0x4111e7 VBLENDPD $0x4,%YMM3,%YMM5,%YMM5 |
(104) 0x4111ed VSHUFPD $0x4,%YMM7,%YMM7,%YMM7 |
(104) 0x4111f2 VPERM2F128 $0x31,%YMM8,%YMM3,%YMM3 |
(104) 0x4111f8 VPERM2F128 $0x31,%YMM7,%YMM8,%YMM7 |
(104) 0x4111fe VBLENDPD $0xa,%YMM3,%YMM7,%YMM3 |
(104) 0x411204 VSHUFPD $0x1,%YMM6,%YMM6,%YMM7 |
(104) 0x411209 VBLENDPD $0x4,%YMM13,%YMM7,%YMM7 |
(104) 0x41120f VSHUFPD $0x4,%YMM6,%YMM6,%YMM6 |
(104) 0x411214 VPERM2F128 $0x31,%YMM12,%YMM13,%YMM9 |
(104) 0x41121a VPERM2F128 $0x31,%YMM6,%YMM12,%YMM6 |
(104) 0x411220 VBLENDPD $0xa,%YMM9,%YMM6,%YMM6 |
(104) 0x411226 VBLENDPD $0x2,%YMM12,%YMM7,%YMM7 |
(104) 0x41122c VBLENDPD $0x2,%YMM8,%YMM5,%YMM5 |
(104) 0x411232 VMOVUPD %YMM6,0x40(%R9,%R11,1) |
(104) 0x411239 VMOVUPD %YMM3,0xa0(%R9,%R11,1) |
(104) 0x411243 VMOVUPD %YMM5,0x80(%R9,%R11,1) |
(104) 0x41124d VMOVUPD %YMM7,0x20(%R9,%R11,1) |
(104) 0x411254 VMOVUPD %YMM1,(%R9,%R11,1) |
(104) 0x41125a VMOVUPD %YMM4,0x60(%R9,%R11,1) |
(104) 0x411261 ADD $0x8,%R13D |
(104) 0x411265 ADD $0xc0,%R11 |
(104) 0x41126c CMP %R12D,%R13D |
(104) 0x41126f JLE 411050 |
0x411275 CMP %R15D,%R8D |
0x411278 VMOVDQU -0x50(%RBP),%XMM3 |
0x41127d JE 410f60 |
0x411283 JMP 411288 |
0x411285 XOR %R15D,%R15D |
0x411288 SUB %R15D,%R8D |
0x41128b MOVSXD %R15D,%R11 |
0x41128e SAL $0x3,%R11 |
0x411292 LEA (%R11,%R11,2),%R11 |
0x411296 ADD %R11,%R9 |
0x411299 ADD %R11,%R10 |
0x41129c NOPL (%RAX) |
(103) 0x4112a0 VMOVUPD (%R10,%R14,1),%XMM1 |
(103) 0x4112a6 VFMADD213PD (%R9,%R14,1),%XMM3,%XMM1 |
(103) 0x4112ac VMOVUPD %XMM1,(%R9,%R14,1) |
(103) 0x4112b2 VMOVSD 0x10(%R10,%R14,1),%XMM1 |
(103) 0x4112b9 VFMADD213SD 0x10(%R9,%R14,1),%XMM0,%XMM1 |
(103) 0x4112c0 VMOVSD %XMM1,0x10(%R9,%R14,1) |
(103) 0x4112c7 ADD $0x18,%R14 |
(103) 0x4112cb DEC %R8D |
(103) 0x4112ce JNE 4112a0 |
0x4112d0 JMP 410f60 |
/scratch_na/users/xoserete/qaas_runs/171-172-2581/intel/CoMD/build/CoMD/CoMD/src-openmp/timestep.c: 71 - 78 |
-------------------------------------------------------------------------------- |
71: #pragma omp parallel for |
72: for (int iBox=0; iBox<nBoxes; iBox++) |
73: { |
74: for (int iOff=MAXATOMS*iBox,ii=0; ii<s->boxes->nAtoms[iBox]; ii++,iOff++) |
75: { |
76: s->atoms->p[iOff][0] += dt*s->atoms->f[iOff][0]; |
77: s->atoms->p[iOff][1] += dt*s->atoms->f[iOff][1]; |
78: s->atoms->p[iOff][2] += dt*s->atoms->f[iOff][2]; |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.38 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 14.25 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.62 |
Bottlenecks | micro-operation queue |
Function | advanceVelocity.extracted |
Source | timestep.c:71-78 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 9.50 |
CQA cycles if no scalar integer | 4.00 |
CQA cycles if FP arith vectorized | 9.50 |
CQA cycles if fully vectorized | 0.67 |
Front-end cycles | 9.50 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 5.70 |
P1 cycles | 5.87 |
P2 cycles | 1.67 |
P3 cycles | 1.67 |
P4 cycles | 0.00 |
P5 cycles | 5.87 |
P6 cycles | 5.70 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 5.87 |
P11 cycles | 1.67 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 9.62 |
Stall cycles (UFS) | 0.00 |
Nb insns | 57.00 |
Nb uops | 57.00 |
Nb loads | 5.00 |
Nb stores | 0.00 |
Nb stack references | 1.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 4.63 |
Bytes prefetched | 0.00 |
Bytes loaded | 44.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 11.11 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.11 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 9.38 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 9.38 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 2.38 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 14.25 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.62 |
Bottlenecks | micro-operation queue |
Function | advanceVelocity.extracted |
Source | timestep.c:71-78 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 9.50 |
CQA cycles if no scalar integer | 4.00 |
CQA cycles if FP arith vectorized | 9.50 |
CQA cycles if fully vectorized | 0.67 |
Front-end cycles | 9.50 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 5.70 |
P1 cycles | 5.87 |
P2 cycles | 1.67 |
P3 cycles | 1.67 |
P4 cycles | 0.00 |
P5 cycles | 5.87 |
P6 cycles | 5.70 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 5.87 |
P11 cycles | 1.67 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | 9.62 |
Stall cycles (UFS) | 0.00 |
Nb insns | 57.00 |
Nb uops | 57.00 |
Nb loads | 5.00 |
Nb stores | 0.00 |
Nb stack references | 1.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 4.63 |
Bytes prefetched | 0.00 |
Bytes loaded | 44.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 11.11 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 11.11 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 9.38 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 9.38 |
Path / |
Function | advanceVelocity.extracted |
Source file and lines | timestep.c:71-78 |
Module | exec |
nb instructions | 57 |
nb uops | 57 |
loop length | 213 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 9.50 cycles |
front end | 9.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.70 | 5.87 | 1.67 | 1.67 | 0.00 | 5.87 | 5.70 | 0.00 | 0.00 | 0.00 | 5.87 | 1.67 |
cycles | 5.70 | 5.87 | 1.67 | 1.67 | 0.00 | 5.87 | 5.70 | 0.00 | 0.00 | 0.00 | 5.87 | 1.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 9.62 |
Stall cycles | 0.00 |
Front-end | 9.50 |
Dispatch | 5.87 |
Overall L1 | 9.50 |
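Vectorization ratio all | 11% |
Vectorization ratio load | 100% |
Vectorization ratio store | NA (no store vectorizable/vectorized instructions) |
Vectorization ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vectorization ratio add-sub | 0% |
Vectorization ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vectorization ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vectorization ratio other | 0% |
Vector-efficiency ratio all | 11% |
Vector-efficiency ratio load | 25% |
Vector-efficiency ratio store | NA (no store vectorizable/vectorized instructions) |
Vector-efficiency ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vector-efficiency ratio add-sub | 9% |
Vector-efficiency ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vector-efficiency ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vector-efficiency ratio other | 9% |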
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ADD $0x40,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %RCX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JE 410f0b <advanceVelocity.extracted+0x6b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%RDI,%RAX,1),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV (%RDX,%R8,4),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %R8D,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %ESI,%R11D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x3,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%RDI,%RAX,1),%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
SAL $0x6,%R15D | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV 0x20(%RBX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%R10),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x28(%R10),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOVSXD %R14D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
ADD %R15,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R14,%R14,2),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%R10,%R14,1),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x10,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x3,%R15 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R15,%R15,2),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%R9,%R15,1),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP %R13,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 411030 <advanceVelocity.extracted+0x190> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R10,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R9,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x10,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R15,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 411030 <advanceVelocity.extracted+0x190> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R11,%R11,2),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x10,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
LEA (%R11,%R11,2),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x8,%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 411285 <advanceVelocity.extracted+0x3e5> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA -0x1(%R15),%R12D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R14,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP %R15D,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVDQU -0x50(%RBP),%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
JE 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 411288 <advanceVelocity.extracted+0x3e8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SUB %R15D,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOVSXD %R15D,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
SAL $0x3,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R11,%R11,2),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R11,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R11,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
Function | advanceVelocity.extracted |
Source file and lines | timestep.c:71-78 |
Module | exec |
nb instructions | 57 |
nb uops | 57 |
loop length | 213 |
used x86 registers | 15 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 9.50 cycles |
front end | 9.50 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 5.70 | 5.87 | 1.67 | 1.67 | 0.00 | 5.87 | 5.70 | 0.00 | 0.00 | 0.00 | 5.87 | 1.67 |
cycles | 5.70 | 5.87 | 1.67 | 1.67 | 0.00 | 5.87 | 5.70 | 0.00 | 0.00 | 0.00 | 5.87 | 1.67 |
Cycles executing div or sqrt instructions | NA |
FE+BE cycles | 9.62 |
Stall cycles | 0.00 |
Front-end | 9.50 |
Dispatch | 5.87 |
Overall L1 | 9.50 |
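Vectorization ratio all | 11% |
Vectorization ratio load | 100% |
Vectorization ratio store | NA (no store vectorizable/vectorized instructions) |
Vectorization ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vectorization ratio add-sub | 0% |
Vectorization ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vectorization ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vectorization ratio other | 0% |
Vector-efficiency ratio all | 11% |
Vector-efficiency ratio load | 25% |
Vector-efficiency ratio store | NA (no store vectorizable/vectorized instructions) |
Vector-efficiency ratio mul | NA (no mul vectorizable/vectorized instructions) |
Vector-efficiency ratio add-sub | 9% |
Vector-efficiency ratio fma | NA (no fma vectorizable/vectorized instructions) |
Vector-efficiency ratio div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
Vector-efficiency ratio other | 9% |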
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ADD $0x40,%ESI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
CMP %RCX,%RDI | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
LEA 0x1(%RDI),%RDI | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JE 410f0b <advanceVelocity.extracted+0x6b> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%RDI,%RAX,1),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV (%RDX,%R8,4),%R8D | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
TEST %R8D,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 2 | 0.20 |
JLE 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
MOV %ESI,%R11D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x3,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%RDI,%RAX,1),%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
SAL $0x6,%R15D | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
MOV 0x20(%RBX),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x20(%R10),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
MOV 0x28(%R10),%R10 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 |
LEA -0x1(%R8),%R14D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
MOVSXD %R14D,%R14 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
ADD %R15,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
SAL $0x3,%R14 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R14,%R14,2),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%R10,%R14,1),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x10,%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
SAL $0x3,%R15 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R15,%R15,2),%R15 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
LEA (%R9,%R15,1),%R13 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP %R13,%R12 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 411030 <advanceVelocity.extracted+0x190> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
ADD %R10,%R15 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R9,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD $0x10,%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
CMP %R15,%R14 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
JB 411030 <advanceVelocity.extracted+0x190> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA (%R11,%R11,2),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD $0x10,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NOPW %CS:(%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |
LEA (%R11,%R11,2),%R14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R8D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
AND $-0x8,%R15D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
JE 411285 <advanceVelocity.extracted+0x3e5> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
LEA -0x1(%R15),%R12D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1-2 | 0.20 |
XOR %R13D,%R13D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
MOV %R14,%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 |
NOPL (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
CMP %R15D,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
VMOVDQU -0x50(%RBP),%XMM3 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0-1 | 0.33 |
JE 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 |
JMP 411288 <advanceVelocity.extracted+0x3e8> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5.84 |
XOR %R15D,%R15D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
SUB %R15D,%R8D | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
MOVSXD %R15D,%R11 | 1 | 0 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 0 | 0 | 0.33 | 0 | 1 | 0.33 |
SAL $0x3,%R11 | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0-2 | 0.50 |
LEA (%R11,%R11,2),%R11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
ADD %R11,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
ADD %R11,%R10 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 |
JMP 410f60 <advanceVelocity.extracted+0xc0> | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2.08 |