Loop Id: 504 | Module: libkripke.so | Source: forall.hpp:59-59 [...] | Coverage: 0.04% |
---|
Loop Id: 504 | Module: libkripke.so | Source: forall.hpp:59-59 [...] | Coverage: 0.04% |
---|
0x591b0 MOV 0xb8(%RSP),%R9 |
0x591b8 MOV 0xb0(%RSP),%RBX |
0x591c0 MOV %RDX,0xd8(%RSP) |
0x591c8 MOV %RCX,0xd0(%RSP) |
0x591d0 ADD %RCX,%R9 |
0x591d3 LEA (%RBX,%RDX,1),%R12 |
0x591d7 LEA (%R13,%R9,8),%R8 |
0x591dc XOR %R9D,%R9D |
0x591df NOP |
(503) 0x591e0 MOV 0xe8(%RSP),%RDX |
(503) 0x591e8 MOV 0xe0(%RSP),%RCX |
(503) 0x591f0 MOV (%RDX,%R9,8),%RDX |
(503) 0x591f4 MOVSXD (%RCX,%R9,4),%RCX |
(503) 0x591f8 LEA (%RCX,%RDX,1),%RSI |
(503) 0x591fc MOV %ECX,0x10c(%RSP) |
(503) 0x59203 CMP %RSI,%RDX |
(503) 0x59206 JGE 595e8 |
(503) 0x5920c LEA -0x1(%RCX),%RDI |
(503) 0x59210 MOV %RDX,0x118(%RSP) |
(503) 0x59218 CMP $0x2,%RDI |
(503) 0x5921c JBE 595f1 |
(503) 0x59222 MOV %RCX,%RBX |
(503) 0x59225 LEA (,%RDX,8),%RSI |
(503) 0x5922d XOR %EAX,%EAX |
(503) 0x5922f XOR %EDI,%EDI |
(503) 0x59231 SHR $0x2,%RBX |
(503) 0x59235 LEA (%R15,%RSI,1),%R10 |
(503) 0x59239 VXORPD %XMM1,%XMM1,%XMM1 |
(503) 0x5923d ADD %R14,%RSI |
(503) 0x59240 MOV %RBX,0x110(%RSP) |
(503) 0x59248 AND $0x7,%EBX |
(503) 0x5924b JE 5938c |
(503) 0x59251 CMP $0x1,%RBX |
(503) 0x59255 JE 59358 |
(503) 0x5925b CMP $0x2,%RBX |
(503) 0x5925f JE 59334 |
(503) 0x59265 CMP $0x3,%RBX |
(503) 0x59269 JE 59310 |
(503) 0x5926f CMP $0x4,%RBX |
(503) 0x59273 JE 592ec |
(503) 0x59275 CMP $0x5,%RBX |
(503) 0x59279 JE 592c8 |
(503) 0x5927b CMP $0x6,%RBX |
(503) 0x5927f JE 592a5 |
(503) 0x59281 VXORPS %XMM4,%XMM4,%XMM4 |
(503) 0x59285 VPMULLQ (%R10),%YMM2,%YMM4 |
(503) 0x5928b KMOVB %K1,%K3 |
(503) 0x5928f VGATHERQPD (%R8,%YMM4,8),%YMM6{%K3} |
(503) 0x59296 MOV $0x1,%EDI |
(503) 0x5929b MOV $0x20,%EAX |
(503) 0x592a0 VFMADD231PD (%RSI),%YMM6,%YMM1 |
(503) 0x592a5 VXORPS %XMM7,%XMM7,%XMM7 |
(503) 0x592a9 VPMULLQ (%R10,%RAX,1),%YMM2,%YMM7 |
(503) 0x592b0 KMOVB %K1,%K4 |
(503) 0x592b4 VGATHERQPD (%R8,%YMM7,8),%YMM8{%K4} |
(503) 0x592bb INC %RDI |
(503) 0x592be VFMADD231PD (%RSI,%RAX,1),%YMM8,%YMM1 |
(503) 0x592c4 ADD $0x20,%RAX |
(503) 0x592c8 VXORPS %XMM9,%XMM9,%XMM9 |
(503) 0x592cd VPMULLQ (%R10,%RAX,1),%YMM2,%YMM9 |
(503) 0x592d4 KMOVB %K1,%K7 |
(503) 0x592d8 VGATHERQPD (%R8,%YMM9,8),%YMM10{%K7} |
(503) 0x592df INC %RDI |
(503) 0x592e2 VFMADD231PD (%RSI,%RAX,1),%YMM10,%YMM1 |
(503) 0x592e8 ADD $0x20,%RAX |
(503) 0x592ec VXORPS %XMM11,%XMM11,%XMM11 |
(503) 0x592f1 VPMULLQ (%R10,%RAX,1),%YMM2,%YMM11 |
(503) 0x592f8 KMOVB %K1,%K6 |
(503) 0x592fc VGATHERQPD (%R8,%YMM11,8),%YMM12{%K6} |
(503) 0x59303 INC %RDI |
(503) 0x59306 VFMADD231PD (%RSI,%RAX,1),%YMM12,%YMM1 |
(503) 0x5930c ADD $0x20,%RAX |
(503) 0x59310 VXORPS %XMM13,%XMM13,%XMM13 |
(503) 0x59315 VPMULLQ (%R10,%RAX,1),%YMM2,%YMM13 |
(503) 0x5931c KMOVB %K1,%K5 |
(503) 0x59320 VGATHERQPD (%R8,%YMM13,8),%YMM14{%K5} |
(503) 0x59327 INC %RDI |
(503) 0x5932a VFMADD231PD (%RSI,%RAX,1),%YMM14,%YMM1 |
(503) 0x59330 ADD $0x20,%RAX |
(503) 0x59334 VXORPS %XMM15,%XMM15,%XMM15 |
(503) 0x59339 VPMULLQ (%R10,%RAX,1),%YMM2,%YMM15 |
(503) 0x59340 KMOVB %K1,%K2 |
(503) 0x59344 VGATHERQPD (%R8,%YMM15,8),%YMM0{%K2} |
(503) 0x5934b INC %RDI |
(503) 0x5934e VFMADD231PD (%RSI,%RAX,1),%YMM0,%YMM1 |
(503) 0x59354 ADD $0x20,%RAX |
(503) 0x59358 MOV 0x110(%RSP),%RBX |
(503) 0x59360 VXORPS %XMM3,%XMM3,%XMM3 |
(503) 0x59364 VPMULLQ (%R10,%RAX,1),%YMM2,%YMM3 |
(503) 0x5936b KMOVB %K1,%K3 |
(503) 0x5936f INC %RDI |
(503) 0x59372 VGATHERQPD (%R8,%YMM3,8),%YMM4{%K3} |
(503) 0x59379 VFMADD231PD (%RSI,%RAX,1),%YMM4,%YMM1 |
(503) 0x5937f ADD $0x20,%RAX |
(503) 0x59383 CMP %RBX,%RDI |
(503) 0x59386 JE 594a6 |
(506) 0x5938c VXORPS %XMM6,%XMM6,%XMM6 |
(506) 0x59390 VPMULLQ (%R10,%RAX,1),%YMM2,%YMM6 |
(506) 0x59397 KMOVB %K1,%K4 |
(506) 0x5939b VGATHERQPD (%R8,%YMM6,8),%YMM7{%K4} |
(506) 0x593a2 VXORPS %XMM8,%XMM8,%XMM8 |
(506) 0x593a7 VPMULLQ 0x20(%R10,%RAX,1),%YMM2,%YMM8 |
(506) 0x593af KMOVB %K1,%K7 |
(506) 0x593b3 VGATHERQPD (%R8,%YMM8,8),%YMM9{%K7} |
(506) 0x593ba VFMADD231PD (%RSI,%RAX,1),%YMM7,%YMM1 |
(506) 0x593c0 VXORPS %XMM10,%XMM10,%XMM10 |
(506) 0x593c5 VPMULLQ 0x40(%R10,%RAX,1),%YMM2,%YMM10 |
(506) 0x593cd KMOVB %K1,%K6 |
(506) 0x593d1 VGATHERQPD (%R8,%YMM10,8),%YMM11{%K6} |
(506) 0x593d8 VXORPS %XMM12,%XMM12,%XMM12 |
(506) 0x593dd VPMULLQ 0x60(%R10,%RAX,1),%YMM2,%YMM12 |
(506) 0x593e5 KMOVB %K1,%K5 |
(506) 0x593e9 VXORPS %XMM13,%XMM13,%XMM13 |
(506) 0x593ee VPMULLQ 0x80(%R10,%RAX,1),%YMM2,%YMM13 |
(506) 0x593f6 KMOVB %K1,%K2 |
(506) 0x593fa VGATHERQPD (%R8,%YMM13,8),%YMM14{%K2} |
(506) 0x59401 VXORPS %XMM15,%XMM15,%XMM15 |
(506) 0x59406 VPMULLQ 0xa0(%R10,%RAX,1),%YMM2,%YMM15 |
(506) 0x5940e KMOVB %K1,%K3 |
(506) 0x59412 VGATHERQPD (%R8,%YMM15,8),%YMM4{%K3} |
(506) 0x59419 VXORPS %XMM3,%XMM3,%XMM3 |
(506) 0x5941d VPMULLQ 0xc0(%R10,%RAX,1),%YMM2,%YMM3 |
(506) 0x59425 KMOVB %K1,%K4 |
(506) 0x59429 VGATHERQPD (%R8,%YMM3,8),%YMM0{%K4} |
(506) 0x59430 MOV 0x110(%RSP),%RBX |
(506) 0x59438 VXORPS %XMM6,%XMM6,%XMM6 |
(506) 0x5943c VPMULLQ 0xe0(%R10,%RAX,1),%YMM2,%YMM6 |
(506) 0x59444 KMOVB %K1,%K7 |
(506) 0x59448 ADD $0x8,%RDI |
(506) 0x5944c VFMADD231PD 0x20(%RSI,%RAX,1),%YMM9,%YMM1 |
(506) 0x59453 VFMADD132PD 0x40(%RSI,%RAX,1),%YMM1,%YMM11 |
(506) 0x5945a VGATHERQPD (%R8,%YMM12,8),%YMM1{%K5} |
(506) 0x59461 VFMADD132PD 0x60(%RSI,%RAX,1),%YMM11,%YMM1 |
(506) 0x59468 VFMADD132PD 0x80(%RSI,%RAX,1),%YMM1,%YMM14 |
(506) 0x59472 VGATHERQPD (%R8,%YMM6,8),%YMM1{%K7} |
(506) 0x59479 VFMADD132PD 0xa0(%RSI,%RAX,1),%YMM14,%YMM4 |
(506) 0x59483 VFMADD132PD 0xc0(%RSI,%RAX,1),%YMM4,%YMM0 |
(506) 0x5948d VFMADD132PD 0xe0(%RSI,%RAX,1),%YMM0,%YMM1 |
(506) 0x59497 ADD $0x100,%RAX |
(506) 0x5949d CMP %RBX,%RDI |
(506) 0x594a0 JNE 5938c |
(503) 0x594a6 VEXTRACTF64X2 $0x1,%YMM1,%XMM7 |
(503) 0x594ad VADDPD %XMM1,%XMM7,%XMM8 |
(503) 0x594b1 VUNPCKHPD %XMM8,%XMM8,%XMM9 |
(503) 0x594b6 VADDPD %XMM8,%XMM9,%XMM0 |
(503) 0x594bb TESTB $0x3,0x10c(%RSP) |
(503) 0x594c3 JE 59532 |
(503) 0x594c5 MOV %RCX,%RAX |
(503) 0x594c8 VADDPD %XMM1,%XMM7,%XMM3 |
(503) 0x594cc AND $-0x4,%RAX |
(503) 0x594d0 ADD %RAX,%RDX |
(503) 0x594d3 SUB %RAX,%RCX |
(503) 0x594d6 CMP $0x1,%RCX |
(503) 0x594da JE 5951a |
(503) 0x594dc MOV 0x118(%RSP),%RSI |
(503) 0x594e4 KMOVB %K1,%K6 |
(503) 0x594e8 ADD %RAX,%RSI |
(503) 0x594eb VXORPS %XMM10,%XMM10,%XMM10 |
(503) 0x594f0 VPMULLQ (%R15,%RSI,8),%XMM5,%XMM10 |
(503) 0x594f7 VGATHERQPD (%R8,%XMM10,8),%XMM11{%K6} |
(503) 0x594fe VFMADD132PD (%R14,%RSI,8),%XMM3,%XMM11 |
(503) 0x59504 VUNPCKHPD %XMM11,%XMM11,%XMM12 |
(503) 0x59509 VADDPD %XMM11,%XMM12,%XMM0 |
(503) 0x5950e TEST $0x1,%CL |
(503) 0x59511 JE 59532 |
(503) 0x59513 AND $-0x2,%RCX |
(503) 0x59517 ADD %RCX,%RDX |
(503) 0x5951a MOV (%R15,%RDX,8),%RCX |
(503) 0x5951e IMUL %R11,%RCX |
(503) 0x59522 ADD %R12,%RCX |
(503) 0x59525 VMOVSD (%R13,%RCX,8),%XMM13 |
(503) 0x5952c VFMADD231SD (%R14,%RDX,8),%XMM13,%XMM0 |
(503) 0x59532 MOV 0xf0(%RSP),%RDI |
(503) 0x5953a MOV 0x100(%RSP),%RDX |
(503) 0x59542 MOV 0xf8(%RSP),%RAX |
(503) 0x5954a VMOVSD (%RDI,%R9,8),%XMM14 |
(503) 0x59550 VFMADD132SD (%RDX,%R9,8),%XMM14,%XMM0 |
(503) 0x59556 VMOVSD %XMM0,(%RDI,%R9,8) |
(503) 0x5955c INC %R9 |
(503) 0x5955f CMP %RAX,%R9 |
(503) 0x59562 JNE 591e0 |
0x59568 MOV 0xd8(%RSP),%RDX |
0x59570 MOV 0xd0(%RSP),%RCX |
0x59578 MOV 0xc8(%RSP),%R8 |
0x59580 INC %RDX |
0x59583 INC %RCX |
0x59586 ADD %R8,0x100(%RSP) |
0x5958e CMP %RDX,0xc0(%RSP) |
0x59596 JNE 591b0 |
(503) 0x595e8 VXORPD %XMM0,%XMM0,%XMM0 |
(503) 0x595ec JMP 59532 |
(503) 0x595f1 VXORPD %XMM3,%XMM3,%XMM3 |
(503) 0x595f5 XOR %EAX,%EAX |
(503) 0x595f7 VXORPD %XMM0,%XMM0,%XMM0 |
(503) 0x595fb JMP 594d3 |
/home/eoseret/qaas_runs_CPU_9468/171-147-9160/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/util/View.hpp: 110 - 110 |
-------------------------------------------------------------------------------- |
110: return data[idx]; |
/home/eoseret/qaas_runs_CPU_9468/171-147-9160/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/index/IndexValue.hpp: 105 - 105 |
-------------------------------------------------------------------------------- |
105: return TYPE(value + a); |
/home/eoseret/qaas_runs_CPU_9468/171-147-9160/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/util/Layout.hpp: 55 - 55 |
-------------------------------------------------------------------------------- |
55: return a * b; |
/home/eoseret/qaas_runs_CPU_9468/171-147-9160/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/util/Operators.hpp: 304 - 304 |
-------------------------------------------------------------------------------- |
304: RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, |
/home/eoseret/qaas_runs_CPU_9468/171-147-9160/intel/Kripke/build/Kripke/src/Kripke/Kernel/Scattering.cpp: 87 - 97 |
-------------------------------------------------------------------------------- |
87: MixElem mix_start = zone_to_mixelem(z); |
88: MixElem mix_stop = mix_start + zone_to_num_mixelem(z); |
89: |
90: double sigs_z = 0.0; |
91: for(MixElem mix = mix_start;mix < mix_stop;++ mix){ |
92: Material mat = mixelem_to_material(mix); |
93: double fraction = mixelem_to_fraction(mix); |
94: |
95: sigs_z += sigs(mat, n, global_g, global_gp) * fraction; |
96: } |
97: phi_out(nm, g, z) += sigs_z * phi(nm, gp, z); |
/home/eoseret/qaas_runs_CPU_9468/171-147-9160/intel/Kripke/build/Kripke/tpl/raja/include/RAJA/policy/loop/forall.hpp: 59 - 59 |
-------------------------------------------------------------------------------- |
59: for (decltype(distance_it) i = 0; i < distance_it; ++i) { |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
○96.66 | gomp_thread_start | team.c:130 | libgomp.so.1.0.0 |
○3.34 | GOMP_parallel | libgomp.h:985 | libgomp.so.1.0.0 |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 |
Bottlenecks | micro-operation queue |
Function | void RAJA::internal::StatementExecutor |
Source | View.hpp:110-110,forall.hpp:59-59 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 2.83 |
CQA cycles if no scalar integer | 2.83 |
CQA cycles if FP arith vectorized | 2.83 |
CQA cycles if fully vectorized | 0.35 |
Front-end cycles | 2.83 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 0.70 |
P1 cycles | 0.60 |
P2 cycles | 2.33 |
P3 cycles | 2.33 |
P4 cycles | 1.50 |
P5 cycles | 0.60 |
P6 cycles | 0.50 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 1.50 |
P10 cycles | 0.60 |
P11 cycles | 2.33 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 17.00 |
Nb uops | 17.00 |
Nb loads | 7.00 |
Nb stores | 3.00 |
Nb stack references | 7.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 28.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 56.00 |
Bytes stored | 24.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 8.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.21 |
Bottlenecks | micro-operation queue |
Function | void RAJA::internal::StatementExecutor |
Source | View.hpp:110-110,forall.hpp:59-59 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 2.83 |
CQA cycles if no scalar integer | 2.83 |
CQA cycles if FP arith vectorized | 2.83 |
CQA cycles if fully vectorized | 0.35 |
Front-end cycles | 2.83 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 0.70 |
P1 cycles | 0.60 |
P2 cycles | 2.33 |
P3 cycles | 2.33 |
P4 cycles | 1.50 |
P5 cycles | 0.60 |
P6 cycles | 0.50 |
P7 cycles | 1.50 |
P8 cycles | 1.50 |
P9 cycles | 1.50 |
P10 cycles | 0.60 |
P11 cycles | 2.33 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 17.00 |
Nb uops | 17.00 |
Nb loads | 7.00 |
Nb stores | 3.00 |
Nb stack references | 7.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 28.24 |
Bytes prefetched | 0.00 |
Bytes loaded | 56.00 |
Bytes stored | 24.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 12.50 |
Vector-efficiency ratio load | 12.50 |
Vector-efficiency ratio store | 12.50 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 12.50 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Path / |
nb instructions | 17 |
nb uops | 17 |
loop length | 100 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
micro-operation queue | 2.83 cycles |
front end | 2.83 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.70 | 0.60 | 2.33 | 2.33 | 1.50 | 0.60 | 0.50 | 1.50 | 1.50 | 1.50 | 0.60 | 2.33 |
cycles | 0.70 | 0.60 | 2.33 | 2.33 | 1.50 | 0.60 | 0.50 | 1.50 | 1.50 | 1.50 | 0.60 | 2.33 |
Cycles executing div or sqrt instructions | NA |
Front-end | 2.83 |
Dispatch | 2.33 |
Overall L1 | 2.83 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0xb8(%RSP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
MOV 0xb0(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
MOV %RDX,0xd8(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
MOV %RCX,0xd0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
ADD %RCX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
LEA (%RBX,%RDX,1),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
LEA (%R13,%R9,8),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
MOV 0xd8(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
MOV 0xd0(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
MOV 0xc8(%RSP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
INC %RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
INC %RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
ADD %R8,0x100(%RSP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 | scal (12.5%) |
CMP %RDX,0xc0(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
JNE 591b0 <_ZN4RAJA8internal17StatementExecutorINS_9statement8CollapseINS_26omp_parallel_collapse_execEN4camp7int_seqIlJLl0ELl1EEEEJNS2_3ForILl2ENS_6policy4loop9loop_execEJNS8_ILl3ESB_JNS2_6LambdaILl0EJEEEEEEEEEEEEE4execIRNS0_8LoopDataINS5_4listIJSG_EEENS5_5tupleIJNS_4impl4SpanINS_9Iterators16numeric_iteratorIN6Kripke6MomentElPSS_EESS_EENSO_INSQ_INSR_5GroupElPSW_EESW_EESZ_NSO_INSQ_INSR_4ZoneElPS10_EES10_EEEEENSM_IJEEEJZNK14ScatteringSdomclINSR_11ArchLayoutTINSR_12ArchT_OpenMPENSR_11LayoutT_DGZEEEEEvT_NSR_6SdomIdES1D_RKNSR_4Core3SetES1H_S1H_RNS1E_5FieldIdJSS_SW_S10_EEES1K_RNS1I_IdJNSR_8MaterialENSR_8LegendreENSR_11GlobalGroupES1N_EEERNS1I_INSR_7MixElemEJS10_EEERNS1I_IiJS10_EEERNS1I_IS1L_JS1Q_EEERNS1I_IdJS1Q_EEERNS1I_IS1M_JSS_EEEEUlSS_SW_SW_S10_E_EEEEEvOS1C_._omp_fn.0+0x360> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |
nb instructions | 17 |
nb uops | 17 |
loop length | 100 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 0 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 7 |
micro-operation queue | 2.83 cycles |
front end | 2.83 cycles |
Port | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.70 | 0.60 | 2.33 | 2.33 | 1.50 | 0.60 | 0.50 | 1.50 | 1.50 | 1.50 | 0.60 | 2.33 |
cycles | 0.70 | 0.60 | 2.33 | 2.33 | 1.50 | 0.60 | 0.50 | 1.50 | 1.50 | 1.50 | 0.60 | 2.33 |
Cycles executing div or sqrt instructions | NA |
Front-end | 2.83 |
Dispatch | 2.33 |
Overall L1 | 2.83 |
all | 0% |
load | 0% |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 12% |
load | 12% |
store | 12% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | Latency | Recip. throughput | Vectorization |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0xb8(%RSP),%R9 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
MOV 0xb0(%RSP),%RBX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
MOV %RDX,0xd8(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
MOV %RCX,0xd0(%RSP) | 1 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0.50 | 0 | 0 | 1 | 0.50 | scal (12.5%) |
ADD %RCX,%R9 | 1 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0 | 1 | 0.20 | N/A |
LEA (%RBX,%RDX,1),%R12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
LEA (%R13,%R9,8),%R8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
XOR %R9D,%R9D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
NOP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.17 | N/A |
MOV 0xd8(%RSP),%RDX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | N/A |
MOV 0xd0(%RSP),%RCX | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
MOV 0xc8(%RSP),%R8 | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 1 | 0.33 | scal (12.5%) |
INC %RDX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | N/A |
INC %RCX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.17 | scal (12.5%) |
ADD %R8,0x100(%RSP) | 2 | 0.20 | 0.20 | 0.33 | 0.33 | 0.50 | 0.20 | 0.20 | 0.50 | 0.50 | 0.50 | 0.20 | 0.33 | 1 | 0.50 | scal (12.5%) |
CMP %RDX,0xc0(%RSP) | 1 | 0.20 | 0.20 | 0.33 | 0.33 | 0 | 0.20 | 0.20 | 0 | 0 | 0 | 0.20 | 0.33 | 1 | 0.33 | N/A |
JNE 591b0 <_ZN4RAJA8internal17StatementExecutorINS_9statement8CollapseINS_26omp_parallel_collapse_execEN4camp7int_seqIlJLl0ELl1EEEEJNS2_3ForILl2ENS_6policy4loop9loop_execEJNS8_ILl3ESB_JNS2_6LambdaILl0EJEEEEEEEEEEEEE4execIRNS0_8LoopDataINS5_4listIJSG_EEENS5_5tupleIJNS_4impl4SpanINS_9Iterators16numeric_iteratorIN6Kripke6MomentElPSS_EESS_EENSO_INSQ_INSR_5GroupElPSW_EESW_EESZ_NSO_INSQ_INSR_4ZoneElPS10_EES10_EEEEENSM_IJEEEJZNK14ScatteringSdomclINSR_11ArchLayoutTINSR_12ArchT_OpenMPENSR_11LayoutT_DGZEEEEEvT_NSR_6SdomIdES1D_RKNSR_4Core3SetES1H_S1H_RNS1E_5FieldIdJSS_SW_S10_EEES1K_RNS1I_IdJNSR_8MaterialENSR_8LegendreENSR_11GlobalGroupES1N_EEERNS1I_INSR_7MixElemEJS10_EEERNS1I_IiJS10_EEERNS1I_IS1L_JS1Q_EEERNS1I_IdJS1Q_EEERNS1I_IS1M_JSS_EEEEUlSS_SW_SW_S10_E_EEEEEvOS1C_._omp_fn.0+0x360> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | N/A |