Loop Id: 587 | Module: exec | Source: flux_calc_kernel.f90-pp.f90:54-62 | Coverage: 0.02% |
---|
Loop Id: 587 | Module: exec | Source: flux_calc_kernel.f90-pp.f90:54-62 | Coverage: 0.02% |
---|
0x4252e8 LDP X8, X9, [SP, #88] |
0x4252ec ADD X7, X7, X15 |
0x4252f0 ADD X20, X20, X15 |
0x4252f4 SUBS W11, W11, #1 |
0x4252f8 ADD X21, X21, X15 |
0x4252fc ADD X22, X22, X15 |
0x425300 ADD X1, X1, X15 |
0x425304 ADD X2, X2, X15 |
0x425308 ADD X5, X5, X15 |
0x42530c ADD X6, X6, X15 |
0x425310 ADD X23, X23, X8 |
0x425314 ADD X25, X25, X8 |
0x425318 LDR X8, [SP, #104] |
0x42531c ADD X27, X27, X9 |
0x425320 ADD X26, X26, X8 |
0x425324 ADD X29, X29, X8 |
0x425328 B.LE 4254b0 |
0x42532c LDR X8, [SP, #112] |
0x425330 LDR W0, [SP, #124] |
0x425334 CMP X8, X12 |
0x425338 ORR X8, XZR, X28 |
0x42533c B.CC 4253d4 |
0x425340 LDR X9, [SP, #112] |
0x425344 LDR W10, [SP, #124] |
0x425348 ORR X14, XZR, XZR |
0x42534c UDIV X8, X9, X12 |
0x425350 MADD X13, X8, X12, XZR |
0x425354 SUB X9, X9, X13 |
0x425358 ADD X8, X13, X28 |
0x42535c SUB W0, W10, W13 |
(588) 0x425360 LD1D {Z2.D}, P0/Z, [X20, X14,LSL #3] |
(588) 0x425364 LD1D {Z3.D}, P0/Z, [X6, X14,LSL #3] |
(588) 0x425368 UBFM X16, X14, #61, #60 |
(588) 0x42536c ADD X17, X22, X16 |
(588) 0x425370 LD1D {Z4.D}, P0/Z, [X7, X14,LSL #3] |
(588) 0x425374 LD1D {Z5.D}, P0/Z, [X5, X14,LSL #3] |
(588) 0x425378 ADD X16, X21, X16 |
(588) 0x42537c FADD Z2.D, Z3.D, Z2.D |
(588) 0x425380 FADD Z3.D, Z4.D, Z5.D |
(588) 0x425384 FADD Z2.D, Z2.D, Z3.D |
(588) 0x425388 LD1D {Z3.D}, P0/Z, [X2, X14,LSL #3] |
(588) 0x42538c FMUL Z3.D, Z3.D, Z1.D |
(588) 0x425390 FMUL Z2.D, Z3.D, Z2.D |
(588) 0x425394 ST1D {Z2.D}, P0, [X1, X14,LSL #3] |
(588) 0x425398 LD1D {Z2.D}, P0/Z, [X17, X19,LSL #3] |
(588) 0x42539c LD1D {Z3.D}, P0/Z, [X17, MUL VL] |
(588) 0x4253a0 LD1D {Z4.D}, P0/Z, [X16, X19,LSL #3] |
(588) 0x4253a4 LD1D {Z5.D}, P0/Z, [X16, MUL VL] |
(588) 0x4253a8 FADD Z2.D, Z3.D, Z2.D |
(588) 0x4253ac FADD Z3.D, Z4.D, Z5.D |
(588) 0x4253b0 FADD Z2.D, Z2.D, Z3.D |
(588) 0x4253b4 LD1D {Z3.D}, P0/Z, [X25, X14,LSL #3] |
(588) 0x4253b8 FMUL Z3.D, Z1.D, Z3.D |
(588) 0x4253bc FMUL Z2.D, Z2.D, Z3.D |
(588) 0x4253c0 ST1D {Z2.D}, P0, [X23, X14,LSL #3] |
(588) 0x4253c4 ADD X14, X14, X12 |
(588) 0x4253c8 CMP X13, X14 |
(588) 0x4253cc B.NE 425360 |
0x4253d0 CBZ X9, 4252e8 |
0x4253d4 LDP X17, X10, [SP, #64] |
0x4253d8 ADD X9, X8, X26 |
0x4253dc ADD X13, X8, X27 |
0x4253e0 ADD W0, W0, #1 |
0x4253e4 UBFM X9, X9, #61, #60 |
0x4253e8 ADD X8, X8, X29 |
0x4253ec UBFM X3, X13, #61, #60 |
0x4253f0 UBFM X13, X8, #61, #60 |
0x4253f4 ORR X16, XZR, XZR |
0x4253f8 ADD X28, X24, X9 |
0x4253fc ADD X4, X10, X9 |
0x425400 LDR X9, [SP, #80] |
0x425404 ADD X18, X17, X13 |
0x425408 ADD X17, X10, X13 |
0x42540c LDR X10, [SP, #32] |
0x425410 ADD X30, X9, X3 |
0x425414 LDP X9, X8, [SP, #16] |
0x425418 ADD X3, X10, X3 |
0x42541c ADD X14, X9, X13 |
0x425420 LDR X9, [SP, #56] |
0x425424 ADD X8, X8, X13 |
0x425428 ADD X9, X9, X13 |
0x42542c ADD X13, X24, X13 |
0x425430 HINT #0 |
0x425434 HINT #0 |
0x425438 HINT #0 |
0x42543c HINT #0 |
(586) 0x425440 LDR D2, [X28, X16] |
(586) 0x425444 LDR D3, [X13, X16] |
(586) 0x425448 LDR D4, [X4, X16] |
(586) 0x42544c ADD X10, X8, X16 |
(586) 0x425450 SUB W0, W0, #1 |
(586) 0x425454 CMP W0, #1 |
(586) 0x425458 LDR D5, [X17, X16] |
(586) 0x42545c FADD D2, D3, D2 |
(586) 0x425460 FADD D3, D4, D5 |
(586) 0x425464 FADD D2, D2, D3 |
(586) 0x425468 LDR D3, [X18, X16] |
(586) 0x42546c FMUL D3, D3, D0 |
(586) 0x425470 FMUL D2, D3, D2 |
(586) 0x425474 STR D2, [X9, X16] |
(586) 0x425478 LDP D3, D2, [X10, #1016] |
(586) 0x42547c ADD X10, X14, X16 |
(586) 0x425480 LDP D5, D4, [X10, #1016] |
(586) 0x425484 FADD D2, D3, D2 |
(586) 0x425488 FADD D3, D4, D5 |
(586) 0x42548c FADD D2, D2, D3 |
(586) 0x425490 LDR D3, [X30, X16] |
(586) 0x425494 FMUL D3, D0, D3 |
(586) 0x425498 FMUL D2, D2, D3 |
(586) 0x42549c STR D2, [X3, X16] |
(586) 0x4254a0 ADD X16, X16, #8 |
(586) 0x4254a4 B.HI 425440 |
0x4254a8 LDR X28, [SP, #40] |
0x4254ac B 4252e8 |
/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/armclang_5/CMakeFiles/clover_leaf.dir/CloverLeaf_ref/kernels/flux_calc_kernel.f90-pp.f90: 54 - 62 |
-------------------------------------------------------------------------------- |
54: !$OMP DO |
55: DO k=y_min,y_max+1 |
56: !$OMP SIMD |
57: DO j=x_min,x_max+1 |
58: vol_flux_x(j,k)=0.25_8*dt*xarea(j,k) & |
59: *(xvel0(j,k)+xvel0(j,k+1)+xvel1(j,k)+xvel1(j,k+1)) |
60: vol_flux_y(j,k)=0.25_8*dt*yarea(j,k) & |
61: *(yvel0(j,k)+yvel0(j+1,k)+yvel1(j,k)+yvel1(j+1,k)) |
62: ENDDO |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.39 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.44 |
Bottlenecks | |
Function | __nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_ |
Source | flux_calc_kernel.f90-pp.f90:54-56,flux_calc_kernel.f90-pp.f90:61-62 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 7.00 |
CQA cycles if no scalar integer | 7.00 |
CQA cycles if FP arith vectorized | 7.00 |
CQA cycles if fully vectorized | 1.59 |
Front-end cycles | 4.88 |
DIV/SQRT cycles | 0.50 - 0.25 |
P0 cycles | 1.50 |
P1 cycles | 1.50 |
P2 cycles | 7.00 |
P3 cycles | 7.00 |
P4 cycles | 7.00 |
P5 cycles | 7.00 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 2.67 |
P11 cycles | 2.67 |
P12 cycles | 2.67 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 41.00 |
Nb uops | 39.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 10.25 |
Bytes prefetched | 0.00 |
Bytes loaded | 74.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 24.26 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 24.13 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | 25.00 |
Vector-efficiency ratio other | 25.00 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.29 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.42 |
Bottlenecks | P2, P3, P4, P5, |
Function | __nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_ |
Source | flux_calc_kernel.f90-pp.f90:54-56,flux_calc_kernel.f90-pp.f90:61-62 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 5.50 |
CQA cycles if no scalar integer | 5.50 |
CQA cycles if FP arith vectorized | 5.50 |
CQA cycles if fully vectorized | 1.28 |
Front-end cycles | 3.88 |
DIV/SQRT cycles | 1.00 - 0.50 |
P0 cycles | 1.50 |
P1 cycles | 1.50 |
P2 cycles | 5.50 |
P3 cycles | 5.50 |
P4 cycles | 5.50 |
P5 cycles | 5.50 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 2.00 |
P11 cycles | 2.00 |
P12 cycles | 2.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 31.00 |
Nb uops | 31.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.73 |
Bytes prefetched | 0.00 |
Bytes loaded | 48.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | 0.00 |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 24.38 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 24.22 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | 25.00 |
Vector-efficiency ratio other | 25.00 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.46 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.45 |
Bottlenecks | P2, P3, P4, P5, |
Function | __nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_ |
Source | flux_calc_kernel.f90-pp.f90:54-56,flux_calc_kernel.f90-pp.f90:61-62 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 8.50 |
CQA cycles if no scalar integer | 8.50 |
CQA cycles if FP arith vectorized | 8.50 |
CQA cycles if fully vectorized | 1.91 |
Front-end cycles | 5.88 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 1.50 |
P1 cycles | 1.50 |
P2 cycles | 8.50 |
P3 cycles | 8.50 |
P4 cycles | 8.50 |
P5 cycles | 8.50 |
P6 cycles | 0.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 3.33 |
P11 cycles | 3.33 |
P12 cycles | 3.33 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 51.00 |
Nb uops | 47.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 11.76 |
Bytes prefetched | 0.00 |
Bytes loaded | 100.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 24.14 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 24.04 |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 25.00 |
Path / |
Function | __nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_ |
Source file and lines | flux_calc_kernel.f90-pp.f90:54-62 |
Module | exec |
nb instructions | 41 |
loop length | 164 |
nb stack references | 0 |
front end | 4.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.50 | 1.50 | 7.00 | 7.00 | 7.00 | 7.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 |
cycles | 1.50 | 1.50 | 7.00 | 7.00 | 7.00 | 7.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 0.50-0.25 |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 4.88 |
Data deps. | 1.00 |
Overall L1 | 7.00 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
Function | __nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_ |
Source file and lines | flux_calc_kernel.f90-pp.f90:54-62 |
Module | exec |
nb instructions | 31 |
loop length | 124 |
nb stack references | 0 |
front end | 3.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.50 | 1.50 | 5.50 | 5.50 | 5.50 | 5.50 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | 2.00 | 2.00 | 0.00 | 0.00 |
cycles | 1.50 | 1.50 | 5.50 | 5.50 | 5.50 | 5.50 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 | 2.00 | 2.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 3.88 |
Data deps. | 1.00 |
Overall L1 | 5.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LDP X8, X9, [SP, #88] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
ADD X7, X7, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X20, X20, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUBS W11, W11, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
ADD X21, X21, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X22, X22, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X1, X1, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X2, X2, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X5, X5, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X6, X6, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X23, X23, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X25, X25, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X8, [SP, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X27, X27, X9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X26, X26, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X29, X29, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
B.LE 4254b0 <__nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_+0x370> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
LDR X8, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W0, [SP, #124] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
CMP X8, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
ORR X8, XZR, X28 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
B.CC 4253d4 <__nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_+0x294> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
LDR X9, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W10, [SP, #124] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ORR X14, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
UDIV X8, X9, X12 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-20 | 1-0.50 |
MADD X13, X8, X12, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SUB X9, X9, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X8, X13, X28 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W0, W10, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CBZ X9, 4252e8 <__nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_+0x1a8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Function | __nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_ |
Source file and lines | flux_calc_kernel.f90-pp.f90:54-62 |
Module | exec |
nb instructions | 51 |
loop length | 204 |
nb stack references | 0 |
front end | 5.88 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.50 | 1.50 | 8.50 | 8.50 | 8.50 | 8.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.33 | 3.33 | 3.33 | 0.00 | 0.00 |
cycles | 1.50 | 1.50 | 8.50 | 8.50 | 8.50 | 8.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.33 | 3.33 | 3.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 5.88 |
Data deps. | 1.00 |
Overall L1 | 8.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LDP X8, X9, [SP, #88] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
ADD X7, X7, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X20, X20, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUBS W11, W11, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
ADD X21, X21, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X22, X22, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X1, X1, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X2, X2, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X5, X5, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X6, X6, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X23, X23, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X25, X25, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X8, [SP, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X27, X27, X9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X26, X26, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X29, X29, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
B.LE 4254b0 <__nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_+0x370> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
LDR X8, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W0, [SP, #124] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
CMP X8, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
ORR X8, XZR, X28 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
B.CC 4253d4 <__nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_+0x294> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
LDP X17, X10, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
ADD X9, X8, X26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X13, X8, X27 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W0, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
UBFM X9, X9, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X8, X8, X29 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
UBFM X3, X13, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
UBFM X13, X8, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR X16, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X28, X24, X9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X4, X10, X9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X9, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X18, X17, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X17, X10, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X10, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X30, X9, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP X9, X8, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
ADD X3, X10, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X14, X9, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X9, [SP, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X8, X8, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X9, X9, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X13, X24, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
LDR X28, [SP, #40] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
B 4252e8 <__nv_flux_calc_kernel_module_flux_calc_kernel__F1L49_1_+0x1a8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |