Loop Id: 1097 | Module: exec | Source: viscosity_kernel.f90-pp.f90:56-89 | Coverage: 11.51% |
---|
Loop Id: 1097 | Module: exec | Source: viscosity_kernel.f90-pp.f90:56-89 | Coverage: 11.51% |
---|
0x44ce80 UBFM X3, X10, #61, #60 |
0x44ce84 ADD X4, X26, X3 |
0x44ce88 ADD X6, X28, X3 |
0x44ce8c LD1D {Z23.D}, P0/Z, [X4, X0,LSL #3] [4] |
0x44ce90 LD1D {Z24.D}, P0/Z, [X6, X0,LSL #3] [2] |
0x44ce94 LD1D {Z25.D}, P0/Z, [X4, MUL VL] [15] |
0x44ce98 LD1D {Z26.D}, P0/Z, [X6, MUL VL] [11] |
0x44ce9c ADD X4, X25, X3 |
0x44cea0 LD1D {Z29.D}, P0/Z, [X4, X0,LSL #3] [7] |
0x44cea4 LD1D {Z30.D}, P0/Z, [X4, MUL VL] [16] |
0x44cea8 ADD X4, X27, X3 |
0x44ceac ADD X3, X22, X3 |
0x44ceb0 FADD Z28.D, Z25.D, Z26.D |
0x44ceb4 FADD Z27.D, Z24.D, Z23.D |
0x44ceb8 FADD Z23.D, Z23.D, Z25.D |
0x44cebc FADD Z24.D, Z24.D, Z26.D |
0x44cec0 FSUB Z23.D, Z23.D, Z24.D |
0x44cec4 ADD Z24.D, Z20.D, Z22.D |
0x44cec8 FSUB Z27.D, Z27.D, Z28.D |
0x44cecc LD1D {Z28.D}, P0/Z, [X4, X0,LSL #3] [18] |
0x44ced0 LD1D {Z31.D}, P0/Z, [X4, MUL VL] [17] |
0x44ced4 LD1D {Z10.D}, P0/Z, [X3, MUL VL] [1] |
0x44ced8 FADD Z8.D, Z30.D, Z29.D |
0x44cedc ADD Z22.D, Z22.D, Z6.D |
0x44cee0 FMOV X4, D24 |
0x44cee4 FMUL Z23.D, P0/M, Z23.D, #0 |
0x44cee8 LD1D {Z25.D}, P0/Z, [X18, X4,LSL #3] [6] |
0x44ceec LD1D {Z26.D}, P0/Z, [X2, X4,LSL #3] [10] |
0x44cef0 FADD Z9.D, Z28.D, Z31.D |
0x44cef4 FADD Z28.D, Z29.D, Z28.D |
0x44cef8 FADD Z29.D, Z30.D, Z31.D |
0x44cefc FSUB Z28.D, Z28.D, Z29.D |
0x44cf00 FSUB Z30.D, Z8.D, Z9.D |
0x44cf04 MOVPRFX Z8, Z27 |
0x44cf08 FMUL Z8.D, P0/M, Z8.D, #0 |
0x44cf0c FMUL Z28.D, P0/M, Z28.D, #0 |
0x44cf10 FSUB Z25.D, Z25.D, Z26.D |
0x44cf14 LD1D {Z26.D}, P0/Z, [X3, X0,LSL #3] [12] |
0x44cf18 FDIV Z28.D, P0/M, Z28.D, Z10.D |
0x44cf1c FADD Z26.D, Z26.D, Z10.D |
0x44cf20 FDIV Z25.D, P0/M, Z25.D, Z26.D |
0x44cf24 LD1D {Z26.D}, P0/Z, [X19, X10,LSL #3] [13] |
0x44cf28 FMAD Z23.D, P0/M, Z19.D, Z28.D |
0x44cf2c LD1D {Z28.D}, P0/Z, [X16, X10,LSL #3] [8] |
0x44cf30 ADD X10, X10, X13 |
0x44cf34 CMP X15, X10 |
0x44cf38 FMUL Z29.D, Z25.D, Z25.D |
0x44cf3c FSUB Z26.D, Z26.D, Z28.D |
0x44cf40 FMUL Z23.D, Z25.D, Z23.D |
0x44cf44 FMUL Z28.D, Z18.D, Z30.D |
0x44cf48 FMUL Z30.D, P0/M, Z30.D, #0 |
0x44cf4c FCMGE P2.D, P0/Z, Z25.D, #0 |
0x44cf50 FABS Z25.D, P0/M, Z25.D |
0x44cf54 FMUL Z26.D, Z26.D, Z21.D |
0x44cf58 FMAXNM Z25.D, P0/M, Z25.D, Z2.D |
0x44cf5c FMUL Z8.D, Z8.D, Z29.D |
0x44cf60 FMAD Z27.D, P0/M, Z10.D, Z28.D |
0x44cf64 FMUL Z31.D, Z26.D, Z26.D |
0x44cf68 FDIV Z8.D, P0/M, Z8.D, Z10.D |
0x44cf6c FMUL Z30.D, Z30.D, Z31.D |
0x44cf70 FADD Z28.D, Z31.D, Z29.D |
0x44cf74 FMAXNM Z28.D, P0/M, Z28.D, Z2.D |
0x44cf78 FMAD Z23.D, P0/M, Z26.D, Z8.D |
0x44cf7c FABS Z26.D, P0/M, Z26.D |
0x44cf80 FMAXNM Z26.D, P0/M, Z26.D, Z2.D |
0x44cf84 FMLA Z23.D, P0/M, Z30.D, Z19.D |
0x44cf88 FDIV Z23.D, P0/M, Z23.D, Z28.D |
0x44cf8c FCMLE P1.D, P0/Z, Z23.D, #0 |
0x44cf90 FCMLT P1.D, P1/Z, Z27.D, #0 |
0x44cf94 MOVPRFX Z27, Z25 |
0x44cf98 FNEG Z27.D, P0/M, Z25.D |
0x44cf9c SEL Z25.D, P2, Z25.D, Z27.D |
0x44cfa0 MOVPRFX Z27, Z26 |
0x44cfa4 FNEG Z27.D, P0/M, Z26.D |
0x44cfa8 FCMGE P2.D, P0/Z, Z25.D, #0 |
0x44cfac SEL Z26.D, P2, Z26.D, Z27.D |
0x44cfb0 FMUL Z27.D, Z25.D, Z25.D |
0x44cfb4 FMLA Z27.D, P0/M, Z26.D, Z26.D |
0x44cfb8 FSQRT Z27.D, P0/M, Z27.D |
0x44cfbc FMUL Z28.D, Z27.D, Z10.D |
0x44cfc0 FMUL Z27.D, Z27.D, Z18.D |
0x44cfc4 FDIVR Z25.D, P0/M, Z25.D, Z28.D |
0x44cfc8 FDIVR Z26.D, P0/M, Z26.D, Z27.D |
0x44cfcc FABS Z25.D, P0/M, Z25.D |
0x44cfd0 FABS Z26.D, P0/M, Z26.D |
0x44cfd4 FMINNM Z25.D, P0/M, Z25.D, Z26.D |
0x44cfd8 LD1D {Z26.D}, P1/Z, [X12, Z5.D,LSL #3] [9] |
0x44cfdc FMUL Z23.D, Z25.D, Z23.D |
0x44cfe0 ADR Z26.D, [Z26, Z24.D,LSL #3] [14] |
0x44cfe4 FMUL Z23.D, Z23.D, Z23.D |
0x44cfe8 LD1D {Z26.D}, P1/Z, [V26.D] [3] |
0x44cfec FADD Z25.D, Z26.D, Z26.D |
0x44cff0 FMUL Z23.D, Z23.D, Z25.D |
0x44cff4 SEL Z23.D, P1, Z23.D, Z5.D |
0x44cff8 ST1D {Z23.D}, P0, [X17, Z24.D,LSL #3] [5] |
0x44cffc B.NE 44ce80 |
/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/armclang_5/CMakeFiles/clover_leaf.dir/CloverLeaf_ref/kernels/viscosity_kernel.f90-pp.f90: 56 - 89 |
-------------------------------------------------------------------------------- |
56: DO j=x_min,x_max |
57: ugrad=(xvel0(j+1,k )+xvel0(j+1,k+1))-(xvel0(j ,k )+xvel0(j ,k+1)) |
58: # 58 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
59: vgrad=(yvel0(j ,k+1)+yvel0(j+1,k+1))-(yvel0(j ,k )+yvel0(j+1,k )) |
60: # 60 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
61: div = (celldx(j)*(ugrad)+ celldy(k)*(vgrad)) |
62: # 62 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
63: strain2 = 0.5_8*(xvel0(j, k+1) + xvel0(j+1,k+1)-xvel0(j ,k )-xvel0(j+1,k ))/celldy(k) & |
64: + 0.5_8*(yvel0(j+1,k ) + yvel0(j+1,k+1)-yvel0(j ,k )-yvel0(j ,k+1))/celldx(j) |
65: # 65 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
66: pgradx=(pressure(j+1,k)-pressure(j-1,k))/(celldx(j)+celldx(j+1)) |
67: pgrady=(pressure(j,k+1)-pressure(j,k-1))/(celldy(k)+celldy(k+1)) |
68: # 68 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
69: pgradx2 = pgradx*pgradx |
70: pgrady2 = pgrady*pgrady |
71: # 71 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
72: limiter = ((0.5_8*(ugrad)/celldx(j))*pgradx2+(0.5_8*(vgrad)/celldy(k))*pgrady2+strain2*pgradx*pgrady) & |
73: /MAX(pgradx2+pgrady2,1.0e-16_8) |
74: # 74 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
75: IF ((limiter.GT.0.0).OR.(div.GE.0.0))THEN |
76: viscosity(j,k) = 0.0 |
77: ELSE |
78: dirx=1.0_8 |
79: IF(pgradx.LT.0.0) dirx=-1.0_8 |
80: pgradx = dirx*MAX(1.0e-16_8,ABS(pgradx)) |
81: diry=1.0_8 |
82: IF(pgradx.LT.0.0) diry=-1.0_8 |
83: pgrady = diry*MAX(1.0e-16_8,ABS(pgrady)) |
84: pgrad = SQRT(pgradx**2+pgrady**2) |
85: xgrad = ABS(celldx(j)*pgrad/pgradx) |
86: ygrad = ABS(celldy(k)*pgrad/pgrady) |
87: grad = MIN(xgrad,ygrad) |
88: grad2 = grad*grad |
89: # 89 "/home/hbollore/qaas-runs/170-307-1706/intel/CloverLeafFC/build/CloverLeafFC/CloverLeaf_ref/kernels/viscosity_kernel.f90" |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.01 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.79 |
Bottlenecks | P6, P7, |
Function | __nv_viscosity_kernel_module_viscosity_kernel__F1L50_1_ |
Source | viscosity_kernel.f90-pp.f90:56-89 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 35.00 |
CQA cycles if no scalar integer | 34.50 |
CQA cycles if FP arith vectorized | 35.00 |
CQA cycles if fully vectorized | 34.63 |
Front-end cycles | 12.00 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 2.00 |
P2 cycles | 2.00 |
P3 cycles | 2.00 |
P4 cycles | 2.00 |
P5 cycles | 35.00 |
P6 cycles | 35.00 |
P7 cycles | 19.50 |
P8 cycles | 19.50 |
P9 cycles | 8.00 |
P10 cycles | 8.00 |
P11 cycles | 1.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 7.00 - 3.50 |
Inter-iter dependencies cycles | 2 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 96.00 |
Nb uops | 96.00 |
Nb loads | NA |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 5.83 |
Nb FLOP add-sub | 68.00 |
Nb FLOP mul | 68.00 |
Nb FLOP fma | 20.00 |
Nb FLOP div | 24.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 4.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 15.54 |
Bytes prefetched | 0.00 |
Bytes loaded | 512.00 |
Bytes stored | 32.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 94.25 |
Vectorization ratio load | 94.12 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 95.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 80.95 |
Vector-efficiency ratio all | 98.28 |
Vector-efficiency ratio load | 95.59 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 96.25 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 96.43 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.01 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 1.01 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.79 |
Bottlenecks | P6, P7, |
Function | __nv_viscosity_kernel_module_viscosity_kernel__F1L50_1_ |
Source | viscosity_kernel.f90-pp.f90:56-89 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 35.00 |
CQA cycles if no scalar integer | 34.50 |
CQA cycles if FP arith vectorized | 35.00 |
CQA cycles if fully vectorized | 34.63 |
Front-end cycles | 12.00 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 2.00 |
P2 cycles | 2.00 |
P3 cycles | 2.00 |
P4 cycles | 2.00 |
P5 cycles | 35.00 |
P6 cycles | 35.00 |
P7 cycles | 19.50 |
P8 cycles | 19.50 |
P9 cycles | 8.00 |
P10 cycles | 8.00 |
P11 cycles | 1.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 7.00 - 3.50 |
Inter-iter dependencies cycles | 2 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 96.00 |
Nb uops | 96.00 |
Nb loads | NA |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 5.83 |
Nb FLOP add-sub | 68.00 |
Nb FLOP mul | 68.00 |
Nb FLOP fma | 20.00 |
Nb FLOP div | 24.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 4.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 15.54 |
Bytes prefetched | 0.00 |
Bytes loaded | 512.00 |
Bytes stored | 32.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 94.25 |
Vectorization ratio load | 94.12 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 95.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | 100.00 |
Vectorization ratio other | 80.95 |
Vector-efficiency ratio all | 98.28 |
Vector-efficiency ratio load | 95.59 |
Vector-efficiency ratio store | 100.00 |
Vector-efficiency ratio mul | 100.00 |
Vector-efficiency ratio add_sub | 96.25 |
Vector-efficiency ratio fma | 100.00 |
Vector-efficiency ratio div_sqrt | 100.00 |
Vector-efficiency ratio other | 96.43 |
Path / |
Function | __nv_viscosity_kernel_module_viscosity_kernel__F1L50_1_ |
Source file and lines | viscosity_kernel.f90-pp.f90:56-89 |
Module | exec |
nb instructions | 96 |
loop length | 384 |
nb stack references | 0 |
front end | 12.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 2.00 | 2.00 | 2.00 | 2.00 | 35.00 | 35.00 | 1.00 | 1.00 | 8.00 | 8.00 | 1.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 2.00 | 2.00 | 2.00 | 2.00 | 35.00 | 35.00 | 19.50 | 19.50 | 8.00 | 8.00 | 1.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 7.00-3.50 |
Longest recurrence chain latency (RecMII) | 2.00 |
Front-end | 12.00 |
Data deps. | 2.00 |
Overall L1 | 35.00 |
all | 81% |
load | 94% |
store | 100% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 66% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 42% |
all | 100% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 94% |
load | 94% |
store | 100% |
mul | 100% |
add-sub | 95% |
fma | 100% |
div/sqrt | 100% |
other | 80% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
UBFM X3, X10, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X4, X26, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X6, X28, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LD1D {Z23.D}, P0/Z, [X4, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z24.D}, P0/Z, [X6, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z25.D}, P0/Z, [X4, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z26.D}, P0/Z, [X6, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
ADD X4, X25, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LD1D {Z29.D}, P0/Z, [X4, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z30.D}, P0/Z, [X4, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
ADD X4, X27, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X3, X22, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
FADD Z28.D, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z27.D, Z24.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z23.D, Z23.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z24.D, Z24.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z23.D, Z23.D, Z24.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
ADD Z24.D, Z20.D, Z22.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z27.D, Z27.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
LD1D {Z28.D}, P0/Z, [X4, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z31.D}, P0/Z, [X4, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z10.D}, P0/Z, [X3, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FADD Z8.D, Z30.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
ADD Z22.D, Z22.D, Z6.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMOV X4, D24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
FMUL Z23.D, P0/M, Z23.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
LD1D {Z25.D}, P0/Z, [X18, X4,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z26.D}, P0/Z, [X2, X4,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FADD Z9.D, Z28.D, Z31.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z28.D, Z29.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z29.D, Z30.D, Z31.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z28.D, Z28.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z30.D, Z8.D, Z9.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MOVPRFX Z8, Z27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z8.D, P0/M, Z8.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z28.D, P0/M, Z28.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FSUB Z25.D, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
LD1D {Z26.D}, P0/Z, [X3, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FDIV Z28.D, P0/M, Z28.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FADD Z26.D, Z26.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FDIV Z25.D, P0/M, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
LD1D {Z26.D}, P0/Z, [X19, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FMAD Z23.D, P0/M, Z19.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
LD1D {Z28.D}, P0/Z, [X16, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
ADD X10, X10, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP X15, X10 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
FMUL Z29.D, Z25.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FSUB Z26.D, Z26.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z23.D, Z25.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z28.D, Z18.D, Z30.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z30.D, P0/M, Z30.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FCMGE P2.D, P0/Z, Z25.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
FABS Z25.D, P0/M, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z26.D, Z26.D, Z21.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMAXNM Z25.D, P0/M, Z25.D, Z2.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z8.D, Z8.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMAD Z27.D, P0/M, Z10.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FMUL Z31.D, Z26.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FDIV Z8.D, P0/M, Z8.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FMUL Z30.D, Z30.D, Z31.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FADD Z28.D, Z31.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMAXNM Z28.D, P0/M, Z28.D, Z2.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMAD Z23.D, P0/M, Z26.D, Z8.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FABS Z26.D, P0/M, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMAXNM Z26.D, P0/M, Z26.D, Z2.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMLA Z23.D, P0/M, Z30.D, Z19.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FDIV Z23.D, P0/M, Z23.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FCMLE P1.D, P0/Z, Z23.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
FCMLT P1.D, P1/Z, Z27.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
MOVPRFX Z27, Z25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FNEG Z27.D, P0/M, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
SEL Z25.D, P2, Z25.D, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MOVPRFX Z27, Z26 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FNEG Z27.D, P0/M, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FCMGE P2.D, P0/Z, Z25.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SEL Z26.D, P2, Z26.D, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z27.D, Z25.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMLA Z27.D, P0/M, Z26.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FSQRT Z27.D, P0/M, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-16 | 1-0.50 |
FMUL Z28.D, Z27.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z27.D, Z27.D, Z18.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FDIVR Z25.D, P0/M, Z25.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FDIVR Z26.D, P0/M, Z26.D, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FABS Z25.D, P0/M, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FABS Z26.D, P0/M, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMINNM Z25.D, P0/M, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
LD1D {Z26.D}, P1/Z, [X12, Z5.D,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FMUL Z23.D, Z25.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
ADR Z26.D, [Z26, Z24.D,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z23.D, Z23.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
LD1D {Z26.D}, P1/Z, [V26.D] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0.33 | 0.33 | 0.33 | 0 | 0 | 9 | 2 |
FADD Z25.D, Z26.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z23.D, Z23.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
SEL Z23.D, P1, Z23.D, Z5.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
ST1D {Z23.D}, P0, [X17, Z24.D,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 |
B.NE 44ce80 <__nv_viscosity_kernel_module_viscosity_kernel__F1L50_1_+0x2f0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Function | __nv_viscosity_kernel_module_viscosity_kernel__F1L50_1_ |
Source file and lines | viscosity_kernel.f90-pp.f90:56-89 |
Module | exec |
nb instructions | 96 |
loop length | 384 |
nb stack references | 0 |
front end | 12.00 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 2.00 | 2.00 | 2.00 | 2.00 | 35.00 | 35.00 | 1.00 | 1.00 | 8.00 | 8.00 | 1.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 2.00 | 2.00 | 2.00 | 2.00 | 35.00 | 35.00 | 19.50 | 19.50 | 8.00 | 8.00 | 1.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | 7.00-3.50 |
Longest recurrence chain latency (RecMII) | 2.00 |
Front-end | 12.00 |
Data deps. | 2.00 |
Overall L1 | 35.00 |
all | 81% |
load | 94% |
store | 100% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 66% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 42% |
all | 100% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | 100% |
add-sub | 100% |
fma | 100% |
div/sqrt | 100% |
other | 100% |
all | 94% |
load | 94% |
store | 100% |
mul | 100% |
add-sub | 95% |
fma | 100% |
div/sqrt | 100% |
other | 80% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
UBFM X3, X10, #61, #60 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X4, X26, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X6, X28, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LD1D {Z23.D}, P0/Z, [X4, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z24.D}, P0/Z, [X6, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z25.D}, P0/Z, [X4, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z26.D}, P0/Z, [X6, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
ADD X4, X25, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LD1D {Z29.D}, P0/Z, [X4, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z30.D}, P0/Z, [X4, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
ADD X4, X27, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X3, X22, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
FADD Z28.D, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z27.D, Z24.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z23.D, Z23.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z24.D, Z24.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z23.D, Z23.D, Z24.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
ADD Z24.D, Z20.D, Z22.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z27.D, Z27.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
LD1D {Z28.D}, P0/Z, [X4, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z31.D}, P0/Z, [X4, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z10.D}, P0/Z, [X3, MUL VL] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FADD Z8.D, Z30.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
ADD Z22.D, Z22.D, Z6.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMOV X4, D24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
FMUL Z23.D, P0/M, Z23.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
LD1D {Z25.D}, P0/Z, [X18, X4,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1D {Z26.D}, P0/Z, [X2, X4,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FADD Z9.D, Z28.D, Z31.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z28.D, Z29.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FADD Z29.D, Z30.D, Z31.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z28.D, Z28.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FSUB Z30.D, Z8.D, Z9.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MOVPRFX Z8, Z27 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z8.D, P0/M, Z8.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z28.D, P0/M, Z28.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FSUB Z25.D, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
LD1D {Z26.D}, P0/Z, [X3, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FDIV Z28.D, P0/M, Z28.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FADD Z26.D, Z26.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FDIV Z25.D, P0/M, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
LD1D {Z26.D}, P0/Z, [X19, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FMAD Z23.D, P0/M, Z19.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
LD1D {Z28.D}, P0/Z, [X16, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
ADD X10, X10, X13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP X15, X10 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
FMUL Z29.D, Z25.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FSUB Z26.D, Z26.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z23.D, Z25.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z28.D, Z18.D, Z30.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z30.D, P0/M, Z30.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FCMGE P2.D, P0/Z, Z25.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
FABS Z25.D, P0/M, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z26.D, Z26.D, Z21.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMAXNM Z25.D, P0/M, Z25.D, Z2.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z8.D, Z8.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMAD Z27.D, P0/M, Z10.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FMUL Z31.D, Z26.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FDIV Z8.D, P0/M, Z8.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FMUL Z30.D, Z30.D, Z31.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FADD Z28.D, Z31.D, Z29.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMAXNM Z28.D, P0/M, Z28.D, Z2.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMAD Z23.D, P0/M, Z26.D, Z8.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FABS Z26.D, P0/M, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMAXNM Z26.D, P0/M, Z26.D, Z2.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMLA Z23.D, P0/M, Z30.D, Z19.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FDIV Z23.D, P0/M, Z23.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FCMLE P1.D, P0/Z, Z23.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
FCMLT P1.D, P1/Z, Z27.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
MOVPRFX Z27, Z25 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FNEG Z27.D, P0/M, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
SEL Z25.D, P2, Z25.D, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MOVPRFX Z27, Z26 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FNEG Z27.D, P0/M, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FCMGE P2.D, P0/Z, Z25.D, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SEL Z26.D, P2, Z26.D, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z27.D, Z25.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMLA Z27.D, P0/M, Z26.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 |
FSQRT Z27.D, P0/M, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-16 | 1-0.50 |
FMUL Z28.D, Z27.D, Z10.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FMUL Z27.D, Z27.D, Z18.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
FDIVR Z25.D, P0/M, Z25.D, Z28.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FDIVR Z26.D, P0/M, Z26.D, Z27.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7-15 | 1-0.50 |
FABS Z25.D, P0/M, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FABS Z26.D, P0/M, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMINNM Z25.D, P0/M, Z25.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
LD1D {Z26.D}, P1/Z, [X12, Z5.D,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
FMUL Z23.D, Z25.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
ADR Z26.D, [Z26, Z24.D,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z23.D, Z23.D, Z23.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
LD1D {Z26.D}, P1/Z, [V26.D] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0.33 | 0.33 | 0.33 | 0 | 0 | 9 | 2 |
FADD Z25.D, Z26.D, Z26.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
FMUL Z23.D, Z23.D, Z25.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0.50 |
SEL Z23.D, P1, Z23.D, Z5.D | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
ST1D {Z23.D}, P0, [X17, Z24.D,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 |
B.NE 44ce80 <__nv_viscosity_kernel_module_viscosity_kernel__F1L50_1_+0x2f0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |