Loop Id: 261 | Module: exec | Source: advec_mom.cpp:182-211 [...] | Coverage: 0.02% |
---|
Loop Id: 261 | Module: exec | Source: advec_mom.cpp:182-211 [...] | Coverage: 0.02% |
---|
0x421d6c LDR X6, [X11] |
0x421d70 SBFM X9, X3, #0, #31 |
0x421d74 INDEX Z16.S, W3, #1 |
0x421d78 DUP Z17.S, W24 |
0x421d7c MOVZ X0, #0 |
0x421d80 DUP Z23.S, W22 |
0x421d84 LDR X3, [X13] |
0x421d88 ORR X8, XZR, X26 |
0x421d8c WHILELO P2.D, XZR, X26 |
0x421d90 UQDECD X8, ALL |
0x421d94 SUNPKLO Z20.D, Z23 |
0x421d98 WHILELO P1.D, XZR, X8 |
0x421d9c LDR X7, [X11, #16] |
0x421da0 SUNPKHI Z19.D, Z23 |
0x421da4 MOVPRFX Z25, Z17 |
0x421da8 SUB Z25.S, Z25.S, #1 |
0x421dac MADD X23, X18, X6, X9 |
0x421db0 DUP Z7.D, X18 |
0x421db4 LD1RD {Z18.D}, P0/Z, [X12] |
0x421db8 LDR X2, [X13, #16] |
0x421dbc LD1RD {Z6.D}, P0/Z, [X14] |
0x421dc0 LD1RD {Z24.D}, P0/Z, [X25] |
0x421dc4 MADD X1, X18, X3, X9 |
0x421dc8 LDR X4, [X17, #8] |
0x421dcc ADD X24, X7, X23,LSL #3 |
0x421dd0 LDR X6, [X12, #16] |
0x421dd4 ADD X16, X2, X1,LSL #3 |
0x421dd8 ADDVL X5, X24, #1 |
0x421ddc LDR X23, [X14, #16] |
0x421de0 ADD X9, X4, X18,LSL #3 |
0x421de4 ADDVL X7, X16, #1 |
(259) 0x421de8 LD1D {Z28.D}, P2/Z, [X16, X0,LSL #3] |
(259) 0x421dec LD1D {Z5.D}, P1/Z, [X7, X0,LSL #3] |
(259) 0x421df0 SUNPKLO Z9.D, Z16 |
(259) 0x421df4 SUNPKHI Z8.D, Z16 |
(259) 0x421df8 FCMGE P4.D, P0/Z, Z28.D, #0 |
(259) 0x421dfc FCMGE P5.D, P0/Z, Z5.D, #0 |
(259) 0x421e00 SEL Z1.D, P4, Z7.D, Z20.D |
(259) 0x421e04 UZP1 P3.S, P4.S, P5.S |
(259) 0x421e08 MOVPRFX Z3, Z9 |
(259) 0x421e0c MLA Z3.D, P0/M, Z6.D, Z1.D |
(259) 0x421e10 EOR P6.B, P0/Z, P3.B, P0.B |
(259) 0x421e14 MAD Z1.D, P0/M, Z18.D, Z9.D |
(259) 0x421e18 SEL Z0.S, P6, Z17.S, Z25.S |
(259) 0x421e1c LD1D {Z30.D}, P2/Z, [X6, Z1.D,LSL #3] |
(259) 0x421e20 ADD Z0.S, P6/M, Z0.S, Z27.S |
(259) 0x421e24 SUNPKHI Z2.D, Z0 |
(259) 0x421e28 MAD Z2.D, P0/M, Z6.D, Z8.D |
(259) 0x421e2c LD1D {Z3.D}, P2/Z, [X23, Z3.D,LSL #3] |
(259) 0x421e30 FCMLT P7.D, P0/Z, Z28.D, #0 |
(259) 0x421e34 SUNPKLO Z29.D, Z0 |
(259) 0x421e38 MOVPRFX Z4, Z28 |
(259) 0x421e3c FABS Z4.D, P0/M, Z28.D |
(259) 0x421e40 LD1D {Z0.D}, P1/Z, [X23, Z2.D,LSL #3] |
(259) 0x421e44 FDIV Z4.D, P0/M, Z4.D, Z30.D |
(259) 0x421e48 MAD Z29.D, P0/M, Z6.D, Z9.D |
(259) 0x421e4c SEL Z1.D, P7, Z7.D, Z20.D |
(259) 0x421e50 LD1D {Z29.D}, P2/Z, [X23, Z29.D,LSL #3] |
(259) 0x421e54 MAD Z1.D, P0/M, Z6.D, Z9.D |
(259) 0x421e58 FSUB Z10.D, Z3.D, Z29.D |
(259) 0x421e5c LD1D {Z2.D}, P2/Z, [X23, Z1.D,LSL #3] |
(259) 0x421e60 FABD Z29.D, P0/M, Z29.D, Z3.D |
(259) 0x421e64 FSUB Z30.D, Z2.D, Z3.D |
(259) 0x421e68 SEL Z31.S, P3, Z17.S, Z23.S |
(259) 0x421e6c FMUL Z10.D, Z10.D, Z30.D |
(259) 0x421e70 ADD Z31.S, P3/M, Z31.S, Z26.S |
(259) 0x421e74 SUNPKLO Z9.D, Z31 |
(259) 0x421e78 FCMGT P3.D, P2/Z, Z10.D, #0 |
(259) 0x421e7c FABD Z2.D, P0/M, Z2.D, Z3.D |
(259) 0x421e80 FCMGT P4.D, P0/Z, Z30.D, #0 |
(259) 0x421e84 MOVPRFX Z1, Z4 |
(259) 0x421e88 FADD Z1.D, P0/M, Z1.D, #1 |
(259) 0x421e8c LD1D {Z30.D}, P3/Z, [X4, Z9.D,LSL #3] |
(259) 0x421e90 FCMGT P6.D, P4/Z, Z10.D, #0 |
(259) 0x421e94 MOVPRFX Z9, Z29 |
(259) 0x421e98 FMINNM Z9.D, P0/M, Z9.D, Z2.D |
(259) 0x421e9c FMUL Z1.D, Z1.D, Z29.D |
(259) 0x421ea0 MOVPRFX Z10, Z22 |
(259) 0x421ea4 FCPY Z10.D, P6/M, #1.0000000 |
(259) 0x421ea8 FSUB Z29.D, Z21.D, Z4.D |
(259) 0x421eac FDIV Z1.D, P0/M, Z1.D, Z30.D |
(259) 0x421eb0 FMUL Z2.D, Z29.D, Z2.D |
(259) 0x421eb4 LD1RD {Z30.D}, P0/Z, [X9] |
(259) 0x421eb8 FSUBR Z4.D, P0/M, Z4.D, #1 |
(259) 0x421ebc FDIV Z2.D, P0/M, Z2.D, Z30.D |
(259) 0x421ec0 FMUL Z29.D, Z30.D, Z24.D |
(259) 0x421ec4 FMUL Z4.D, Z4.D, Z10.D |
(259) 0x421ec8 FADD Z1.D, Z1.D, Z2.D |
(259) 0x421ecc SEL Z10.D, P5, Z7.D, Z19.D |
(259) 0x421ed0 FMUL Z1.D, Z1.D, Z29.D |
(259) 0x421ed4 FMINNM Z1.D, P0/M, Z1.D, Z9.D |
(259) 0x421ed8 MOVPRFX Z9, Z8 |
(259) 0x421edc MLA Z9.D, P0/M, Z6.D, Z10.D |
(259) 0x421ee0 FMLA Z3.D, P3/M, Z4.D, Z1.D |
(259) 0x421ee4 LD1D {Z2.D}, P1/Z, [X23, Z9.D,LSL #3] |
(259) 0x421ee8 FMUL Z3.D, Z28.D, Z3.D |
(259) 0x421eec FSUB Z1.D, Z2.D, Z0.D |
(259) 0x421ef0 MAD Z10.D, P0/M, Z18.D, Z8.D |
(259) 0x421ef4 FCMLT P8.D, P0/Z, Z5.D, #0 |
(259) 0x421ef8 LD1D {Z10.D}, P1/Z, [X6, Z10.D,LSL #3] |
(259) 0x421efc SEL Z28.D, P8, Z7.D, Z19.D |
(259) 0x421f00 MAD Z28.D, P0/M, Z6.D, Z8.D |
(259) 0x421f04 LD1D {Z28.D}, P1/Z, [X23, Z28.D,LSL #3] |
(259) 0x421f08 FSUB Z8.D, Z28.D, Z2.D |
(259) 0x421f0c FMUL Z9.D, Z1.D, Z8.D |
(259) 0x421f10 FABD Z0.D, P0/M, Z0.D, Z2.D |
(259) 0x421f14 FCMGT P7.D, P1/Z, Z9.D, #0 |
(259) 0x421f18 FABD Z28.D, P0/M, Z28.D, Z2.D |
(259) 0x421f1c MOVPRFX Z4, Z5 |
(259) 0x421f20 FABS Z4.D, P0/M, Z5.D |
(259) 0x421f24 FCMGT P5.D, P0/Z, Z8.D, #0 |
(259) 0x421f28 FDIV Z4.D, P0/M, Z4.D, Z10.D |
(259) 0x421f2c SUNPKHI Z31.D, Z31 |
(259) 0x421f30 MOVPRFX Z10, Z0 |
(259) 0x421f34 FMINNM Z10.D, P0/M, Z10.D, Z28.D |
(259) 0x421f38 LD1D {Z31.D}, P7/Z, [X4, Z31.D,LSL #3] |
(259) 0x421f3c MOVPRFX Z1, Z4 |
(259) 0x421f40 FADD Z1.D, P0/M, Z1.D, #1 |
(259) 0x421f44 FSUB Z8.D, Z21.D, Z4.D |
(259) 0x421f48 FCMGT P4.D, P5/Z, Z9.D, #0 |
(259) 0x421f4c FMUL Z0.D, Z1.D, Z0.D |
(259) 0x421f50 FMUL Z28.D, Z8.D, Z28.D |
(259) 0x421f54 FDIV Z0.D, P0/M, Z0.D, Z31.D |
(259) 0x421f58 FDIV Z28.D, P0/M, Z28.D, Z30.D |
(259) 0x421f5c FSUBR Z4.D, P0/M, Z4.D, #1 |
(259) 0x421f60 FADD Z30.D, Z0.D, Z28.D |
(259) 0x421f64 MOVPRFX Z9, Z22 |
(259) 0x421f68 FCPY Z9.D, P4/M, #1.0000000 |
(259) 0x421f6c FMUL Z29.D, Z30.D, Z29.D |
(259) 0x421f70 FMUL Z4.D, Z4.D, Z9.D |
(259) 0x421f74 FMINNM Z29.D, P0/M, Z29.D, Z10.D |
(259) 0x421f78 FMLA Z2.D, P7/M, Z29.D, Z4.D |
(259) 0x421f7c FMUL Z5.D, Z5.D, Z2.D |
(259) 0x421f80 ST1D {Z3.D}, P2, [X24, X0,LSL #3] |
(259) 0x421f84 ST1D {Z5.D}, P1, [X5, X0,LSL #3] |
(259) 0x421f88 ADD X0, X0, X10 |
(259) 0x421f8c WHILELO P1.D, X0, X8 |
(259) 0x421f90 INCW Z16.S, ALL |
(259) 0x421f94 WHILELO P2.D, X0, X26 |
(259) 0x421f98 B.NE 421de8 |
0x421f9c ADD X18, X18, #1 |
0x421fa0 CMP W19, W22 |
0x421fa4 B.LE 421fe4 |
0x421fa8 SUB W2, W15, W30 |
0x421fac ORR W26, WZR, W21 |
0x421fb0 ORR W8, WZR, W30 |
0x421fb4 ORR W24, WZR, W22 |
0x421fb8 CMP W2, W26 |
0x421fbc CSEL X26, X2, X26, #9 |
0x421fc0 ADD W30, W8, W26 |
0x421fc4 ORR W3, WZR, W20 |
0x421fc8 ADD W22, W24, #1 |
0x421fcc CMP W8, W30 |
0x421fd0 B.CC 421d6c |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_mom.cpp: 182 - 211 |
-------------------------------------------------------------------------------- |
182: for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) |
183: ({ |
184: int upwind, donor, downwind, dif; |
185: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
186: if (node_flux(i, j) < 0.0) { |
[...] |
197: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(i, donor)); |
198: width = celldy[j]; |
199: vdiffuw = vel1(i, donor) - vel1(i, upwind); |
200: vdiffdw = vel1(i, downwind) - vel1(i, donor); |
201: limiter = 0.0; |
202: if (vdiffuw * vdiffdw > 0.0) { |
203: auw = std::fabs(vdiffuw); |
204: adw = std::fabs(vdiffdw); |
205: wind = 1.0; |
206: if (vdiffdw <= 0.0) wind = -1.0; |
207: limiter = |
208: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldy[dif]) / 6.0, auw), adw); |
209: } |
210: advec_vel_s = vel1(i, donor) + (1.0 - sigma) * limiter; |
211: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►97.92+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.33 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 2.22 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.67 |
Bottlenecks | P4, |
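Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.10] |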
Source | advec_mom.cpp:182-182,context.h:46-46,context.h:69-69 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 10.00 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 10.00 |
CQA cycles if fully vectorized | 4.50 |
Front-end cycles | 5.63 |
DIV/SQRT cycles | 1.00 |
P0 cycles | 1.00 |
P1 cycles | 6.00 |
P2 cycles | 6.00 |
P3 cycles | 10.00 |
P4 cycles | 6.00 |
P5 cycles | 2.50 |
P6 cycles | 2.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 3.50 |
P10 cycles | 3.17 |
P11 cycles | 3.33 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 45.00 |
Nb uops | 45.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 3.85 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 11.11 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 32.21 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 30.56 |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 36.46 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 3.33 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 2.22 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.67 |
Bottlenecks | P4, |
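Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.10] |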
Source | advec_mom.cpp:182-182,context.h:46-46,context.h:69-69 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 10.00 |
CQA cycles if no scalar integer | 3.00 |
CQA cycles if FP arith vectorized | 10.00 |
CQA cycles if fully vectorized | 4.50 |
Front-end cycles | 5.63 |
DIV/SQRT cycles | 1.00 |
P0 cycles | 1.00 |
P1 cycles | 6.00 |
P2 cycles | 6.00 |
P3 cycles | 10.00 |
P4 cycles | 6.00 |
P5 cycles | 2.50 |
P6 cycles | 2.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 3.50 |
P10 cycles | 3.17 |
P11 cycles | 3.33 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 45.00 |
Nb uops | 45.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 3.85 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 11.11 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 32.21 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 30.56 |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 36.46 |
Path / |
nb instructions | 45 |
loop length | 180 |
nb stack references | 0 |
front end | 5.63 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.00 | 1.00 | 6.00 | 6.00 | 10.00 | 6.00 | 2.50 | 2.50 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
cycles | 1.00 | 1.00 | 6.00 | 6.00 | 10.00 | 6.00 | 2.50 | 2.50 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 5.63 |
Overall L1 | 10.00 |
all | 3% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 11% |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LDR X6, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SBFM X9, X3, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
INDEX Z16.S, W3, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
DUP Z17.S, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
DUP Z23.S, W22 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LDR X3, [X13] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ORR X8, XZR, X26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
WHILELO P2.D, XZR, X26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
UQDECD X8, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SUNPKLO Z20.D, Z23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
WHILELO P1.D, XZR, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
LDR X7, [X11, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SUNPKHI Z19.D, Z23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MOVPRFX Z25, Z17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
SUB Z25.S, Z25.S, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MADD X23, X18, X6, X9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
DUP Z7.D, X18 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LD1RD {Z18.D}, P0/Z, [X12] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LDR X2, [X13, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LD1RD {Z6.D}, P0/Z, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1RD {Z24.D}, P0/Z, [X25] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
MADD X1, X18, X3, X9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X4, [X17, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X24, X7, X23,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X6, [X12, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X16, X2, X1,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X5, X24, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X23, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X9, X4, X18,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X7, X16, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD X18, X18, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W19, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.LE 421fe4 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x360> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W2, W15, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W26, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W8, WZR, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W24, WZR, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W2, W26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
CSEL X26, X2, X26, #9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W30, W8, W26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W3, WZR, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W22, W24, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W8, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 421d6c <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0xe8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
nb instructions | 45 |
loop length | 180 |
nb stack references | 0 |
front end | 5.63 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.00 | 1.00 | 6.00 | 6.00 | 10.00 | 6.00 | 2.50 | 2.50 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
cycles | 1.00 | 1.00 | 6.00 | 6.00 | 10.00 | 6.00 | 2.50 | 2.50 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 5.63 |
Overall L1 | 10.00 |
all | 3% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 11% |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LDR X6, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SBFM X9, X3, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
INDEX Z16.S, W3, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
DUP Z17.S, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
DUP Z23.S, W22 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LDR X3, [X13] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ORR X8, XZR, X26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
WHILELO P2.D, XZR, X26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
UQDECD X8, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
SUNPKLO Z20.D, Z23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
WHILELO P1.D, XZR, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
LDR X7, [X11, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SUNPKHI Z19.D, Z23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MOVPRFX Z25, Z17 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
SUB Z25.S, Z25.S, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 |
MADD X23, X18, X6, X9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
DUP Z7.D, X18 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 |
LD1RD {Z18.D}, P0/Z, [X12] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LDR X2, [X13, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LD1RD {Z6.D}, P0/Z, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
LD1RD {Z24.D}, P0/Z, [X25] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
MADD X1, X18, X3, X9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X4, [X17, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X24, X7, X23,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X6, [X12, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X16, X2, X1,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X5, X24, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X23, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X9, X4, X18,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X7, X16, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD X18, X18, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W19, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.LE 421fe4 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x360> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W2, W15, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W26, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W8, WZR, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W24, WZR, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W2, W26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
CSEL X26, X2, X26, #9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W30, W8, W26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W3, WZR, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W22, W24, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W8, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 421d6c <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0xe8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |