Loop Id: 256 | Module: exec | Source: advec_mom.cpp:110-139 [...] | Coverage: 0.02% |
---|
Loop Id: 256 | Module: exec | Source: advec_mom.cpp:110-139 [...] | Coverage: 0.02% |
---|
0x4219e0 LDR X7, [X11] |
0x4219e4 SBFM X30, X16, #0, #31 |
0x4219e8 INDEX Z5.S, W16, #1 |
0x4219ec MOVZ X0, #0 |
0x4219f0 ORR X23, XZR, X2 |
0x4219f4 WHILELO P2.D, XZR, X2 |
0x4219f8 LDR X24, [X12] |
0x4219fc UQDECD X23, ALL |
0x421a00 LD1RD {Z6.D}, P0/Z, [X26] |
0x421a04 WHILELO P1.D, XZR, X23 |
0x421a08 LDR X8, [X11, #16] |
0x421a0c MADD X25, X9, X7, X30 |
0x421a10 LDR X1, [X12, #16] |
0x421a14 MADD X5, X9, X24, XZR |
0x421a18 LDR X6, [X13] |
0x421a1c ADD X3, X8, X25,LSL #3 |
0x421a20 LDR X24, [X14] |
0x421a24 ADD X16, X1, X5,LSL #3 |
0x421a28 ADDVL X4, X3, #1 |
0x421a2c LDR X5, [X18, #8] |
0x421a30 MADD X7, X9, X6, X30 |
0x421a34 LDR X8, [X13, #16] |
0x421a38 MADD X1, X9, X24, XZR |
0x421a3c LDR X25, [X14, #16] |
0x421a40 ADD X30, X5, X30,LSL #3 |
0x421a44 ADD X6, X8, X7,LSL #3 |
0x421a48 ADDVL X24, X30, #1 |
0x421a4c ADD X7, X25, X1,LSL #3 |
0x421a50 ADDVL X8, X6, #1 |
0x421a54 HINT #0 |
0x421a58 HINT #0 |
0x421a5c HINT #0 |
(255) 0x421a60 LD1D {Z23.D}, P2/Z, [X6, X0,LSL #3] |
(255) 0x421a64 LD1D {Z4.D}, P1/Z, [X8, X0,LSL #3] |
(255) 0x421a68 FCMGE P3.D, P0/Z, Z23.D, #0 |
(255) 0x421a6c FCMGE P5.D, P0/Z, Z4.D, #0 |
(255) 0x421a70 MOVPRFX Z20, Z5 |
(255) 0x421a74 SUB Z20.S, Z20.S, #1 |
(255) 0x421a78 UZP1 P6.S, P3.S, P5.S |
(255) 0x421a7c EOR P4.B, P0/Z, P6.B, P0.B |
(255) 0x421a80 MOVPRFX Z20.S, P4/M, Z5.S |
(255) 0x421a84 ADD Z20.S, P4/M, Z20.S, Z18.S |
(255) 0x421a88 MOVPRFX Z24, Z5 |
(255) 0x421a8c ADD Z24.S, Z24.S, #1 |
(255) 0x421a90 SUNPKLO Z2.D, Z5 |
(255) 0x421a94 SUNPKLO Z3.D, Z24 |
(255) 0x421a98 SUNPKLO Z19.D, Z20 |
(255) 0x421a9c SEL Z22.D, P3, Z2.D, Z3.D |
(255) 0x421aa0 LD1D {Z27.D}, P2/Z, [X7, Z19.D,LSL #3] |
(255) 0x421aa4 LD1D {Z21.D}, P2/Z, [X7, Z22.D,LSL #3] |
(255) 0x421aa8 LD1D {Z26.D}, P2/Z, [X16, Z22.D,LSL #3] |
(255) 0x421aac FSUB Z29.D, Z21.D, Z27.D |
(255) 0x421ab0 SUNPKHI Z0.D, Z20 |
(255) 0x421ab4 FCMLT P7.D, P0/Z, Z23.D, #0 |
(255) 0x421ab8 SEL Z25.D, P7, Z2.D, Z3.D |
(255) 0x421abc LD1D {Z30.D}, P2/Z, [X7, Z25.D,LSL #3] |
(255) 0x421ac0 FSUB Z20.D, Z30.D, Z21.D |
(255) 0x421ac4 LD1D {Z28.D}, P1/Z, [X7, Z0.D,LSL #3] |
(255) 0x421ac8 FMUL Z3.D, Z20.D, Z29.D |
(255) 0x421acc SEL Z19.S, P6, Z5.S, Z24.S |
(255) 0x421ad0 FCMGT P3.D, P2/Z, Z3.D, #0 |
(255) 0x421ad4 ADD Z19.S, P6/M, Z19.S, Z17.S |
(255) 0x421ad8 SUNPKHI Z1.D, Z5 |
(255) 0x421adc FABD Z27.D, P0/M, Z27.D, Z21.D |
(255) 0x421ae0 FABD Z30.D, P0/M, Z30.D, Z21.D |
(255) 0x421ae4 SUNPKLO Z0.D, Z19 |
(255) 0x421ae8 MOVPRFX Z31, Z23 |
(255) 0x421aec FABS Z31.D, P0/M, Z23.D |
(255) 0x421af0 LD1D {Z2.D}, P3/Z, [X5, Z0.D,LSL #3] |
(255) 0x421af4 FDIV Z31.D, P0/M, Z31.D, Z26.D |
(255) 0x421af8 FCMGT P4.D, P0/Z, Z20.D, #0 |
(255) 0x421afc MOVPRFX Z26, Z27 |
(255) 0x421b00 FMINNM Z26.D, P0/M, Z26.D, Z30.D |
(255) 0x421b04 LD1D {Z20.D}, P2/Z, [X30, X0,LSL #3] |
(255) 0x421b08 MOVPRFX Z25, Z31 |
(255) 0x421b0c FADD Z25.D, P0/M, Z25.D, #1 |
(255) 0x421b10 FSUB Z29.D, Z7.D, Z31.D |
(255) 0x421b14 FMUL Z27.D, Z25.D, Z27.D |
(255) 0x421b18 FMUL Z30.D, Z29.D, Z30.D |
(255) 0x421b1c FDIV Z27.D, P0/M, Z27.D, Z2.D |
(255) 0x421b20 FDIV Z30.D, P0/M, Z30.D, Z20.D |
(255) 0x421b24 SUNPKHI Z24.D, Z24 |
(255) 0x421b28 FADD Z0.D, Z27.D, Z30.D |
(255) 0x421b2c SEL Z25.D, P5, Z1.D, Z24.D |
(255) 0x421b30 SUNPKHI Z22.D, Z19 |
(255) 0x421b34 LD1D {Z30.D}, P1/Z, [X7, Z25.D,LSL #3] |
(255) 0x421b38 FMUL Z19.D, Z20.D, Z6.D |
(255) 0x421b3c FCMLT P8.D, P0/Z, Z4.D, #0 |
(255) 0x421b40 LD1D {Z20.D}, P1/Z, [X16, Z25.D,LSL #3] |
(255) 0x421b44 FMUL Z27.D, Z0.D, Z19.D |
(255) 0x421b48 FCMGT P6.D, P4/Z, Z3.D, #0 |
(255) 0x421b4c FMINNM Z27.D, P0/M, Z27.D, Z26.D |
(255) 0x421b50 MOVPRFX Z3, Z16 |
(255) 0x421b54 FCPY Z3.D, P6/M, #1.0000000 |
(255) 0x421b58 FSUBR Z31.D, P0/M, Z31.D, #1 |
(255) 0x421b5c FMUL Z31.D, Z31.D, Z3.D |
(255) 0x421b60 MOVPRFX Z2, Z21 |
(255) 0x421b64 FMLA Z2.D, P3/M, Z31.D, Z27.D |
(255) 0x421b68 SEL Z21.D, P8, Z1.D, Z24.D |
(255) 0x421b6c FMUL Z29.D, Z23.D, Z2.D |
(255) 0x421b70 LD1D {Z1.D}, P1/Z, [X7, Z21.D,LSL #3] |
(255) 0x421b74 FSUB Z23.D, Z30.D, Z28.D |
(255) 0x421b78 FSUB Z26.D, Z1.D, Z30.D |
(255) 0x421b7c FABD Z1.D, P0/M, Z1.D, Z30.D |
(255) 0x421b80 LD1D {Z24.D}, P1/Z, [X24, X0,LSL #3] |
(255) 0x421b84 FMUL Z3.D, Z23.D, Z26.D |
(255) 0x421b88 FABD Z28.D, P0/M, Z28.D, Z30.D |
(255) 0x421b8c FCMGT P5.D, P1/Z, Z3.D, #0 |
(255) 0x421b90 MOVPRFX Z31, Z28 |
(255) 0x421b94 FMINNM Z31.D, P0/M, Z31.D, Z1.D |
(255) 0x421b98 LD1D {Z22.D}, P5/Z, [X5, Z22.D,LSL #3] |
(255) 0x421b9c MOVPRFX Z19, Z4 |
(255) 0x421ba0 FABS Z19.D, P0/M, Z4.D |
(255) 0x421ba4 FCMGT P7.D, P0/Z, Z26.D, #0 |
(255) 0x421ba8 FDIV Z19.D, P0/M, Z19.D, Z20.D |
(255) 0x421bac FMUL Z21.D, Z24.D, Z6.D |
(255) 0x421bb0 MOVPRFX Z0, Z19 |
(255) 0x421bb4 FADD Z0.D, P0/M, Z0.D, #1 |
(255) 0x421bb8 FSUB Z27.D, Z7.D, Z19.D |
(255) 0x421bbc FMUL Z28.D, Z0.D, Z28.D |
(255) 0x421bc0 FMUL Z25.D, Z27.D, Z1.D |
(255) 0x421bc4 FDIV Z28.D, P0/M, Z28.D, Z22.D |
(255) 0x421bc8 FDIV Z25.D, P0/M, Z25.D, Z24.D |
(255) 0x421bcc FSUBR Z19.D, P0/M, Z19.D, #1 |
(255) 0x421bd0 FADD Z2.D, Z28.D, Z25.D |
(255) 0x421bd4 FCMGT P3.D, P7/Z, Z3.D, #0 |
(255) 0x421bd8 FMUL Z20.D, Z2.D, Z21.D |
(255) 0x421bdc MOVPRFX Z1, Z16 |
(255) 0x421be0 FCPY Z1.D, P3/M, #1.0000000 |
(255) 0x421be4 FMINNM Z20.D, P0/M, Z20.D, Z31.D |
(255) 0x421be8 FMUL Z23.D, Z19.D, Z1.D |
(255) 0x421bec MOVPRFX Z26, Z30 |
(255) 0x421bf0 FMLA Z26.D, P5/M, Z23.D, Z20.D |
(255) 0x421bf4 FMUL Z4.D, Z4.D, Z26.D |
(255) 0x421bf8 ST1D {Z29.D}, P2, [X3, X0,LSL #3] |
(255) 0x421bfc ST1D {Z4.D}, P1, [X4, X0,LSL #3] |
(255) 0x421c00 ADD X0, X0, X10 |
(255) 0x421c04 WHILELO P1.D, X0, X23 |
(255) 0x421c08 INCW Z5.S, ALL |
(255) 0x421c0c WHILELO P2.D, X0, X2 |
(255) 0x421c10 B.NE 421a60 |
0x421c14 ADD X9, X9, #1 |
0x421c18 ADD W2, W17, W9 |
0x421c1c CMP W20, W2 |
0x421c20 B.LE 421c60 |
0x421c24 SUB W3, W15, W19 |
0x421c28 ORR W8, WZR, W19 |
0x421c2c ORR W19, WZR, W22 |
0x421c30 ORR W16, WZR, W21 |
0x421c34 CMP W3, W19 |
0x421c38 CSEL X2, X3, X19, #9 |
0x421c3c ADD W19, W8, W2 |
0x421c40 CMP W8, W19 |
0x421c44 B.CC 4219e0 |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_mom.cpp: 110 - 139 |
-------------------------------------------------------------------------------- |
110: for (int i = (x_min - 1 + 1); i < (x_max + 1 + 2); i++) |
111: ({ |
112: int upwind, donor, downwind, dif; |
113: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
114: if (node_flux(i, j) < 0.0) { |
[...] |
120: upwind = i - 1; |
121: donor = i; |
122: downwind = i + 1; |
123: dif = upwind; |
124: } |
125: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(donor, j)); |
126: width = celldx[i]; |
127: vdiffuw = vel1(donor, j) - vel1(upwind, j); |
128: vdiffdw = vel1(downwind, j) - vel1(donor, j); |
129: limiter = 0.0; |
130: if (vdiffuw * vdiffdw > 0.0) { |
131: auw = std::fabs(vdiffuw); |
132: adw = std::fabs(vdiffdw); |
133: wind = 1.0; |
134: if (vdiffdw <= 0.0) wind = -1.0; |
135: limiter = |
136: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldx[dif]) / 6.0, auw), adw); |
137: } |
138: advec_vel_s = vel1(donor, j) + (1.0 - sigma) * limiter; |
139: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.05+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 10.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.71 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.54 |
Bottlenecks | P4, |
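Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.6] |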
Source | advec_mom.cpp:110-110,advec_mom.cpp:136-136,context.h:46-46,context.h:69-69 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 10.00 |
CQA cycles if no scalar integer | 1.00 |
CQA cycles if FP arith vectorized | 10.00 |
CQA cycles if fully vectorized | 2.13 |
Front-end cycles | 5.25 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 1.00 |
P1 cycles | 1.00 |
P2 cycles | 6.50 |
P3 cycles | 6.25 |
P4 cycles | 10.00 |
P5 cycles | 6.25 |
P6 cycles | 1.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 3.50 |
P11 cycles | 3.17 |
P12 cycles | 3.33 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 45.00 |
Nb uops | 42.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 22.02 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 22.73 |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.50 |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 10.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.71 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.54 |
Bottlenecks | P4, |
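Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.6] |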
Source | advec_mom.cpp:110-110,advec_mom.cpp:136-136,context.h:46-46,context.h:69-69 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 10.00 |
CQA cycles if no scalar integer | 1.00 |
CQA cycles if FP arith vectorized | 10.00 |
CQA cycles if fully vectorized | 2.13 |
Front-end cycles | 5.25 |
DIV/SQRT cycles | 0.00 |
P0 cycles | 1.00 |
P1 cycles | 1.00 |
P2 cycles | 6.50 |
P3 cycles | 6.25 |
P4 cycles | 10.00 |
P5 cycles | 6.25 |
P6 cycles | 1.00 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.00 |
P10 cycles | 3.50 |
P11 cycles | 3.17 |
P12 cycles | 3.33 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 45.00 |
Nb uops | 42.00 |
Nb loads | NA |
Nb stores | 0.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 8.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 80.00 |
Bytes stored | 0.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | 0.00 |
Vectorization ratio store | NA |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | 0.00 |
Vectorization ratio fma | 0.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 0.00 |
Vector-efficiency ratio all | 22.02 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | NA |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | 22.73 |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 17.50 |
Path / |
nb instructions | 45 |
loop length | 180 |
nb stack references | 0 |
front end | 5.25 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.00 | 1.00 | 6.50 | 6.25 | 10.00 | 6.25 | 1.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
cycles | 1.00 | 1.00 | 6.50 | 6.25 | 10.00 | 6.25 | 1.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 5.25 |
Overall L1 | 10.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LDR X7, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SBFM X30, X16, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
INDEX Z5.S, W16, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR X23, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
WHILELO P2.D, XZR, X2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
LDR X24, [X12] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
UQDECD X23, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LD1RD {Z6.D}, P0/Z, [X26] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
WHILELO P1.D, XZR, X23 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
LDR X8, [X11, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X25, X9, X7, X30 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X1, [X12, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X5, X9, X24, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X6, [X13] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X3, X8, X25,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X24, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X16, X1, X5,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X4, X3, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X5, [X18, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X7, X9, X6, X30 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X8, [X13, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X1, X9, X24, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X25, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X30, X5, X30,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X6, X8, X7,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X24, X30, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD X7, X25, X1,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X8, X6, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
ADD X9, X9, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W2, W17, W9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W20, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.LE 421c60 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x35c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W3, W15, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W8, WZR, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W19, WZR, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W16, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W3, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
CSEL X2, X3, X19, #9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W19, W8, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W8, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 4219e0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0xdc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
nb instructions | 45 |
loop length | 180 |
nb stack references | 0 |
front end | 5.25 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 1.00 | 1.00 | 6.50 | 6.25 | 10.00 | 6.25 | 1.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
cycles | 1.00 | 1.00 | 6.50 | 6.25 | 10.00 | 6.25 | 1.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.17 | 3.33 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 5.25 |
Overall L1 | 10.00 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
LDR X7, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SBFM X30, X16, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
INDEX Z5.S, W16, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 |
MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR X23, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
WHILELO P2.D, XZR, X2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
LDR X24, [X12] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
UQDECD X23, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LD1RD {Z6.D}, P0/Z, [X26] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 |
WHILELO P1.D, XZR, X23 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 |
LDR X8, [X11, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X25, X9, X7, X30 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X1, [X12, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X5, X9, X24, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X6, [X13] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X3, X8, X25,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR X24, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X16, X1, X5,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X4, X3, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X5, [X18, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X7, X9, X6, X30 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X8, [X13, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
MADD X1, X9, X24, XZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
LDR X25, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD X30, X5, X30,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD X6, X8, X7,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X24, X30, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD X7, X25, X1,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADDVL X8, X6, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
HINT #0 | ||||||||||||||||||
ADD X9, X9, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W2, W17, W9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W20, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.LE 421c60 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x35c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W3, W15, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W8, WZR, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W19, WZR, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ORR W16, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W3, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
CSEL X2, X3, X19, #9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W19, W8, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W8, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 4219e0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0xdc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |