| Loop Id: 210 | Module: exec | Source: advec_mom.cpp:110-139 [...] | Coverage: 0.01% |
|---|
| Loop Id: 210 | Module: exec | Source: advec_mom.cpp:110-139 [...] | Coverage: 0.01% |
|---|
0x42bd44 LDR X7, [X16] |
0x42bd48 SBFM X8, X3, #0, #31 |
0x42bd4c INDEX Z25.S, W3, #1 |
0x42bd50 MOVZ X0, #0 |
0x42bd54 ORR X22, XZR, X2 |
0x42bd58 WHILELO P5.D, XZR, X2 |
0x42bd5c LDR X3, [X14] |
0x42bd60 UQDECD X22, ALL |
0x42bd64 WHILELO P4.D, XZR, X22 |
0x42bd68 LDR X1, [X17] |
0x42bd6c MADD X7, X11, X7, X8 |
0x42bd70 LDR X4, [X15] |
0x42bd74 MADD X3, X11, X3, X8 |
0x42bd78 LDR X5, [X30, #8] |
0x42bd7c MUL X1, X11, X1 |
0x42bd80 LDR X9, [X14, #16] |
0x42bd84 MUL X4, X11, X4 |
0x42bd88 LDR X10, [X16, #16] |
0x42bd8c ADD X8, X5, X8,LSL #3 |
0x42bd90 LDR X23, [X17, #16] |
0x42bd94 ADD X3, X9, X3,LSL #3 |
0x42bd98 LDR X6, [X15, #16] |
0x42bd9c ADD X7, X10, X7,LSL #3 |
0x42bda0 ADDVL X10, X8, #1 |
0x42bda4 ADD X1, X23, X1,LSL #3 |
0x42bda8 ADDVL X9, X7, #1 |
0x42bdac ADD X4, X6, X4,LSL #3 |
0x42bdb0 ADDVL X6, X3, #1 |
0x42bdb4 HINT #0 |
0x42bdb8 HINT #0 |
0x42bdbc HINT #0 |
(209) 0x42bdc0 LD1D {Z21.D}, P5/Z, [X7, X0,LSL #3] |
(209) 0x42bdc4 MOVPRFX Z19, Z25 |
(209) 0x42bdc8 ADD Z19.S, Z19.S, #1 |
(209) 0x42bdcc FCMLT P15.D, P3/Z, Z21.D, #0.0000000 |
(209) 0x42bdd0 SUNPKLO Z23.D, Z25 |
(209) 0x42bdd4 SUNPKLO Z18.D, Z19 |
(209) 0x42bdd8 SEL Z7.D, P15, Z18.D, Z23.D |
(209) 0x42bddc SEL Z18.D, P15, Z23.D, Z18.D |
(209) 0x42bde0 LD1D {Z15.D}, P5/Z, [X1, Z7.D,LSL #3] |
(209) 0x42bde4 LD1D {Z20.D}, P4/Z, [X9, X0,LSL #3] |
(209) 0x42bde8 LD1D {Z3.D}, P5/Z, [X4, Z7.D,LSL #3] |
(209) 0x42bdec FCMLT P14.D, P3/Z, Z20.D, #0.0000000 |
(209) 0x42bdf0 LD1D {Z7.D}, P5/Z, [X1, Z18.D,LSL #3] |
(209) 0x42bdf4 UZP1 P7.S, P15.S, P14.S |
(209) 0x42bdf8 FSUB Z7.D, P5/M, Z7.D, Z15.D |
(209) 0x42bdfc SUNPKHI Z22.D, Z25 |
(209) 0x42be00 SUNPKHI Z17.D, Z19 |
(209) 0x42be04 MOVPRFX Z16, Z25 |
(209) 0x42be08 SUB Z16.S, Z16.S, #1 |
(209) 0x42be0c SEL Z6.D, P14, Z17.D, Z22.D |
(209) 0x42be10 MOVPRFX Z16.S, P7/M, Z25.S |
(209) 0x42be14 ADD Z16.S, P7/M, Z16.S, Z31.S |
(209) 0x42be18 SEL Z17.D, P14, Z22.D, Z17.D |
(209) 0x42be1c SUNPKLO Z13.D, Z16 |
(209) 0x42be20 SUNPKHI Z16.D, Z16 |
(209) 0x42be24 LD1D {Z12.D}, P5/Z, [X1, Z13.D,LSL #3] |
(209) 0x42be28 MOVPRFX Z23, Z15 |
(209) 0x42be2c FSUB Z23.D, P5/M, Z23.D, Z12.D |
(209) 0x42be30 MOVPRFX Z18, Z23 |
(209) 0x42be34 FMUL Z18.D, P5/M, Z18.D, Z7.D |
(209) 0x42be38 LD1D {Z14.D}, P4/Z, [X1, Z6.D,LSL #3] |
(209) 0x42be3c FCMGT P6.D, P5/Z, Z18.D, #0.0000000 |
(209) 0x42be40 LD1D {Z24.D}, P4/Z, [X1, Z16.D,LSL #3] |
(209) 0x42be44 LD1D {Z2.D}, P4/Z, [X4, Z6.D,LSL #3] |
(209) 0x42be48 MOVPRFX Z22, Z14 |
(209) 0x42be4c FSUB Z22.D, P4/M, Z22.D, Z24.D |
(209) 0x42be50 LD1D {Z6.D}, P4/Z, [X1, Z17.D,LSL #3] |
(209) 0x42be54 MOVPRFX Z5, Z21 |
(209) 0x42be58 FABS Z5.D, P3/M, Z21.D |
(209) 0x42be5c FSUB Z6.D, P4/M, Z6.D, Z14.D |
(209) 0x42be60 FDIV Z5.D, P5/M, Z5.D, Z3.D |
(209) 0x42be64 MOVPRFX Z17, Z22 |
(209) 0x42be68 FMUL Z17.D, P4/M, Z17.D, Z6.D |
(209) 0x42be6c EOR P7.B, P3/Z, P7.B, P3.B |
(209) 0x42be70 MOVPRFX Z18, Z5 |
(209) 0x42be74 FADD Z18.D, P6/M, Z18.D, #0.0000000 |
(209) 0x42be78 MOVPRFX Z16, Z27 |
(209) 0x42be7c FSUB Z16.D, P6/M, Z16.D, Z5.D |
(209) 0x42be80 MOVPRFX Z19.S, P7/M, Z25.S |
(209) 0x42be84 ADD Z19.S, P7/M, Z19.S, Z30.S |
(209) 0x42be88 FABS Z23.D, P3/M, Z23.D |
(209) 0x42be8c FCMGT P7.D, P4/Z, Z17.D, #0.0000000 |
(209) 0x42be90 FCMLE P14.D, P6/Z, Z7.D, #0.0000000 |
(209) 0x42be94 MOVPRFX Z3, Z7 |
(209) 0x42be98 FABS Z3.D, P3/M, Z7.D |
(209) 0x42be9c MOVPRFX Z4, Z20 |
(209) 0x42bea0 FABS Z4.D, P3/M, Z20.D |
(209) 0x42bea4 MOVPRFX Z7, Z3 |
(209) 0x42bea8 FMUL Z7.D, P6/M, Z7.D, Z16.D |
(209) 0x42beac FDIV Z4.D, P4/M, Z4.D, Z2.D |
(209) 0x42beb0 MOVPRFX Z16, Z23 |
(209) 0x42beb4 FMUL Z16.D, P6/M, Z16.D, Z18.D |
(209) 0x42beb8 MOVPRFX Z24, Z27 |
(209) 0x42bebc FSUB Z24.D, P7/M, Z24.D, Z4.D |
(209) 0x42bec0 SUNPKLO Z18.D, Z19 |
(209) 0x42bec4 MOVPRFX Z17, Z4 |
(209) 0x42bec8 FADD Z17.D, P7/M, Z17.D, #0.0000000 |
(209) 0x42becc LD1D {Z1.D}, P5/Z, [X8, X0,LSL #3] |
(209) 0x42bed0 LD1D {Z0.D}, P4/Z, [X10, X0,LSL #3] |
(209) 0x42bed4 FABS Z22.D, P3/M, Z22.D |
(209) 0x42bed8 FCMLE P15.D, P7/Z, Z6.D, #0.0000000 |
(209) 0x42bedc MOVPRFX Z2, Z6 |
(209) 0x42bee0 FABS Z2.D, P3/M, Z6.D |
(209) 0x42bee4 FDIV Z7.D, P6/M, Z7.D, Z1.D |
(209) 0x42bee8 MOVPRFX Z6, Z2 |
(209) 0x42beec FMUL Z6.D, P7/M, Z6.D, Z24.D |
(209) 0x42bef0 SUNPKHI Z19.D, Z19 |
(209) 0x42bef4 MOVPRFX Z24, Z22 |
(209) 0x42bef8 FMUL Z24.D, P7/M, Z24.D, Z17.D |
(209) 0x42befc FDIV Z6.D, P7/M, Z6.D, Z0.D |
(209) 0x42bf00 LD1D {Z17.D}, P6/Z, [X5, Z18.D,LSL #3] |
(209) 0x42bf04 LD1D {Z18.D}, P7/Z, [X5, Z19.D,LSL #3] |
(209) 0x42bf08 FDIV Z16.D, P6/M, Z16.D, Z17.D |
(209) 0x42bf0c FDIV Z24.D, P7/M, Z24.D, Z18.D |
(209) 0x42bf10 FADD Z7.D, P6/M, Z7.D, Z16.D |
(209) 0x42bf14 FADD Z6.D, P7/M, Z6.D, Z24.D |
(209) 0x42bf18 FMUL Z1.D, P6/M, Z1.D, Z7.D |
(209) 0x42bf1c FMUL Z0.D, P7/M, Z0.D, Z6.D |
(209) 0x42bf20 SEL Z7.D, P5, Z5.D, Z29.D |
(209) 0x42bf24 SEL Z6.D, P4, Z4.D, Z29.D |
(209) 0x42bf28 FSUBR Z7.D, P5/M, Z7.D, #0.0000000 |
(209) 0x42bf2c FSUBR Z6.D, P4/M, Z6.D, #0.0000000 |
(209) 0x42bf30 EOR P14.B, P6/Z, P14.B, P6.B |
(209) 0x42bf34 EOR P15.B, P7/Z, P15.B, P7.B |
(209) 0x42bf38 MOVPRFX Z13, Z28 |
(209) 0x42bf3c FCPY Z13.D, P14/M, #1.0000000 |
(209) 0x42bf40 FDIV Z1.D, P6/M, Z1.D, Z26.D |
(209) 0x42bf44 FDIV Z0.D, P7/M, Z0.D, Z26.D |
(209) 0x42bf48 FMINNM Z1.D, P3/M, Z1.D, Z23.D |
(209) 0x42bf4c FMINNM Z0.D, P3/M, Z0.D, Z22.D |
(209) 0x42bf50 FMINNM Z1.D, P3/M, Z1.D, Z3.D |
(209) 0x42bf54 FMINNM Z0.D, P3/M, Z0.D, Z2.D |
(209) 0x42bf58 MOVPRFX Z1.D, P6/Z, Z1.D |
(209) 0x42bf5c FMUL Z1.D, P6/M, Z1.D, Z13.D |
(209) 0x42bf60 MOVPRFX Z12, Z28 |
(209) 0x42bf64 FCPY Z12.D, P15/M, #1.0000000 |
(209) 0x42bf68 FMLA Z15.D, P5/M, Z1.D, Z7.D |
(209) 0x42bf6c MOVPRFX Z0.D, P7/Z, Z0.D |
(209) 0x42bf70 FMUL Z0.D, P7/M, Z0.D, Z12.D |
(209) 0x42bf74 FMUL Z21.D, P5/M, Z21.D, Z15.D |
(209) 0x42bf78 FMLA Z14.D, P4/M, Z0.D, Z6.D |
(209) 0x42bf7c FMUL Z20.D, P4/M, Z20.D, Z14.D |
(209) 0x42bf80 ST1D {Z21.D}, P5, [X3, X0,LSL #3] |
(209) 0x42bf84 ST1D {Z20.D}, P4, [X6, X0,LSL #3] |
(209) 0x42bf88 ADD X0, X0, X12 |
(209) 0x42bf8c WHILELO P4.D, X0, X22 |
(209) 0x42bf90 INCW Z25.S, ALL |
(209) 0x42bf94 WHILELO P5.D, X0, X2 |
(209) 0x42bf98 B.NE 42bdc0 |
0x42bf9c ADD X11, X11, #1 |
0x42bfa0 CMP W19, W11 |
0x42bfa4 B.LE 42bfe0 |
0x42bfa8 SUB W2, W18, W13 |
0x42bfac ORR W0, WZR, W21 |
0x42bfb0 ORR W1, WZR, W13 |
0x42bfb4 CMP W2, W0 |
0x42bfb8 CSEL X2, X2, X0, #9 |
0x42bfbc ADD W13, W1, W2 |
0x42bfc0 ORR W3, WZR, W20 |
0x42bfc4 CMP W1, W13 |
0x42bfc8 B.CC 42bd44 |
/home/eoseret/qaas/qaas_runs/178-219-7589/intel/CloverLeaf2.0-CXX/build/CloverLeaf2.0-CXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
/home/eoseret/qaas/qaas_runs/178-219-7589/intel/CloverLeaf2.0-CXX/build/CloverLeaf2.0-CXX/src/omp/advec_mom.cpp: 110 - 139 |
-------------------------------------------------------------------------------- |
110: for (int i = (x_min - 1 + 1); i < (x_max + 1 + 2); i++) |
111: ({ |
112: int upwind, donor, downwind, dif; |
113: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
114: if (node_flux(i, j) < 0.0) { |
[...] |
120: upwind = i - 1; |
121: donor = i; |
122: downwind = i + 1; |
123: dif = upwind; |
124: } |
125: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(donor, j)); |
126: width = celldx[i]; |
127: vdiffuw = vel1(donor, j) - vel1(upwind, j); |
128: vdiffdw = vel1(downwind, j) - vel1(donor, j); |
129: limiter = 0.0; |
130: if (vdiffuw * vdiffdw > 0.0) { |
131: auw = std::fabs(vdiffuw); |
132: adw = std::fabs(vdiffdw); |
133: wind = 1.0; |
134: if (vdiffdw <= 0.0) wind = -1.0; |
135: limiter = |
136: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldx[dif]) / 6.0, auw), adw); |
137: } |
138: advec_vel_s = vel1(donor, j) + (1.0 - sigma) * limiter; |
139: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►100.00+ | omp_fulfill_event | libgomp.so.1.0.0 | |
| ○ | start_thread | libc.so.6 | |
| ○ | thread_start | libc.so.6 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 2.29 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.19 |
| Bottlenecks | P4, |
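| Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.6] |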
| Source | context.h:46-46,context.h:69-69,advec_mom.cpp:110-110 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 8.00 |
| CQA cycles if no scalar integer | 8.00 |
| CQA cycles if FP arith vectorized | 8.00 |
| CQA cycles if fully vectorized | 3.50 |
| Front-end cycles | 5.00 |
| P0 cycles | 1.00 |
| P1 cycles | 1.00 |
| P2 cycles | 6.75 |
| P3 cycles | 6.75 |
| P4 cycles | 8.00 |
| P5 cycles | 6.50 |
| P6 cycles | 1.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 3.00 |
| P11 cycles | 3.00 |
| P12 cycles | 3.00 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 43.00 |
| Nb uops | 40.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 0.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 25.43 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | 25.00 |
| Vector-efficiency ratio add_sub | 23.75 |
| Vector-efficiency ratio fma | 25.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 28.13 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.00 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 2.29 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.19 |
| Bottlenecks | P4, |
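| Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.6] |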
| Source | context.h:46-46,context.h:69-69,advec_mom.cpp:110-110 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 8.00 |
| CQA cycles if no scalar integer | 8.00 |
| CQA cycles if FP arith vectorized | 8.00 |
| CQA cycles if fully vectorized | 3.50 |
| Front-end cycles | 5.00 |
| P0 cycles | 1.00 |
| P1 cycles | 1.00 |
| P2 cycles | 6.75 |
| P3 cycles | 6.75 |
| P4 cycles | 8.00 |
| P5 cycles | 6.50 |
| P6 cycles | 1.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 3.00 |
| P11 cycles | 3.00 |
| P12 cycles | 3.00 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 43.00 |
| Nb uops | 40.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 0.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 25.43 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | 25.00 |
| Vector-efficiency ratio add_sub | 23.75 |
| Vector-efficiency ratio fma | 25.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 28.13 |
| Path / |
| nb instructions | 43 |
| nb uops | 40 |
| loop length | 172 |
| used w registers | 11 |
| used x registers | 20 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 0 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 1 |
| nb stack references | 0 |
| micro-operation queue | 5.00 cycles |
| front end | 5.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.00 | 1.00 | 6.75 | 6.75 | 8.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| cycles | 1.00 | 1.00 | 6.75 | 6.75 | 8.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.00 |
| Dispatch | 8.00 |
| Overall L1 | 8.00 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 25% |
| add-sub | 23% |
| fma | 25% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 28% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LDR X7, [X16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| SBFM X8, X3, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| INDEX Z25.S, W3, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | N/A |
| MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X22, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| WHILELO P5.D, XZR, X2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| LDR X3, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| UQDECD X22, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| WHILELO P4.D, XZR, X22 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| LDR X1, [X17] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| MADD X7, X11, X7, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LDR X4, [X15] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| MADD X3, X11, X3, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LDR X5, [X30, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| MUL X1, X11, X1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| LDR X9, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| MUL X4, X11, X4 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| LDR X10, [X16, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X8, X5, X8,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X23, [X17, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X3, X9, X3,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X6, [X15, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X7, X10, X7,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X10, X8, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ADD X1, X23, X1,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ADDVL X9, X7, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ADD X4, X6, X4,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X6, X3, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| HINT #0 | N/A | ||||||||||||||||||
| HINT #0 | N/A | ||||||||||||||||||
| HINT #0 | N/A | ||||||||||||||||||
| ADD X11, X11, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CMP W19, W11 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 42bfe0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x37c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| SUB W2, W18, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W0, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W1, WZR, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W2, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| CSEL X2, X2, X0, #9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD W13, W1, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR W3, WZR, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W1, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.CC 42bd44 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0xe0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| nb instructions | 43 |
| nb uops | 40 |
| loop length | 172 |
| used w registers | 11 |
| used x registers | 20 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 0 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 1 |
| nb stack references | 0 |
| micro-operation queue | 5.00 cycles |
| front end | 5.00 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.00 | 1.00 | 6.75 | 6.75 | 8.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| cycles | 1.00 | 1.00 | 6.75 | 6.75 | 8.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.00 |
| Dispatch | 8.00 |
| Overall L1 | 8.00 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 25% |
| add-sub | 23% |
| fma | 25% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 28% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LDR X7, [X16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| SBFM X8, X3, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| INDEX Z25.S, W3, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | N/A |
| MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X22, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| WHILELO P5.D, XZR, X2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| LDR X3, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| UQDECD X22, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| WHILELO P4.D, XZR, X22 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| LDR X1, [X17] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| MADD X7, X11, X7, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LDR X4, [X15] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| MADD X3, X11, X3, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LDR X5, [X30, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| MUL X1, X11, X1 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
| LDR X9, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| MUL X4, X11, X4 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (25.0%) |
| LDR X10, [X16, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X8, X5, X8,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X23, [X17, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X3, X9, X3,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X6, [X15, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X7, X10, X7,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X10, X8, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ADD X1, X23, X1,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ADDVL X9, X7, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ADD X4, X6, X4,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X6, X3, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| HINT #0 | N/A | ||||||||||||||||||
| HINT #0 | N/A | ||||||||||||||||||
| HINT #0 | N/A | ||||||||||||||||||
| ADD X11, X11, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CMP W19, W11 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 42bfe0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x37c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| SUB W2, W18, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W0, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W1, WZR, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W2, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| CSEL X2, X2, X0, #9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD W13, W1, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR W3, WZR, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W1, W13 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.CC 42bd44 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0xe0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
