| Loop Id: 213 | Module: exec | Source: advec_mom.cpp:182-211 [...] | Coverage: 0.01% |
|---|
| Loop Id: 213 | Module: exec | Source: advec_mom.cpp:182-211 [...] | Coverage: 0.01% |
|---|
0x42c104 LDR X7, [X15] |
0x42c108 ADD W6, W8, #2 |
0x42c10c SBFM X1, X4, #0, #31 |
0x42c110 DUP Z6.S, W6 |
0x42c114 DUP Z7.S, W5 |
0x42c118 INDEX Z21.S, W4, #1 |
0x42c11c LDR X3, [X13] |
0x42c120 SUB W9, W8, #1 |
0x42c124 ADD X22, X8, #1 |
0x42c128 DUP Z24.D, X8 |
0x42c12c DUP Z20.S, W9 |
0x42c130 MOVZ X0, #0 |
0x42c134 LDR X6, [X15, #16] |
0x42c138 ORR X10, XZR, X2 |
0x42c13c WHILELO P6.D, XZR, X2 |
0x42c140 MADD X7, X8, X7, X1 |
0x42c144 UQDECD X10, ALL |
0x42c148 DUP Z23.D, X22 |
0x42c14c LDR X5, [X13, #16] |
0x42c150 WHILELO P5.D, XZR, X10 |
0x42c154 LD1RD {Z19.D}, P4/Z, [X14] |
0x42c158 MADD X3, X8, X3, X1 |
0x42c15c LD1RD {Z25.D}, P4/Z, [X16] |
0x42c160 LDR X4, [X18, #8] |
0x42c164 ADD X7, X6, X7,LSL #3 |
0x42c168 LDR X1, [X16, #16] |
0x42c16c ADD X3, X5, X3,LSL #3 |
0x42c170 ADDVL X9, X7, #1 |
0x42c174 LDR X6, [X14, #16] |
0x42c178 ADD X8, X4, X8,LSL #3 |
0x42c17c ADDVL X5, X3, #1 |
(212) 0x42c180 LD1D {Z26.D}, P6/Z, [X7, X0,LSL #3] |
(212) 0x42c184 LD1D {Z27.D}, P5/Z, [X9, X0,LSL #3] |
(212) 0x42c188 SUNPKHI Z29.D, Z21 |
(212) 0x42c18c FCMLT P14.D, P4/Z, Z26.D, #0.0000000 |
(212) 0x42c190 FCMLT P15.D, P4/Z, Z27.D, #0.0000000 |
(212) 0x42c194 SUNPKLO Z15.D, Z21 |
(212) 0x42c198 UZP1 P13.S, P14.S, P15.S |
(212) 0x42c19c SEL Z31.S, P13, Z6.S, Z20.S |
(212) 0x42c1a0 SUNPKLO Z30.D, Z31 |
(212) 0x42c1a4 SUNPKHI Z31.D, Z31 |
(212) 0x42c1a8 MAD Z30.D, P4/M, Z25.D, Z15.D |
(212) 0x42c1ac MAD Z31.D, P4/M, Z25.D, Z29.D |
(212) 0x42c1b0 LD1D {Z5.D}, P6/Z, [X1, Z30.D,LSL #3] |
(212) 0x42c1b4 LD1D {Z2.D}, P5/Z, [X1, Z31.D,LSL #3] |
(212) 0x42c1b8 SEL Z14.D, P14, Z24.D, Z23.D |
(212) 0x42c1bc SEL Z31.D, P14, Z23.D, Z24.D |
(212) 0x42c1c0 MOVPRFX Z30, Z15 |
(212) 0x42c1c4 MLA Z30.D, P4/M, Z25.D, Z31.D |
(212) 0x42c1c8 MAD Z31.D, P4/M, Z19.D, Z15.D |
(212) 0x42c1cc LD1D {Z28.D}, P6/Z, [X1, Z30.D,LSL #3] |
(212) 0x42c1d0 MLA Z15.D, P4/M, Z25.D, Z14.D |
(212) 0x42c1d4 MOVPRFX Z30, Z28 |
(212) 0x42c1d8 FSUB Z30.D, P6/M, Z30.D, Z5.D |
(212) 0x42c1dc LD1D {Z14.D}, P6/Z, [X1, Z15.D,LSL #3] |
(212) 0x42c1e0 MOVPRFX Z4, Z30 |
(212) 0x42c1e4 FABS Z4.D, P4/M, Z30.D |
(212) 0x42c1e8 FSUB Z14.D, P6/M, Z14.D, Z28.D |
(212) 0x42c1ec FMUL Z30.D, P6/M, Z30.D, Z14.D |
(212) 0x42c1f0 LD1D {Z13.D}, P6/Z, [X6, Z31.D,LSL #3] |
(212) 0x42c1f4 FCMGT P7.D, P6/Z, Z30.D, #0.0000000 |
(212) 0x42c1f8 SEL Z3.S, P13, Z7.S, Z20.S |
(212) 0x42c1fc SUNPKLO Z30.D, Z3 |
(212) 0x42c200 MOVPRFX Z31, Z26 |
(212) 0x42c204 FABS Z31.D, P4/M, Z26.D |
(212) 0x42c208 LD1D {Z15.D}, P7/Z, [X4, Z30.D,LSL #3] |
(212) 0x42c20c FDIV Z31.D, P6/M, Z31.D, Z13.D |
(212) 0x42c210 MOVPRFX Z13, Z31 |
(212) 0x42c214 FADD Z13.D, P7/M, Z13.D, #0.0000000 |
(212) 0x42c218 MOVPRFX Z30, Z4 |
(212) 0x42c21c FMUL Z30.D, P7/M, Z30.D, Z13.D |
(212) 0x42c220 MOVPRFX Z5, Z14 |
(212) 0x42c224 FABS Z5.D, P4/M, Z14.D |
(212) 0x42c228 MOVPRFX Z1, Z30 |
(212) 0x42c22c FDIV Z1.D, P7/M, Z1.D, Z15.D |
(212) 0x42c230 MOVPRFX Z13, Z22 |
(212) 0x42c234 FSUB Z13.D, P7/M, Z13.D, Z31.D |
(212) 0x42c238 SEL Z15.D, P6, Z31.D, Z16.D |
(212) 0x42c23c MOVPRFX Z30, Z5 |
(212) 0x42c240 FMUL Z30.D, P7/M, Z30.D, Z13.D |
(212) 0x42c244 LD1RD {Z31.D}, P4/Z, [X8] |
(212) 0x42c248 FSUBR Z15.D, P6/M, Z15.D, #0.0000000 |
(212) 0x42c24c FCMLE P14.D, P7/Z, Z14.D, #0.0000000 |
(212) 0x42c250 FDIV Z30.D, P7/M, Z30.D, Z31.D |
(212) 0x42c254 EOR P14.B, P7/Z, P14.B, P7.B |
(212) 0x42c258 FADD Z30.D, P7/M, Z30.D, Z1.D |
(212) 0x42c25c MOVPRFX Z14, Z17 |
(212) 0x42c260 FCPY Z14.D, P14/M, #1.0000000 |
(212) 0x42c264 MOVPRFX Z0, Z31 |
(212) 0x42c268 FMUL Z0.D, P7/M, Z0.D, Z30.D |
(212) 0x42c26c FDIV Z0.D, P7/M, Z0.D, Z18.D |
(212) 0x42c270 FMINNM Z4.D, P4/M, Z4.D, Z0.D |
(212) 0x42c274 FMINNM Z5.D, P4/M, Z5.D, Z4.D |
(212) 0x42c278 MOVPRFX Z5.D, P7/Z, Z5.D |
(212) 0x42c27c FMUL Z5.D, P7/M, Z5.D, Z14.D |
(212) 0x42c280 FMLA Z28.D, P6/M, Z5.D, Z15.D |
(212) 0x42c284 SEL Z1.D, P15, Z23.D, Z24.D |
(212) 0x42c288 FMUL Z26.D, P6/M, Z26.D, Z28.D |
(212) 0x42c28c MOVPRFX Z13, Z29 |
(212) 0x42c290 MLA Z13.D, P4/M, Z25.D, Z1.D |
(212) 0x42c294 SEL Z15.D, P15, Z24.D, Z23.D |
(212) 0x42c298 LD1D {Z30.D}, P5/Z, [X1, Z13.D,LSL #3] |
(212) 0x42c29c MAD Z15.D, P4/M, Z25.D, Z29.D |
(212) 0x42c2a0 MOVPRFX Z4, Z30 |
(212) 0x42c2a4 FSUB Z4.D, P5/M, Z4.D, Z2.D |
(212) 0x42c2a8 LD1D {Z28.D}, P5/Z, [X1, Z15.D,LSL #3] |
(212) 0x42c2ac MOVPRFX Z5, Z4 |
(212) 0x42c2b0 FABS Z5.D, P4/M, Z4.D |
(212) 0x42c2b4 FSUB Z28.D, P5/M, Z28.D, Z30.D |
(212) 0x42c2b8 FMUL Z4.D, P5/M, Z4.D, Z28.D |
(212) 0x42c2bc MAD Z1.D, P4/M, Z19.D, Z29.D |
(212) 0x42c2c0 FCMGT P7.D, P5/Z, Z4.D, #0.0000000 |
(212) 0x42c2c4 LD1D {Z0.D}, P5/Z, [X6, Z1.D,LSL #3] |
(212) 0x42c2c8 MOVPRFX Z29, Z27 |
(212) 0x42c2cc FABS Z29.D, P4/M, Z27.D |
(212) 0x42c2d0 SUNPKHI Z3.D, Z3 |
(212) 0x42c2d4 FDIV Z29.D, P5/M, Z29.D, Z0.D |
(212) 0x42c2d8 LD1D {Z4.D}, P7/Z, [X4, Z3.D,LSL #3] |
(212) 0x42c2dc SEL Z2.D, P5, Z29.D, Z16.D |
(212) 0x42c2e0 MOVPRFX Z14, Z28 |
(212) 0x42c2e4 FABS Z14.D, P4/M, Z28.D |
(212) 0x42c2e8 MOVPRFX Z15, Z29 |
(212) 0x42c2ec FADD Z15.D, P7/M, Z15.D, #0.0000000 |
(212) 0x42c2f0 FCMLE P15.D, P7/Z, Z28.D, #0.0000000 |
(212) 0x42c2f4 MOVPRFX Z28, Z5 |
(212) 0x42c2f8 FMUL Z28.D, P7/M, Z28.D, Z15.D |
(212) 0x42c2fc FDIV Z28.D, P7/M, Z28.D, Z4.D |
(212) 0x42c300 MOVPRFX Z4, Z22 |
(212) 0x42c304 FSUB Z4.D, P7/M, Z4.D, Z29.D |
(212) 0x42c308 MOVPRFX Z29, Z14 |
(212) 0x42c30c FMUL Z29.D, P7/M, Z29.D, Z4.D |
(212) 0x42c310 FDIV Z29.D, P7/M, Z29.D, Z31.D |
(212) 0x42c314 FADD Z29.D, P7/M, Z29.D, Z28.D |
(212) 0x42c318 FMUL Z31.D, P7/M, Z31.D, Z29.D |
(212) 0x42c31c FDIV Z31.D, P7/M, Z31.D, Z18.D |
(212) 0x42c320 FSUBR Z2.D, P5/M, Z2.D, #0.0000000 |
(212) 0x42c324 EOR P15.B, P7/Z, P15.B, P7.B |
(212) 0x42c328 FMINNM Z5.D, P4/M, Z5.D, Z31.D |
(212) 0x42c32c MOVPRFX Z15, Z17 |
(212) 0x42c330 FCPY Z15.D, P15/M, #1.0000000 |
(212) 0x42c334 MOVPRFX Z31, Z14 |
(212) 0x42c338 FMINNM Z31.D, P4/M, Z31.D, Z5.D |
(212) 0x42c33c MOVPRFX Z31.D, P7/Z, Z31.D |
(212) 0x42c340 FMUL Z31.D, P7/M, Z31.D, Z15.D |
(212) 0x42c344 FMLA Z30.D, P5/M, Z31.D, Z2.D |
(212) 0x42c348 FMUL Z27.D, P5/M, Z27.D, Z30.D |
(212) 0x42c34c ST1D {Z26.D}, P6, [X3, X0,LSL #3] |
(212) 0x42c350 ST1D {Z27.D}, P5, [X5, X0,LSL #3] |
(212) 0x42c354 ADD X0, X0, X11 |
(212) 0x42c358 WHILELO P5.D, X0, X10 |
(212) 0x42c35c INCW Z21.S, ALL |
(212) 0x42c360 WHILELO P6.D, X0, X2 |
(212) 0x42c364 B.NE 42c180 |
0x42c368 ORR X8, XZR, X22 |
0x42c36c CMP W19, W22 |
0x42c370 B.LE 42c3b0 |
0x42c374 SUB W2, W17, W12 |
0x42c378 ORR W0, WZR, W21 |
0x42c37c ORR W1, WZR, W12 |
0x42c380 CMP W2, W0 |
0x42c384 CSEL X2, X2, X0, LS |
0x42c388 ADD W12, W1, W2 |
0x42c38c ORR W4, WZR, W20 |
0x42c390 ADD W5, W8, #1 |
0x42c394 CMP W1, W12 |
0x42c398 B.CC 42c104 |
/home/eoseret/qaas/qaas_runs/178-219-7589/intel/CloverLeaf2.0-CXX/build/CloverLeaf2.0-CXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
/home/eoseret/qaas/qaas_runs/178-219-7589/intel/CloverLeaf2.0-CXX/build/CloverLeaf2.0-CXX/src/omp/advec_mom.cpp: 182 - 211 |
-------------------------------------------------------------------------------- |
182: for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) |
183: ({ |
184: int upwind, donor, downwind, dif; |
185: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
186: if (node_flux(i, j) < 0.0) { |
[...] |
197: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(i, donor)); |
198: width = celldy[j]; |
199: vdiffuw = vel1(i, donor) - vel1(i, upwind); |
200: vdiffdw = vel1(i, downwind) - vel1(i, donor); |
201: limiter = 0.0; |
202: if (vdiffuw * vdiffdw > 0.0) { |
203: auw = std::fabs(vdiffuw); |
204: adw = std::fabs(vdiffdw); |
205: wind = 1.0; |
206: if (vdiffdw <= 0.0) wind = -1.0; |
207: limiter = |
208: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldy[dif]) / 6.0, auw), adw); |
209: } |
210: advec_vel_s = vel1(i, donor) + (1.0 - sigma) * limiter; |
211: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►98.38+ | omp_fulfill_event | libgomp.so.1.0.0 | |
| ○ | start_thread | libc.so.6 | |
| ○ | thread_start | libc.so.6 |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.40 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 5.05 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.78 |
| Bottlenecks | P4, |
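| Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.10] |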
| Source | context.h:46-46,context.h:69-69,advec_mom.cpp:182-182 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 12.00 |
| CQA cycles if no scalar integer | 5.00 |
| CQA cycles if FP arith vectorized | 12.00 |
| CQA cycles if fully vectorized | 2.38 |
| Front-end cycles | 5.50 |
| P0 cycles | 1.00 |
| P1 cycles | 1.00 |
| P2 cycles | 6.75 |
| P3 cycles | 6.75 |
| P4 cycles | 12.00 |
| P5 cycles | 6.50 |
| P6 cycles | 1.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 3.00 |
| P11 cycles | 3.00 |
| P12 cycles | 3.00 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 44.00 |
| Nb uops | 44.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 0.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 20.96 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 20.45 |
| Vector-efficiency ratio fma | 25.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 18.27 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.40 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 5.05 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.78 |
| Bottlenecks | P4, |
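| Function | advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer2D<double>&, clover::Buffer1D<double>&, clover::Buffer1D<double>&, int, int, int) [clone ._omp_fn.10] |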
| Source | context.h:46-46,context.h:69-69,advec_mom.cpp:182-182 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 12.00 |
| CQA cycles if no scalar integer | 5.00 |
| CQA cycles if FP arith vectorized | 12.00 |
| CQA cycles if fully vectorized | 2.38 |
| Front-end cycles | 5.50 |
| P0 cycles | 1.00 |
| P1 cycles | 1.00 |
| P2 cycles | 6.75 |
| P3 cycles | 6.75 |
| P4 cycles | 12.00 |
| P5 cycles | 6.50 |
| P6 cycles | 1.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 3.00 |
| P11 cycles | 3.00 |
| P12 cycles | 3.00 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 44.00 |
| Nb uops | 44.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 0.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | 0.00 |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 20.96 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | 20.45 |
| Vector-efficiency ratio fma | 25.00 |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 18.27 |
| Path / |
| nb instructions | 44 |
| nb uops | 44 |
| loop length | 176 |
| used w registers | 15 |
| used x registers | 18 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 0 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 8 |
| nb stack references | 0 |
| micro-operation queue | 5.50 cycles |
| front end | 5.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.00 | 1.00 | 6.75 | 6.75 | 12.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| cycles | 1.00 | 1.00 | 6.75 | 6.75 | 12.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.50 |
| Dispatch | 12.00 |
| Overall L1 | 12.00 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 20% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | 25% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 18% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LDR X7, [X15] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD W6, W8, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SBFM X1, X4, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| DUP Z6.S, W6 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| DUP Z7.S, W5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| INDEX Z21.S, W4, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | N/A |
| LDR X3, [X13] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| SUB W9, W8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD X22, X8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| DUP Z24.D, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (25.0%) |
| DUP Z20.S, W9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X6, [X15, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ORR X10, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| WHILELO P6.D, XZR, X2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| MADD X7, X8, X7, X1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| UQDECD X10, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| DUP Z23.D, X22 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (25.0%) |
| LDR X5, [X13, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| WHILELO P5.D, XZR, X10 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| LD1RD {Z19.D}, P4/Z, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 | scal (25.0%) |
| MADD X3, X8, X3, X1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LD1RD {Z25.D}, P4/Z, [X16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 | scal (25.0%) |
| LDR X4, [X18, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X7, X6, X7,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X1, [X16, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| ADD X3, X5, X3,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X9, X7, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LDR X6, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X8, X4, X8,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X5, X3, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ORR X8, XZR, X22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CMP W19, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 42c3b0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x38c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| SUB W2, W17, W12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W0, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W1, WZR, W12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W2, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| CSEL X2, X2, X0, LS | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD W12, W1, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR W4, WZR, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD W5, W8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W1, W12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.CC 42c104 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0xe0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| nb instructions | 44 |
| nb uops | 44 |
| loop length | 176 |
| used w registers | 15 |
| used x registers | 18 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 0 |
| used d registers | 0 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 8 |
| nb stack references | 0 |
| micro-operation queue | 5.50 cycles |
| front end | 5.50 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 1.00 | 1.00 | 6.75 | 6.75 | 12.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| cycles | 1.00 | 1.00 | 6.75 | 6.75 | 12.00 | 6.50 | 1.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 5.50 |
| Dispatch | 12.00 |
| Overall L1 | 12.00 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 20% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 20% |
| fma | 25% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 18% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LDR X7, [X15] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD W6, W8, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SBFM X1, X4, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| DUP Z6.S, W6 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| DUP Z7.S, W5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| INDEX Z21.S, W4, #1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 1 | N/A |
| LDR X3, [X13] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| SUB W9, W8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD X22, X8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| DUP Z24.D, X8 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (25.0%) |
| DUP Z20.S, W9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (12.5%) |
| MOVZ X0, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X6, [X15, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ORR X10, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| WHILELO P6.D, XZR, X2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| MADD X7, X8, X7, X1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| UQDECD X10, ALL | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| DUP Z23.D, X22 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 1 | scal (25.0%) |
| LDR X5, [X13, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| WHILELO P5.D, XZR, X10 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 2 | N/A |
| LD1RD {Z19.D}, P4/Z, [X14] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 | scal (25.0%) |
| MADD X3, X8, X3, X1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LD1RD {Z25.D}, P4/Z, [X16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 6 | 0.50 | scal (25.0%) |
| LDR X4, [X18, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X7, X6, X7,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR X1, [X16, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| ADD X3, X5, X3,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X9, X7, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| LDR X6, [X14, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| ADD X8, X4, X8,LSL #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADDVL X5, X3, #1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (25.0%) |
| ORR X8, XZR, X22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CMP W19, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LE 42c3b0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x38c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| SUB W2, W17, W12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W0, WZR, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ORR W1, WZR, W12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP W2, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| CSEL X2, X2, X0, LS | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD W12, W1, W2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR W4, WZR, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| ADD W5, W8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W1, W12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.CC 42c104 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0xe0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
