Function: advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, ... | Module: exec | Source: advec_mom.cpp:180-211 [...] | Coverage: 3.93% |
---|
Function: advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, ... | Module: exec | Source: advec_mom.cpp:180-211 [...] | Coverage: 3.93% |
---|
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_mom.cpp: 180 - 211 |
-------------------------------------------------------------------------------- |
180: #pragma omp parallel for simd collapse(2) |
181: for (int j = (y_min - 1 + 1); j < (y_max + 1 + 2); j++) { |
182: for (int i = (x_min + 1); i < (x_max + 1 + 2); i++) |
183: ({ |
184: int upwind, donor, downwind, dif; |
185: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
186: if (node_flux(i, j) < 0.0) { |
[...] |
197: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(i, donor)); |
198: width = celldy[j]; |
199: vdiffuw = vel1(i, donor) - vel1(i, upwind); |
200: vdiffdw = vel1(i, downwind) - vel1(i, donor); |
201: limiter = 0.0; |
202: if (vdiffuw * vdiffdw > 0.0) { |
203: auw = std::fabs(vdiffuw); |
204: adw = std::fabs(vdiffdw); |
205: wind = 1.0; |
206: if (vdiffdw <= 0.0) wind = -1.0; |
207: limiter = |
208: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldy[dif]) / 6.0, auw), adw); |
209: } |
210: advec_vel_s = vel1(i, donor) + (1.0 - sigma) * limiter; |
211: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x421c84 STP X29, X30, [SP, #-112]! |
0x421c88 ADD X29, SP, #0 |
0x421c8c STP X19, X20, [SP, #16] |
0x421c90 LDR W19, [X0, #52] |
0x421c94 STP X23, X24, [SP, #48] |
0x421c98 ORR X23, XZR, X0 |
0x421c9c STP X25, X26, [SP, #64] |
0x421ca0 ADD W19, W19, #3 |
0x421ca4 LDR W26, [X23, #48] |
0x421ca8 LDR W20, [X0, #40] |
0x421cac LDR W0, [X0, #44] |
0x421cb0 CMP W26, W19 |
0x421cb4 B.GE 421ff0 |
0x421cb8 ADD W20, W20, #1 |
0x421cbc STP X21, X22, [SP, #32] |
0x421cc0 ADD W22, W0, #3 |
0x421cc4 SUB W25, W19, W26 |
0x421cc8 CMP W20, W22 |
0x421ccc B.GE 422004 |
0x421cd0 SUB W21, W22, W20 |
0x421cd4 BL 403530 |
0x421cd8 MADD W25, W25, W21, WZR |
0x421cdc ORR W24, WZR, W0 |
0x421ce0 BL 4033c0 |
0x421ce4 ORR W3, WZR, W0 |
0x421ce8 UDIV W2, W25, W24 |
0x421cec MSUB W1, W2, W24, W25 |
0x421cf0 CMP W0, W1 |
0x421cf4 B.CC 42201c |
(258) 0x421cf8 MADD W8, W2, W3, W1 |
(258) 0x421cfc ADD W15, W2, W8 |
(258) 0x421d00 CMP W8, W15 |
(258) 0x421d04 B.CS 422004 |
(260) 0x421d08 UDIV W5, W8, W21 |
(260) 0x421d0c ADRP X16, |
(260) 0x421d10 STP D8, D9, [SP, #80] |
(260) 0x421d14 LDP X14, X13, [X23] |
(260) 0x421d18 ADD X25, X16, #4072 |
(260) 0x421d1c CNTW X10, ALL |
(260) 0x421d20 PTRUE P0.B, ALL |
(260) 0x421d24 DUP Z27.S, #2 |
(260) 0x421d28 DUP Z26.B, #255 |
(260) 0x421d2c FDUP Z22.D, #-1.0 |
(260) 0x421d30 FDUP Z21.D, #2.0 |
(260) 0x421d34 LDP X12, X11, [X23, #16] |
(260) 0x421d38 LDR X17, [X23, #32] |
(260) 0x421d3c STR D10, [SP, #96] |
(260) 0x421d40 MSUB W4, W5, W21, W8 |
(260) 0x421d44 ADD W24, W5, W26 |
(260) 0x421d48 SBFM X18, X24, #0, #31 |
(260) 0x421d4c ADD W3, W4, W20 |
(260) 0x421d50 SUB W26, W22, W3 |
(260) 0x421d54 ADD W22, W24, #1 |
(260) 0x421d58 CMP W2, W26 |
(260) 0x421d5c CSEL X26, X2, X26, LS |
(260) 0x421d60 ADD W30, W8, W26 |
(260) 0x421d64 CMP W8, W30 |
(260) 0x421d68 B.CS 421fd4 |
(261) 0x421d6c LDR X6, [X11] |
(261) 0x421d70 SBFM X9, X3, #0, #31 |
(261) 0x421d74 INDEX Z16.S, W3, #1 |
(261) 0x421d78 DUP Z17.S, W24 |
(261) 0x421d7c MOVZ X0, #0 |
(261) 0x421d80 DUP Z23.S, W22 |
(261) 0x421d84 LDR X3, [X13] |
(261) 0x421d88 ORR X8, XZR, X26 |
(261) 0x421d8c WHILELO P2.D, XZR, X26 |
(261) 0x421d90 UQDECD X8, ALL |
(261) 0x421d94 SUNPKLO Z20.D, Z23 |
(261) 0x421d98 WHILELO P1.D, XZR, X8 |
(261) 0x421d9c LDR X7, [X11, #16] |
(261) 0x421da0 SUNPKHI Z19.D, Z23 |
(261) 0x421da4 MOVPRFX Z25, Z17 |
(261) 0x421da8 SUB Z25.S, Z25.S, #1 |
(261) 0x421dac MADD X23, X18, X6, X9 |
(261) 0x421db0 DUP Z7.D, X18 |
(261) 0x421db4 LD1RD {Z18.D}, P0/Z, [X12] |
(261) 0x421db8 LDR X2, [X13, #16] |
(261) 0x421dbc LD1RD {Z6.D}, P0/Z, [X14] |
(261) 0x421dc0 LD1RD {Z24.D}, P0/Z, [X25] |
(261) 0x421dc4 MADD X1, X18, X3, X9 |
(261) 0x421dc8 LDR X4, [X17, #8] |
(261) 0x421dcc ADD X24, X7, X23,LSL #3 |
(261) 0x421dd0 LDR X6, [X12, #16] |
(261) 0x421dd4 ADD X16, X2, X1,LSL #3 |
(261) 0x421dd8 ADDVL X5, X24, #1 |
(261) 0x421ddc LDR X23, [X14, #16] |
(261) 0x421de0 ADD X9, X4, X18,LSL #3 |
(261) 0x421de4 ADDVL X7, X16, #1 |
(259) 0x421de8 LD1D {Z28.D}, P2/Z, [X16, X0,LSL #3] |
(259) 0x421dec LD1D {Z5.D}, P1/Z, [X7, X0,LSL #3] |
(259) 0x421df0 SUNPKLO Z9.D, Z16 |
(259) 0x421df4 SUNPKHI Z8.D, Z16 |
(259) 0x421df8 FCMGE P4.D, P0/Z, Z28.D, #0 |
(259) 0x421dfc FCMGE P5.D, P0/Z, Z5.D, #0 |
(259) 0x421e00 SEL Z1.D, P4, Z7.D, Z20.D |
(259) 0x421e04 UZP1 P3.S, P4.S, P5.S |
(259) 0x421e08 MOVPRFX Z3, Z9 |
(259) 0x421e0c MLA Z3.D, P0/M, Z6.D, Z1.D |
(259) 0x421e10 EOR P6.B, P0/Z, P3.B, P0.B |
(259) 0x421e14 MAD Z1.D, P0/M, Z18.D, Z9.D |
(259) 0x421e18 SEL Z0.S, P6, Z17.S, Z25.S |
(259) 0x421e1c LD1D {Z30.D}, P2/Z, [X6, Z1.D,LSL #3] |
(259) 0x421e20 ADD Z0.S, P6/M, Z0.S, Z27.S |
(259) 0x421e24 SUNPKHI Z2.D, Z0 |
(259) 0x421e28 MAD Z2.D, P0/M, Z6.D, Z8.D |
(259) 0x421e2c LD1D {Z3.D}, P2/Z, [X23, Z3.D,LSL #3] |
(259) 0x421e30 FCMLT P7.D, P0/Z, Z28.D, #0 |
(259) 0x421e34 SUNPKLO Z29.D, Z0 |
(259) 0x421e38 MOVPRFX Z4, Z28 |
(259) 0x421e3c FABS Z4.D, P0/M, Z28.D |
(259) 0x421e40 LD1D {Z0.D}, P1/Z, [X23, Z2.D,LSL #3] |
(259) 0x421e44 FDIV Z4.D, P0/M, Z4.D, Z30.D |
(259) 0x421e48 MAD Z29.D, P0/M, Z6.D, Z9.D |
(259) 0x421e4c SEL Z1.D, P7, Z7.D, Z20.D |
(259) 0x421e50 LD1D {Z29.D}, P2/Z, [X23, Z29.D,LSL #3] |
(259) 0x421e54 MAD Z1.D, P0/M, Z6.D, Z9.D |
(259) 0x421e58 FSUB Z10.D, Z3.D, Z29.D |
(259) 0x421e5c LD1D {Z2.D}, P2/Z, [X23, Z1.D,LSL #3] |
(259) 0x421e60 FABD Z29.D, P0/M, Z29.D, Z3.D |
(259) 0x421e64 FSUB Z30.D, Z2.D, Z3.D |
(259) 0x421e68 SEL Z31.S, P3, Z17.S, Z23.S |
(259) 0x421e6c FMUL Z10.D, Z10.D, Z30.D |
(259) 0x421e70 ADD Z31.S, P3/M, Z31.S, Z26.S |
(259) 0x421e74 SUNPKLO Z9.D, Z31 |
(259) 0x421e78 FCMGT P3.D, P2/Z, Z10.D, #0 |
(259) 0x421e7c FABD Z2.D, P0/M, Z2.D, Z3.D |
(259) 0x421e80 FCMGT P4.D, P0/Z, Z30.D, #0 |
(259) 0x421e84 MOVPRFX Z1, Z4 |
(259) 0x421e88 FADD Z1.D, P0/M, Z1.D, #1 |
(259) 0x421e8c LD1D {Z30.D}, P3/Z, [X4, Z9.D,LSL #3] |
(259) 0x421e90 FCMGT P6.D, P4/Z, Z10.D, #0 |
(259) 0x421e94 MOVPRFX Z9, Z29 |
(259) 0x421e98 FMINNM Z9.D, P0/M, Z9.D, Z2.D |
(259) 0x421e9c FMUL Z1.D, Z1.D, Z29.D |
(259) 0x421ea0 MOVPRFX Z10, Z22 |
(259) 0x421ea4 FCPY Z10.D, P6/M, #1.0000000 |
(259) 0x421ea8 FSUB Z29.D, Z21.D, Z4.D |
(259) 0x421eac FDIV Z1.D, P0/M, Z1.D, Z30.D |
(259) 0x421eb0 FMUL Z2.D, Z29.D, Z2.D |
(259) 0x421eb4 LD1RD {Z30.D}, P0/Z, [X9] |
(259) 0x421eb8 FSUBR Z4.D, P0/M, Z4.D, #1 |
(259) 0x421ebc FDIV Z2.D, P0/M, Z2.D, Z30.D |
(259) 0x421ec0 FMUL Z29.D, Z30.D, Z24.D |
(259) 0x421ec4 FMUL Z4.D, Z4.D, Z10.D |
(259) 0x421ec8 FADD Z1.D, Z1.D, Z2.D |
(259) 0x421ecc SEL Z10.D, P5, Z7.D, Z19.D |
(259) 0x421ed0 FMUL Z1.D, Z1.D, Z29.D |
(259) 0x421ed4 FMINNM Z1.D, P0/M, Z1.D, Z9.D |
(259) 0x421ed8 MOVPRFX Z9, Z8 |
(259) 0x421edc MLA Z9.D, P0/M, Z6.D, Z10.D |
(259) 0x421ee0 FMLA Z3.D, P3/M, Z4.D, Z1.D |
(259) 0x421ee4 LD1D {Z2.D}, P1/Z, [X23, Z9.D,LSL #3] |
(259) 0x421ee8 FMUL Z3.D, Z28.D, Z3.D |
(259) 0x421eec FSUB Z1.D, Z2.D, Z0.D |
(259) 0x421ef0 MAD Z10.D, P0/M, Z18.D, Z8.D |
(259) 0x421ef4 FCMLT P8.D, P0/Z, Z5.D, #0 |
(259) 0x421ef8 LD1D {Z10.D}, P1/Z, [X6, Z10.D,LSL #3] |
(259) 0x421efc SEL Z28.D, P8, Z7.D, Z19.D |
(259) 0x421f00 MAD Z28.D, P0/M, Z6.D, Z8.D |
(259) 0x421f04 LD1D {Z28.D}, P1/Z, [X23, Z28.D,LSL #3] |
(259) 0x421f08 FSUB Z8.D, Z28.D, Z2.D |
(259) 0x421f0c FMUL Z9.D, Z1.D, Z8.D |
(259) 0x421f10 FABD Z0.D, P0/M, Z0.D, Z2.D |
(259) 0x421f14 FCMGT P7.D, P1/Z, Z9.D, #0 |
(259) 0x421f18 FABD Z28.D, P0/M, Z28.D, Z2.D |
(259) 0x421f1c MOVPRFX Z4, Z5 |
(259) 0x421f20 FABS Z4.D, P0/M, Z5.D |
(259) 0x421f24 FCMGT P5.D, P0/Z, Z8.D, #0 |
(259) 0x421f28 FDIV Z4.D, P0/M, Z4.D, Z10.D |
(259) 0x421f2c SUNPKHI Z31.D, Z31 |
(259) 0x421f30 MOVPRFX Z10, Z0 |
(259) 0x421f34 FMINNM Z10.D, P0/M, Z10.D, Z28.D |
(259) 0x421f38 LD1D {Z31.D}, P7/Z, [X4, Z31.D,LSL #3] |
(259) 0x421f3c MOVPRFX Z1, Z4 |
(259) 0x421f40 FADD Z1.D, P0/M, Z1.D, #1 |
(259) 0x421f44 FSUB Z8.D, Z21.D, Z4.D |
(259) 0x421f48 FCMGT P4.D, P5/Z, Z9.D, #0 |
(259) 0x421f4c FMUL Z0.D, Z1.D, Z0.D |
(259) 0x421f50 FMUL Z28.D, Z8.D, Z28.D |
(259) 0x421f54 FDIV Z0.D, P0/M, Z0.D, Z31.D |
(259) 0x421f58 FDIV Z28.D, P0/M, Z28.D, Z30.D |
(259) 0x421f5c FSUBR Z4.D, P0/M, Z4.D, #1 |
(259) 0x421f60 FADD Z30.D, Z0.D, Z28.D |
(259) 0x421f64 MOVPRFX Z9, Z22 |
(259) 0x421f68 FCPY Z9.D, P4/M, #1.0000000 |
(259) 0x421f6c FMUL Z29.D, Z30.D, Z29.D |
(259) 0x421f70 FMUL Z4.D, Z4.D, Z9.D |
(259) 0x421f74 FMINNM Z29.D, P0/M, Z29.D, Z10.D |
(259) 0x421f78 FMLA Z2.D, P7/M, Z29.D, Z4.D |
(259) 0x421f7c FMUL Z5.D, Z5.D, Z2.D |
(259) 0x421f80 ST1D {Z3.D}, P2, [X24, X0,LSL #3] |
(259) 0x421f84 ST1D {Z5.D}, P1, [X5, X0,LSL #3] |
(259) 0x421f88 ADD X0, X0, X10 |
(259) 0x421f8c WHILELO P1.D, X0, X8 |
(259) 0x421f90 INCW Z16.S, ALL |
(259) 0x421f94 WHILELO P2.D, X0, X26 |
(259) 0x421f98 B.NE 421de8 |
(261) 0x421f9c ADD X18, X18, #1 |
(261) 0x421fa0 CMP W19, W22 |
(261) 0x421fa4 B.LE 421fe4 |
(261) 0x421fa8 SUB W2, W15, W30 |
(261) 0x421fac ORR W26, WZR, W21 |
(261) 0x421fb0 ORR W8, WZR, W30 |
(261) 0x421fb4 ORR W24, WZR, W22 |
(261) 0x421fb8 CMP W2, W26 |
(261) 0x421fbc CSEL X26, X2, X26, LS |
(261) 0x421fc0 ADD W30, W8, W26 |
(261) 0x421fc4 ORR W3, WZR, W20 |
(261) 0x421fc8 ADD W22, W24, #1 |
(261) 0x421fcc CMP W8, W30 |
(261) 0x421fd0 B.CC 421d6c |
(262) 0x421fd4 ORR W30, WZR, W8 |
(262) 0x421fd8 ADD X18, X18, #1 |
(262) 0x421fdc CMP W19, W22 |
(262) 0x421fe0 B.GT 421fa8 |
(260) 0x421fe4 LDP D8, D9, [SP, #80] |
(260) 0x421fe8 LDP X21, X22, [SP, #32] |
(260) 0x421fec LDR D10, [SP, #96] |
(260) 0x421ff0 LDP X19, X20, [SP, #16] |
(260) 0x421ff4 LDP X23, X24, [SP, #48] |
(260) 0x421ff8 LDP X25, X26, [SP, #64] |
(260) 0x421ffc LDP X29, X30, [SP], #112 |
(260) 0x422000 RET |
(258) 0x422004 LDP X19, X20, [SP, #16] |
(258) 0x422008 LDP X21, X22, [SP, #32] |
(258) 0x42200c LDP X23, X24, [SP, #48] |
(258) 0x422010 LDP X25, X26, [SP, #64] |
(258) 0x422014 LDP X29, X30, [SP], #112 |
(258) 0x422018 RET |
(258) 0x42201c ADD W2, W2, #1 |
(258) 0x422020 MOVZ W1, #0 |
(258) 0x422024 B 421cf8 |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.42+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Source file and lines | advec_mom.cpp:180-211 |
Module | exec |
nb instructions | 29 |
loop length | 116 |
nb stack references | 0 |
front end | 3.63 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.17 | 2.83 | 3.00 | 2.50 | 2.50 |
cycles | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.17 | 2.83 | 3.00 | 2.50 | 2.50 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.63 |
Overall L1 | 4.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-112]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDR W19, [X0, #52] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X23, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD W19, W19, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W26, [X23, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W20, [X0, #40] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W0, [X0, #44] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
CMP W26, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 421ff0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x36c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W20, W20, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD W22, W0, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W25, W19, W26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W20, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 422004 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x380> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W21, W22, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MADD W25, W25, W21, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ORR W24, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W3, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
UDIV W2, W25, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
MSUB W1, W2, W24, W25 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 42201c <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x398> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Source file and lines | advec_mom.cpp:180-211 |
Module | exec |
nb instructions | 29 |
loop length | 116 |
nb stack references | 0 |
front end | 3.63 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.17 | 2.83 | 3.00 | 2.50 | 2.50 |
cycles | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.17 | 2.83 | 3.00 | 2.50 | 2.50 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.63 |
Overall L1 | 4.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-112]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDR W19, [X0, #52] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X23, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD W19, W19, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W26, [X23, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W20, [X0, #40] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W0, [X0, #44] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
CMP W26, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 421ff0 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x36c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W20, W20, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD W22, W0, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W25, W19, W26 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W20, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 422004 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x380> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W21, W22, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MADD W25, W25, W21, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ORR W24, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W3, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
UDIV W2, W25, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
MSUB W1, W2, W24, W25 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 42201c <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.10+0x398> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼advec_mom_kernel(int, int, int, int, clover::Buffer2D | 3.93 | 5.22 |
▼Loop 260 - advec_mom.cpp:180-211 - exec– | 0 | 0 |
▼Loop 261 - advec_mom.cpp:182-211 - exec– | 0.02 | 0.02 |
○Loop 259 - context.h:69-69 - exec | 3.92 | 5.19 |
○Loop 262 - advec_mom.cpp:182-182 - exec | 0 | 0.01 |
○Loop 258 - advec_mom.cpp:180-182 - exec | 0 | 0 |