Function: advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D<double>&, clover::Buffer1 ... | Module: exec | Source: advec_cell.cpp:65-110 [...] | Coverage: 3.02% |
---|
Function: advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D<double>&, clover::Buffer1 ... | Module: exec | Source: advec_cell.cpp:65-110 [...] | Coverage: 3.02% |
---|
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_cell.cpp: 65 - 110 |
-------------------------------------------------------------------------------- |
65: #pragma omp parallel for simd collapse(2) |
66: for (int j = (y_min + 1); j < (y_max + 2); j++) { |
67: for (int i = (x_min + 1); i < (x_max + 2 + 2); i++) |
68: ({ |
69: int upwind, donor, downwind, dif; |
70: double sigmat, sigma3, sigma4, sigmav, sigmam, diffuw, diffdw, limiter, wind; |
71: if (vol_flux_x(i, j) > 0.0) { |
[...] |
77: upwind = std::min(i + 1, x_max + 2); |
78: donor = i; |
79: downwind = i - 1; |
80: dif = upwind; |
81: } |
82: sigmat = std::fabs(vol_flux_x(i, j)) / pre_vol(donor, j); |
83: sigma3 = (1.0 + sigmat) * (vertexdx[i] / vertexdx[dif]); |
84: sigma4 = 2.0 - sigmat; |
85: sigmav = sigmat; |
86: diffuw = density1(donor, j) - density1(upwind, j); |
87: diffdw = density1(downwind, j) - density1(donor, j); |
88: wind = 1.0; |
89: if (diffdw <= 0.0) wind = -1.0; |
90: if (diffuw * diffdw > 0.0) { |
91: limiter = (1.0 - sigmav) * wind * |
92: std::fmin(std::fmin(std::fabs(diffuw), std::fabs(diffdw)), |
93: one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); |
94: } else { |
95: limiter = 0.0; |
96: } |
97: mass_flux_x(i, j) = vol_flux_x(i, j) * (density1(donor, j) + limiter); |
98: sigmam = std::fabs(mass_flux_x(i, j)) / (density1(donor, j) * pre_vol(donor, j)); |
99: diffuw = energy1(donor, j) - energy1(upwind, j); |
100: diffdw = energy1(downwind, j) - energy1(donor, j); |
101: wind = 1.0; |
102: if (diffdw <= 0.0) wind = -1.0; |
103: if (diffuw * diffdw > 0.0) { |
104: limiter = (1.0 - sigmam) * wind * |
105: std::fmin(std::fmin(std::fabs(diffuw), std::fabs(diffdw)), |
106: one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); |
107: } else { |
108: limiter = 0.0; |
109: } |
110: ener_flux(i, j) = mass_flux_x(i, j) * (energy1(donor, j) + limiter); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x41fc80 STP X29, X30, [SP, #-128]!
0x41fc84 ADD X29, SP, #0 |
0x41fc88 STP X21, X22, [SP, #32] |
0x41fc8c STP X23, X24, [SP, #48] |
0x41fc90 ORR X23, XZR, X0 |
0x41fc94 LDP W24, W22, [X0, #64] |
0x41fc98 STP X19, X20, [SP, #16] |
0x41fc9c LDR W0, [X0, #56] |
0x41fca0 ADD W24, W24, #1 |
0x41fca4 ADD W22, W22, #2 |
0x41fca8 LDR W20, [X23, #60] |
0x41fcac ADD W1, W0, #1 |
0x41fcb0 STR W1, [SP, #108] |
0x41fcb4 CMP W24, W22 |
0x41fcb8 B.GE 41ff40 |
0x41fcbc ADD W21, W20, #4 |
0x41fcc0 STP X25, X26, [SP, #64] |
0x41fcc4 SUB W25, W22, W24 |
0x41fcc8 CMP W1, W21 |
0x41fccc B.GE 41ff3c |
0x41fcd0 SUB W2, W21, W1 |
0x41fcd4 MADD W26, W25, W2, WZR |
0x41fcd8 STR W2, [SP, #112] |
0x41fcdc BL 403530 |
0x41fce0 ORR W19, WZR, W0 |
0x41fce4 BL 4033c0 |
0x41fce8 UDIV W3, W26, W19 |
0x41fcec ORR W4, WZR, W0 |
0x41fcf0 MSUB W5, W3, W19, W26 |
0x41fcf4 CMP W0, W5 |
0x41fcf8 B.CC 41ffa4 |
(204) 0x41fcfc MADD W5, W3, W4, W5 |
(204) 0x41fd00 ADD W6, W3, W5 |
(204) 0x41fd04 STR W6, [SP, #116] |
(204) 0x41fd08 CMP W5, W6 |
(204) 0x41fd0c B.CS 41ff3c |
(204) 0x41fd10 LDR W8, [SP, #112] |
(204) 0x41fd14 ORR X7, XZR, #3840 |
(204) 0x41fd18 FMOV D7, #1.0000000 |
(204) 0x41fd1c MOVK X7, #16325 |
(204) 0x41fd20 STP X27, X28, [SP, #80] |
(204) 0x41fd24 FMOV D18, #2.0000000 |
(204) 0x41fd28 FMOV D16, #-1.0000000 |
(204) 0x41fd2c ADD W20, W20, #2 |
(204) 0x41fd30 FMOV D17, X7 |
(204) 0x41fd34 UDIV W9, W5, W8 |
(204) 0x41fd38 LDR W12, [SP, #108] |
(204) 0x41fd3c LDP X13, X28, [X23] |
(204) 0x41fd40 LDP X27, X26, [X23, #16] |
(204) 0x41fd44 STR X13, [SP, #120] |
(204) 0x41fd48 MSUB W10, W9, W8, W5 |
(204) 0x41fd4c ADD W11, W9, W24 |
(204) 0x41fd50 LDP X25, X24, [X23, #32] |
(204) 0x41fd54 SBFM X19, X11, #0, #31 |
(204) 0x41fd58 ADD W18, W10, W12 |
(204) 0x41fd5c LDR X23, [X23, #48] |
(204) 0x41fd60 SUB W21, W21, W18 |
(204) 0x41fd64 CMP W3, W21 |
(204) 0x41fd68 CSEL W14, W3, W21, #9 |
(204) 0x41fd6c ADD W30, W5, W14 |
(204) 0x41fd70 CMP W5, W30 |
(204) 0x41fd74 B.CS 41ff68 |
(205) 0x41fd78 SBFM X0, X18, #0, #31 |
(205) 0x41fd7c LDR X6, [X26] |
(205) 0x41fd80 SUB W15, W14, #1 |
(205) 0x41fd84 ADD X5, X0, #1 |
(205) 0x41fd88 LDR X18, [X23] |
(205) 0x41fd8c ADD X12, X5, X15 |
(205) 0x41fd90 LDR X2, [X25] |
(205) 0x41fd94 MADD X8, X19, X6, XZR |
(205) 0x41fd98 LDR X16, [X23, #16] |
(205) 0x41fd9c MADD X17, X19, X18, XZR |
(205) 0x41fda0 LDR X4, [X25, #16] |
(205) 0x41fda4 MADD X3, X19, X2, XZR |
(205) 0x41fda8 LDR X7, [X26, #16] |
(205) 0x41fdac ADD X18, X16, X17,LSL #3 |
(205) 0x41fdb0 LDR X11, [SP, #120] |
(205) 0x41fdb4 ADD X17, X4, X3,LSL #3 |
(205) 0x41fdb8 LDR X1, [X24] |
(205) 0x41fdbc ADD X16, X7, X8,LSL #3 |
(205) 0x41fdc0 LDR X9, [X27] |
(205) 0x41fdc4 LDR X10, [X28] |
(205) 0x41fdc8 MADD X21, X19, X1, XZR |
(205) 0x41fdcc LDR X14, [X11, #8] |
(205) 0x41fdd0 MADD X6, X19, X9, XZR |
(205) 0x41fdd4 LDR X13, [X24, #16] |
(205) 0x41fdd8 MADD X7, X19, X10, XZR |
(205) 0x41fddc LDR X8, [X27, #16] |
(205) 0x41fde0 LDR X4, [X28, #16] |
(205) 0x41fde4 SUB W15, W0, #2 |
(205) 0x41fde8 SUB X2, X0, #1 |
(205) 0x41fdec LDR D6, [X17, X0,LSL #3] |
(205) 0x41fdf0 ORR X3, XZR, X0 |
(205) 0x41fdf4 SBFM X1, X15, #0, #31 |
(205) 0x41fdf8 ADD W9, W0, #1 |
(205) 0x41fdfc ORR X11, XZR, X2 |
(205) 0x41fe00 FCMPE D6, #0 |
(205) 0x41fe04 B.GT 41fe20 |
(206) 0x41fe08 CMP W20, W9 |
(206) 0x41fe0c CSEL W1, W20, W9, #13 |
(206) 0x41fe10 SBFM X11, X1, #0, #31 |
(206) 0x41fe14 ORR X3, XZR, X2 |
(206) 0x41fe18 ORR X1, XZR, X11 |
(206) 0x41fe1c ORR X2, XZR, X0 |
(206) 0x41fe20 ADD X9, X21, X2 |
(206) 0x41fe24 FABS D3, D6 |
(206) 0x41fe28 ADD X10, X7, X2 |
(206) 0x41fe2c ADD X15, X7, X3 |
(206) 0x41fe30 LDR D1, [X14, X11,LSL #3] |
(206) 0x41fe34 ADD X11, X6, X1 |
(206) 0x41fe38 ADD X1, X7, X1 |
(206) 0x41fe3c ADD X2, X6, X2 |
(206) 0x41fe40 LDR D2, [X13, X9,LSL #3] |
(206) 0x41fe44 ADD X3, X6, X3 |
(206) 0x41fe48 LDR D5, [X14, X0,LSL #3] |
(206) 0x41fe4c LDR D0, [X4, X10,LSL #3] |
(206) 0x41fe50 FDIV D20, D3, D2 |
(206) 0x41fe54 LDR D19, [X4, X15,LSL #3] |
(206) 0x41fe58 LDR D4, [X4, X1,LSL #3] |
(206) 0x41fe5c FDIV D23, D5, D1 |
(206) 0x41fe60 FSUB D21, D19, D0
(206) 0x41fe64 FSUB D24, D0, D4
(206) 0x41fe68 FCMPE D21, #0 |
(206) 0x41fe6c FABS D25, D21 |
(206) 0x41fe70 FMUL D27, D21, D24 |
(206) 0x41fe74 FABS D26, D24 |
(206) 0x41fe78 FMINNM D22, D25, D26 |
(206) 0x41fe7c FADD D29, D20, D7 |
(206) 0x41fe80 FSUB D28, D7, D20
(206) 0x41fe84 FSUB D30, D18, D20
(206) 0x41fe88 FCSEL D31, D16, D7, #9 |
(206) 0x41fe8c FCMPE D27, #0 |
(206) 0x41fe90 FMUL D5, D23, D29 |
(206) 0x41fe94 B.LS 41feb0 |
(206) 0x41fe98 FMUL D3, D25, D30 |
(206) 0x41fe9c FMADD D1, D26, D5, D3 |
(206) 0x41fea0 FMUL D2, D1, D17 |
(206) 0x41fea4 FMINNM D20, D2, D22 |
(206) 0x41fea8 FMUL D19, D20, D28 |
(206) 0x41feac FMADD D0, D19, D31, D0 |
(206) 0x41feb0 FMUL D24, D0, D6 |
(206) 0x41feb4 STR D24, [X16, X0,LSL #3] |
(206) 0x41feb8 FABS D6, D24 |
(206) 0x41febc LDR D4, [X8, X2,LSL #3] |
(206) 0x41fec0 LDR D21, [X8, X3,LSL #3] |
(206) 0x41fec4 LDR D25, [X8, X11,LSL #3] |
(206) 0x41fec8 LDR D26, [X4, X10,LSL #3] |
(206) 0x41fecc FMUL D23, D4, D24 |
(206) 0x41fed0 FSUB D27, D21, D4
(206) 0x41fed4 LDR D22, [X13, X9,LSL #3] |
(206) 0x41fed8 FSUB D28, D4, D25
(206) 0x41fedc FABS D31, D27 |
(206) 0x41fee0 FCMPE D27, #0 |
(206) 0x41fee4 FMUL D29, D22, D26 |
(206) 0x41fee8 FMUL D2, D27, D28 |
(206) 0x41feec FABS D20, D28 |
(206) 0x41fef0 FMINNM D19, D31, D20 |
(206) 0x41fef4 FMUL D30, D31, D30 |
(206) 0x41fef8 FCSEL D3, D16, D7, #9 |
(206) 0x41fefc FCMPE D2, #0 |
(206) 0x41ff00 FMADD D5, D20, D5, D30 |
(206) 0x41ff04 B.LS 41ff54 |
(206) 0x41ff08 FDIV D0, D6, D29 |
(206) 0x41ff0c FMUL D1, D5, D17 |
(206) 0x41ff10 FMINNM D21, D1, D19 |
(206) 0x41ff14 FSUB D6, D7, D0
(206) 0x41ff18 FMUL D25, D6, D21 |
(206) 0x41ff1c FMADD D4, D25, D3, D4 |
(206) 0x41ff20 FMUL D24, D4, D24 |
(206) 0x41ff24 STR D24, [X18, X0,LSL #3] |
(206) 0x41ff28 ORR X0, XZR, X5 |
(206) 0x41ff2c CMP X5, X12 |
(206) 0x41ff30 B.EQ 41ff64 |
(206) 0x41ff34 ADD X5, X5, #1 |
(206) 0x41ff38 B 41fde4 |
(203) 0x41ff3c LDP X25, X26, [SP, #64] |
(203) 0x41ff40 LDP X19, X20, [SP, #16] |
(203) 0x41ff44 LDP X21, X22, [SP, #32] |
(203) 0x41ff48 LDP X23, X24, [SP, #48] |
(203) 0x41ff4c LDP X29, X30, [SP], #128 |
(203) 0x41ff50 RET |
(202) 0x41ff54 STR D23, [X18, X0,LSL #3] |
(202) 0x41ff58 ORR X0, XZR, X5 |
(202) 0x41ff5c CMP X5, X12 |
(202) 0x41ff60 B.NE 41ff34 |
(203) 0x41ff64 ORR W5, WZR, W30 |
(204) 0x41ff68 ADD X19, X19, #1 |
(204) 0x41ff6c ADD W30, W19, #0 |
(204) 0x41ff70 CMP W22, W30 |
(204) 0x41ff74 B.LE 41ff88 |
(204) 0x41ff78 LDP W21, W12, [SP, #112] |
(204) 0x41ff7c LDR W18, [SP, #108] |
(204) 0x41ff80 SUB W3, W12, W5 |
(204) 0x41ff84 B 41fd64 |
(204) 0x41ff88 LDP X19, X20, [SP, #16] |
(204) 0x41ff8c LDP X21, X22, [SP, #32] |
(204) 0x41ff90 LDP X23, X24, [SP, #48] |
(204) 0x41ff94 LDP X25, X26, [SP, #64] |
(204) 0x41ff98 LDP X27, X28, [SP, #80] |
(204) 0x41ff9c LDP X29, X30, [SP], #128 |
(204) 0x41ffa0 RET |
(204) 0x41ffa4 ADD W3, W3, #1 |
(204) 0x41ffa8 MOVZ W5, #0 |
(204) 0x41ffac B 41fcfc |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.42+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Source file and lines | advec_cell.cpp:65-110 |
Module | exec |
nb instructions | 31 |
loop length | 124 |
nb stack references | 0 |
front end | 3.88 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.50 | 3.00 | 3.50 | 3.50 |
cycles | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.50 | 3.00 | 3.50 | 3.50 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.88 |
Overall L1 | 4.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-128]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X23, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP W24, W22, [X0, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDR W0, [X0, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W24, W24, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W22, W22, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W20, [X23, #60] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W1, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STR W1, [SP, #108] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
CMP W24, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 41ff40 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x2c0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W21, W20, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
SUB W25, W22, W24 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W1, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 41ff3c <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x2bc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W2, W21, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W26, W25, W2, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
STR W2, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W19, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W3, W26, W19 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W4, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W5, W3, W19, W26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 41ffa4 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x324> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Source file and lines | advec_cell.cpp:65-110 |
Module | exec |
nb instructions | 31 |
loop length | 124 |
nb stack references | 0 |
front end | 3.88 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.50 | 3.00 | 3.50 | 3.50 |
cycles | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.50 | 3.50 | 3.00 | 3.50 | 3.50 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.88 |
Overall L1 | 4.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-128]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X23, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP W24, W22, [X0, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDR W0, [X0, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W24, W24, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W22, W22, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W20, [X23, #60] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W1, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STR W1, [SP, #108] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
CMP W24, W22 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 41ff40 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x2c0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W21, W20, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
SUB W25, W22, W24 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W1, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 41ff3c <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x2bc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W2, W21, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W26, W25, W2, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
STR W2, [SP, #112] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W19, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W3, W26, W19 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W4, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W5, W3, W19, W26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 41ffa4 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x324> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D | 3.02 | 4 |
▼Loop 203 - advec_cell.cpp:65-110 - exec– | 0 | 0 |
▼Loop 202 - advec_cell.cpp:71-110 - exec– | 0.97 | 1.28 |
▼Loop 206 - advec_cell.cpp:71-110 - exec– | 1.77 | 2.34 |
○Loop 205 - advec_cell.cpp:71-110 - exec | 0.27 | 0.36 |
○Loop 204 - advec_cell.cpp:65-110 - exec | 0.01 | 0.01 |