Loop Id: 202 | Module: exec | Source: advec_cell.cpp:71-110 [...] | Coverage: 0.97% |
---|
Loop Id: 202 | Module: exec | Source: advec_cell.cpp:71-110 [...] | Coverage: 0.97% |
---|
(205) 0x41fd78 SBFM X0, X18, #0, #31 |
(205) 0x41fd7c LDR X6, [X26] |
(205) 0x41fd80 SUB W15, W14, #1 |
(205) 0x41fd84 ADD X5, X0, #1 |
(205) 0x41fd88 LDR X18, [X23] |
(205) 0x41fd8c ADD X12, X5, X15 |
(205) 0x41fd90 LDR X2, [X25] |
(205) 0x41fd94 MADD X8, X19, X6, XZR |
(205) 0x41fd98 LDR X16, [X23, #16] |
(205) 0x41fd9c MADD X17, X19, X18, XZR |
(205) 0x41fda0 LDR X4, [X25, #16] |
(205) 0x41fda4 MADD X3, X19, X2, XZR |
(205) 0x41fda8 LDR X7, [X26, #16] |
(205) 0x41fdac ADD X18, X16, X17,LSL #3 |
(205) 0x41fdb0 LDR X11, [SP, #120] |
(205) 0x41fdb4 ADD X17, X4, X3,LSL #3 |
(205) 0x41fdb8 LDR X1, [X24] |
(205) 0x41fdbc ADD X16, X7, X8,LSL #3 |
(205) 0x41fdc0 LDR X9, [X27] |
(205) 0x41fdc4 LDR X10, [X28] |
(205) 0x41fdc8 MADD X21, X19, X1, XZR |
(205) 0x41fdcc LDR X14, [X11, #8] |
(205) 0x41fdd0 MADD X6, X19, X9, XZR |
(205) 0x41fdd4 LDR X13, [X24, #16] |
(205) 0x41fdd8 MADD X7, X19, X10, XZR |
(205) 0x41fddc LDR X8, [X27, #16] |
(205) 0x41fde0 LDR X4, [X28, #16] |
(205) 0x41fde4 SUB W15, W0, #2 |
(205) 0x41fde8 SUB X2, X0, #1 |
(205) 0x41fdec LDR D6, [X17, X0,LSL #3] |
(205) 0x41fdf0 ORR X3, XZR, X0 |
(205) 0x41fdf4 SBFM X1, X15, #0, #31 |
(205) 0x41fdf8 ADD W9, W0, #1 |
(205) 0x41fdfc ORR X11, XZR, X2 |
(205) 0x41fe00 FCMPE D6, #0 |
(205) 0x41fe04 B.GT 41fe20 |
(206) 0x41fe08 CMP W20, W9 |
(206) 0x41fe0c CSEL W1, W20, W9, LE |
(206) 0x41fe10 SBFM X11, X1, #0, #31 |
(206) 0x41fe14 ORR X3, XZR, X2 |
(206) 0x41fe18 ORR X1, XZR, X11 |
(206) 0x41fe1c ORR X2, XZR, X0 |
(206) 0x41fe20 ADD X9, X21, X2 |
(206) 0x41fe24 FABS D3, D6 |
(206) 0x41fe28 ADD X10, X7, X2 |
(206) 0x41fe2c ADD X15, X7, X3 |
(206) 0x41fe30 LDR D1, [X14, X11,LSL #3] |
(206) 0x41fe34 ADD X11, X6, X1 |
(206) 0x41fe38 ADD X1, X7, X1 |
(206) 0x41fe3c ADD X2, X6, X2 |
(206) 0x41fe40 LDR D2, [X13, X9,LSL #3] |
(206) 0x41fe44 ADD X3, X6, X3 |
(206) 0x41fe48 LDR D5, [X14, X0,LSL #3] |
(206) 0x41fe4c LDR D0, [X4, X10,LSL #3] |
(206) 0x41fe50 FDIV D20, D3, D2 |
(206) 0x41fe54 LDR D19, [X4, X15,LSL #3] |
(206) 0x41fe58 LDR D4, [X4, X1,LSL #3] |
(206) 0x41fe5c FDIV D23, D5, D1 |
(206) 0x41fe60 FSUB D21, D19, D0 |
(206) 0x41fe64 FSUB D24, D0, D4 |
(206) 0x41fe68 FCMPE D21, #0 |
(206) 0x41fe6c FABS D25, D21 |
(206) 0x41fe70 FMUL D27, D21, D24 |
(206) 0x41fe74 FABS D26, D24 |
(206) 0x41fe78 FMINNM D22, D25, D26 |
(206) 0x41fe7c FADD D29, D20, D7 |
(206) 0x41fe80 FSUB D28, D7, D20 |
(206) 0x41fe84 FSUB D30, D18, D20 |
(206) 0x41fe88 FCSEL D31, D16, D7, LS |
(206) 0x41fe8c FCMPE D27, #0 |
(206) 0x41fe90 FMUL D5, D23, D29 |
(206) 0x41fe94 B.LS 41feb0 |
(206) 0x41fe98 FMUL D3, D25, D30 |
(206) 0x41fe9c FMADD D1, D26, D5, D3 |
(206) 0x41fea0 FMUL D2, D1, D17 |
(206) 0x41fea4 FMINNM D20, D2, D22 |
(206) 0x41fea8 FMUL D19, D20, D28 |
(206) 0x41feac FMADD D0, D19, D31, D0 |
(206) 0x41feb0 FMUL D24, D0, D6 |
(206) 0x41feb4 STR D24, [X16, X0,LSL #3] |
(206) 0x41feb8 FABS D6, D24 |
(206) 0x41febc LDR D4, [X8, X2,LSL #3] |
(206) 0x41fec0 LDR D21, [X8, X3,LSL #3] |
(206) 0x41fec4 LDR D25, [X8, X11,LSL #3] |
(206) 0x41fec8 LDR D26, [X4, X10,LSL #3] |
(206) 0x41fecc FMUL D23, D4, D24 |
(206) 0x41fed0 FSUB D27, D21, D4 |
(206) 0x41fed4 LDR D22, [X13, X9,LSL #3] |
(206) 0x41fed8 FSUB D28, D4, D25 |
(206) 0x41fedc FABS D31, D27 |
(206) 0x41fee0 FCMPE D27, #0 |
(206) 0x41fee4 FMUL D29, D22, D26 |
(206) 0x41fee8 FMUL D2, D27, D28 |
(206) 0x41feec FABS D20, D28 |
(206) 0x41fef0 FMINNM D19, D31, D20 |
(206) 0x41fef4 FMUL D30, D31, D30 |
(206) 0x41fef8 FCSEL D3, D16, D7, LS |
(206) 0x41fefc FCMPE D2, #0 |
(206) 0x41ff00 FMADD D5, D20, D5, D30 |
(206) 0x41ff04 B.LS 41ff54 |
(206) 0x41ff08 FDIV D0, D6, D29 |
(206) 0x41ff0c FMUL D1, D5, D17 |
(206) 0x41ff10 FMINNM D21, D1, D19 |
(206) 0x41ff14 FSUB D6, D7, D0 |
(206) 0x41ff18 FMUL D25, D6, D21 |
(206) 0x41ff1c FMADD D4, D25, D3, D4 |
(206) 0x41ff20 FMUL D24, D4, D24 |
(206) 0x41ff24 STR D24, [X18, X0,LSL #3] |
(206) 0x41ff28 ORR X0, XZR, X5 |
(206) 0x41ff2c CMP X5, X12 |
(206) 0x41ff30 B.EQ 41ff64 |
(206) 0x41ff34 ADD X5, X5, #1 |
(206) 0x41ff38 B 41fde4 |
0x41ff54 STR D23, [X18, X0,LSL #3] |
0x41ff58 ORR X0, XZR, X5 |
0x41ff5c CMP X5, X12 |
0x41ff60 B.NE 41ff34 |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_cell.cpp: 71 - 110 |
-------------------------------------------------------------------------------- |
71: if (vol_flux_x(i, j) > 0.0) { |
[...] |
82: sigmat = std::fabs(vol_flux_x(i, j)) / pre_vol(donor, j); |
83: sigma3 = (1.0 + sigmat) * (vertexdx[i] / vertexdx[dif]); |
84: sigma4 = 2.0 - sigmat; |
85: sigmav = sigmat; |
86: diffuw = density1(donor, j) - density1(upwind, j); |
87: diffdw = density1(downwind, j) - density1(donor, j); |
88: wind = 1.0; |
89: if (diffdw <= 0.0) wind = -1.0; |
90: if (diffuw * diffdw > 0.0) { |
91: limiter = (1.0 - sigmav) * wind * |
92: std::fmin(std::fmin(std::fabs(diffuw), std::fabs(diffdw)), |
93: one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); |
94: } else { |
95: limiter = 0.0; |
96: } |
97: mass_flux_x(i, j) = vol_flux_x(i, j) * (density1(donor, j) + limiter); |
98: sigmam = std::fabs(mass_flux_x(i, j)) / (density1(donor, j) * pre_vol(donor, j)); |
99: diffuw = energy1(donor, j) - energy1(upwind, j); |
100: diffdw = energy1(downwind, j) - energy1(donor, j); |
101: wind = 1.0; |
102: if (diffdw <= 0.0) wind = -1.0; |
103: if (diffuw * diffdw > 0.0) { |
104: limiter = (1.0 - sigmam) * wind * |
105: std::fmin(std::fmin(std::fabs(diffuw), std::fabs(diffdw)), |
106: one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); |
107: } else { |
108: limiter = 0.0; |
109: } |
110: ener_flux(i, j) = mass_flux_x(i, j) * (energy1(donor, j) + limiter); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.50+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.00 |
Bottlenecks | micro-operation queue, P0, P1, P2, P3, P4, P5, P6, P7, P10, P11, |
Function | advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D |
Source | advec_cell.cpp:110-110 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 0.50 |
CQA cycles if no scalar integer | 0.50 |
CQA cycles if FP arith vectorized | 0.50 |
CQA cycles if fully vectorized | 0.13 |
Front-end cycles | 0.50 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 0.50 |
P2 cycles | 0.50 |
P3 cycles | 0.50 |
P4 cycles | 0.50 |
P5 cycles | 0.50 |
P6 cycles | 0.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.50 |
P10 cycles | 0.50 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 4.00 |
Nb uops | 4.00 |
Nb loads | NA |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 0.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 25.00 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | 25.00 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.00 |
Bottlenecks | micro-operation queue, P0, P1, P2, P3, P4, P5, P6, P7, P10, P11, |
Function | advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D |
Source | advec_cell.cpp:110-110 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 0.50 |
CQA cycles if no scalar integer | 0.50 |
CQA cycles if FP arith vectorized | 0.50 |
CQA cycles if fully vectorized | 0.13 |
Front-end cycles | 0.50 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 0.50 |
P2 cycles | 0.50 |
P3 cycles | 0.50 |
P4 cycles | 0.50 |
P5 cycles | 0.50 |
P6 cycles | 0.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.50 |
P10 cycles | 0.50 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 4.00 |
Nb uops | 4.00 |
Nb loads | NA |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 0.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 25.00 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | 25.00 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Path / |
nb instructions | 4 |
loop length | 16 |
nb stack references | 0 |
front end | 0.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 0.50 |
Overall L1 | 0.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STR D23, [X18, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 |
ORR X0, XZR, X5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP X5, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.NE 41ff34 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x2b4> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
nb instructions | 4 |
loop length | 16 |
nb stack references | 0 |
front end | 0.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 0.50 |
Overall L1 | 0.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STR D23, [X18, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 |
ORR X0, XZR, X5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP X5, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.NE 41ff34 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.2+0x2b4> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |