Loop Id: 207 | Module: exec | Source: advec_cell.cpp:163-202 [...] | Coverage: 0.56% |
---|
Loop Id: 207 | Module: exec | Source: advec_cell.cpp:163-202 [...] | Coverage: 0.56% |
---|
(210) 0x4200c4 LDR X23, [X28] |
(210) 0x4200c8 SBFM X0, X18, #0, #31 |
(210) 0x4200cc SUB W22, W30, #1 |
(210) 0x4200d0 LDP X9, X8, [SP, #128] |
(210) 0x4200d4 ADD X5, X0, #1 |
(210) 0x4200d8 SBFM X30, X25, #0, #31 |
(210) 0x4200dc ADD X12, X5, X22 |
(210) 0x4200e0 LDR X1, [X27] |
(210) 0x4200e4 MADD X7, X13, X23, XZR |
(210) 0x4200e8 LDR X2, [X26] |
(210) 0x4200ec LDR X6, [X28, #16] |
(210) 0x4200f0 MADD X4, X13, X1, XZR |
(210) 0x4200f4 LDR X3, [X27, #16] |
(210) 0x4200f8 MADD X20, X13, X2, XZR |
(210) 0x4200fc LDR X10, [SP, #144] |
(210) 0x420100 ADD X17, X6, X7,LSL #3 |
(210) 0x420104 LDR X16, [SP, #152] |
(210) 0x420108 ADD X18, X3, X4,LSL #3 |
(210) 0x42010c LDR X21, [X8, #8] |
(210) 0x420110 LDR X15, [X26, #16] |
(210) 0x420114 LDR X8, [X10] |
(210) 0x420118 ADD X23, X21, X13,LSL #3 |
(210) 0x42011c LDR X22, [X9] |
(210) 0x420120 ADD X20, X15, X20,LSL #3 |
(210) 0x420124 LDR X7, [X16] |
(210) 0x420128 LDR X4, [X10, #16] |
(210) 0x42012c LDR X14, [X9, #16] |
(210) 0x420130 LDR X6, [X16, #16] |
(210) 0x420134 HINT #0 |
(210) 0x420138 HINT #0 |
(210) 0x42013c HINT #0 |
(210) 0x420140 ORR X1, XZR, X11 |
(210) 0x420144 ORR X2, XZR, X30 |
(210) 0x420148 LDR D6, [X18, X0,LSL #3] |
(210) 0x42014c ORR X3, XZR, X13 |
(210) 0x420150 ORR X10, XZR, X11 |
(210) 0x420154 FCMPE D6, #0 |
(210) 0x420158 B.GT 420174 |
(211) 0x42015c CMP W24, W19 |
(211) 0x420160 CSEL W2, W24, W19, LE |
(211) 0x420164 SBFM X10, X2, #0, #31 |
(211) 0x420168 ORR X3, XZR, X11 |
(211) 0x42016c ORR X1, XZR, X13 |
(211) 0x420170 ORR X2, XZR, X10 |
(211) 0x420174 MADD X9, X22, X1, X0 |
(211) 0x420178 FABS D2, D6 |
(211) 0x42017c LDR D4, [X21, X10,LSL #3] |
(211) 0x420180 MADD X16, X8, X3, X0 |
(211) 0x420184 MADD X10, X8, X1, X0 |
(211) 0x420188 LDR D5, [X23] |
(211) 0x42018c MADD X15, X8, X2, X0 |
(211) 0x420190 LDR D1, [X14, X9,LSL #3] |
(211) 0x420194 MADD X1, X7, X1, X0 |
(211) 0x420198 MADD X3, X7, X3, X0 |
(211) 0x42019c LDR D0, [X4, X10,LSL #3] |
(211) 0x4201a0 MADD X2, X7, X2, X0 |
(211) 0x4201a4 LDR D3, [X4, X16,LSL #3] |
(211) 0x4201a8 FDIV D26, D5, D4 |
(211) 0x4201ac LDR D19, [X4, X15,LSL #3] |
(211) 0x4201b0 FDIV D23, D2, D1 |
(211) 0x4201b4 FSUB D24, D3, D0 |
(211) 0x4201b8 FSUB D27, D0, D19 |
(211) 0x4201bc FCMPE D24, #0 |
(211) 0x4201c0 FABS D25, D24 |
(211) 0x4201c4 FMUL D29, D24, D27 |
(211) 0x4201c8 FABS D28, D27 |
(211) 0x4201cc FMINNM D22, D25, D28 |
(211) 0x4201d0 FCSEL D20, D16, D7, LS |
(211) 0x4201d4 FCMPE D29, #0 |
(211) 0x4201d8 FADD D30, D23, D7 |
(211) 0x4201dc FSUB D21, D7, D23 |
(211) 0x4201e0 FSUB D31, D18, D23 |
(211) 0x4201e4 FMUL D24, D26, D30 |
(211) 0x4201e8 B.LS 420204 |
(211) 0x4201ec FMUL D2, D25, D31 |
(211) 0x4201f0 FMADD D4, D28, D24, D2 |
(211) 0x4201f4 FMUL D1, D4, D17 |
(211) 0x4201f8 FMINNM D3, D1, D22 |
(211) 0x4201fc FMUL D23, D3, D21 |
(211) 0x420200 FMADD D0, D23, D20, D0 |
(211) 0x420204 FMUL D25, D0, D6 |
(211) 0x420208 STR D25, [X17, X0,LSL #3] |
(211) 0x42020c FABS D6, D25 |
(211) 0x420210 LDR D5, [X6, X1,LSL #3] |
(211) 0x420214 LDR D21, [X6, X3,LSL #3] |
(211) 0x420218 LDR D26, [X6, X2,LSL #3] |
(211) 0x42021c LDR D20, [X4, X10,LSL #3] |
(211) 0x420220 FMUL D27, D5, D25 |
(211) 0x420224 FSUB D28, D21, D5 |
(211) 0x420228 LDR D22, [X14, X9,LSL #3] |
(211) 0x42022c FSUB D19, D5, D26 |
(211) 0x420230 FABS D30, D28 |
(211) 0x420234 FCMPE D28, #0 |
(211) 0x420238 FMUL D29, D22, D20 |
(211) 0x42023c FMUL D2, D28, D19 |
(211) 0x420240 FABS D4, D19 |
(211) 0x420244 FMINNM D23, D30, D4 |
(211) 0x420248 FMUL D31, D30, D31 |
(211) 0x42024c FCSEL D3, D16, D7, LS |
(211) 0x420250 FCMPE D2, #0 |
(211) 0x420254 FMADD D24, D4, D24, D31 |
(211) 0x420258 B.LS 4202a4 |
(211) 0x42025c FDIV D0, D6, D29 |
(211) 0x420260 FMUL D1, D24, D17 |
(211) 0x420264 FMINNM D21, D1, D23 |
(211) 0x420268 FSUB D6, D7, D0 |
(211) 0x42026c FMUL D26, D6, D21 |
(211) 0x420270 FMADD D5, D26, D3, D5 |
(211) 0x420274 FMUL D25, D5, D25 |
(211) 0x420278 STR D25, [X20, X0,LSL #3] |
(211) 0x42027c ORR X0, XZR, X5 |
(211) 0x420280 CMP X12, X5 |
(211) 0x420284 B.EQ 4202b4 |
(211) 0x420288 ADD X5, X5, #1 |
(211) 0x42028c B 420140 |
0x4202a4 STR D27, [X20, X0,LSL #3] |
0x4202a8 ORR X0, XZR, X5 |
0x4202ac CMP X12, X5 |
0x4202b0 B.NE 420288 |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_cell.cpp: 163 - 202 |
-------------------------------------------------------------------------------- |
163: if (vol_flux_y(i, j) > 0.0) { |
[...] |
174: sigmat = std::fabs(vol_flux_y(i, j)) / pre_vol(i, donor); |
175: sigma3 = (1.0 + sigmat) * (vertexdy[j] / vertexdy[dif]); |
176: sigma4 = 2.0 - sigmat; |
177: sigmav = sigmat; |
178: diffuw = density1(i, donor) - density1(i, upwind); |
179: diffdw = density1(i, downwind) - density1(i, donor); |
180: wind = 1.0; |
181: if (diffdw <= 0.0) wind = -1.0; |
182: if (diffuw * diffdw > 0.0) { |
183: limiter = (1.0 - sigmav) * wind * |
184: std::fmin(std::fmin(std::fabs(diffuw), std::fabs(diffdw)), |
185: one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); |
186: } else { |
187: limiter = 0.0; |
188: } |
189: mass_flux_y(i, j) = vol_flux_y(i, j) * (density1(i, donor) + limiter); |
190: sigmam = std::fabs(mass_flux_y(i, j)) / (density1(i, donor) * pre_vol(i, donor)); |
191: diffuw = energy1(i, donor) - energy1(i, upwind); |
192: diffdw = energy1(i, downwind) - energy1(i, donor); |
193: wind = 1.0; |
194: if (diffdw <= 0.0) wind = -1.0; |
195: if (diffuw * diffdw > 0.0) { |
196: limiter = (1.0 - sigmam) * wind * |
197: std::fmin(std::fmin(std::fabs(diffuw), std::fabs(diffdw)), |
198: one_by_six * (sigma3 * std::fabs(diffuw) + sigma4 * std::fabs(diffdw))); |
199: } else { |
200: limiter = 0.0; |
201: } |
202: ener_flux(i, j) = mass_flux_y(i, j) * (energy1(i, donor) + limiter); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.48+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.00 |
Bottlenecks | micro-operation queue, P0, P1, P2, P3, P4, P5, P6, P7, P10, P11, |
Function | advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D |
Source | advec_cell.cpp:202-202 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 0.50 |
CQA cycles if no scalar integer | 0.50 |
CQA cycles if FP arith vectorized | 0.50 |
CQA cycles if fully vectorized | 0.13 |
Front-end cycles | 0.50 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 0.50 |
P2 cycles | 0.50 |
P3 cycles | 0.50 |
P4 cycles | 0.50 |
P5 cycles | 0.50 |
P6 cycles | 0.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.50 |
P10 cycles | 0.50 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 4.00 |
Nb uops | 4.00 |
Nb loads | NA |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 0.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 25.00 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | 25.00 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.00 |
CQA speedup if FP arith vectorized | 1.00 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.00 |
Bottlenecks | micro-operation queue, P0, P1, P2, P3, P4, P5, P6, P7, P10, P11, |
Function | advec_cell_kernel(int, int, int, int, int, int, clover::Buffer1D |
Source | advec_cell.cpp:202-202 |
Source loop unroll info | NA |
Source loop unroll confidence level | NA |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 0.50 |
CQA cycles if no scalar integer | 0.50 |
CQA cycles if FP arith vectorized | 0.50 |
CQA cycles if fully vectorized | 0.13 |
Front-end cycles | 0.50 |
DIV/SQRT cycles | 0.50 |
P0 cycles | 0.50 |
P1 cycles | 0.50 |
P2 cycles | 0.50 |
P3 cycles | 0.50 |
P4 cycles | 0.50 |
P5 cycles | 0.50 |
P6 cycles | 0.50 |
P7 cycles | 0.00 |
P8 cycles | 0.00 |
P9 cycles | 0.50 |
P10 cycles | 0.50 |
P11 cycles | 0.00 |
P12 cycles | 0.00 |
P13 cycles | 0.00 |
P14 cycles | 0.00 |
Inter-iter dependencies cycles | NA |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 4.00 |
Nb uops | 4.00 |
Nb loads | NA |
Nb stores | 1.00 |
Nb stack references | 0.00 |
FLOP/cycle | 0.00 |
Nb FLOP add-sub | 0.00 |
Nb FLOP mul | 0.00 |
Nb FLOP fma | 0.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 16.00 |
Bytes prefetched | 0.00 |
Bytes loaded | 0.00 |
Bytes stored | 8.00 |
Stride 0 | NA |
Stride 1 | NA |
Stride n | NA |
Stride unknown | NA |
Stride indirect | NA |
Vectorization ratio all | 0.00 |
Vectorization ratio load | NA |
Vectorization ratio store | 0.00 |
Vectorization ratio mul | NA |
Vectorization ratio add_sub | NA |
Vectorization ratio fma | NA |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | NA |
Vector-efficiency ratio all | 25.00 |
Vector-efficiency ratio load | NA |
Vector-efficiency ratio store | 25.00 |
Vector-efficiency ratio mul | NA |
Vector-efficiency ratio add_sub | NA |
Vector-efficiency ratio fma | NA |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | NA |
Path / |
nb instructions | 4 |
loop length | 16 |
nb stack references | 0 |
front end | 0.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 0.50 |
Overall L1 | 0.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STR D27, [X20, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 |
ORR X0, XZR, X5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP X12, X5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.NE 420288 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.6+0x2d8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
nb instructions | 4 |
loop length | 16 |
nb stack references | 0 |
front end | 0.50 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
cycles | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 | 0.00 | 0.00 | 0.50 | 0.50 | 0.00 | 0.00 | 0.00 |
Cycles executing div or sqrt instructions | NA |
Front-end | 0.50 |
Overall L1 | 0.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | 0% |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STR D27, [X20, X0,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 2 | 0.50 |
ORR X0, XZR, X5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP X12, X5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.NE 420288 <_Z17advec_cell_kerneliiiiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_S5_._omp_fn.6+0x2d8> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |