Function: viscosity_kernel(int, int, int, int, clover::Buffer1D<double>&, clover::Buffer1D<double>&, ... | Module: exec | Source: viscosity.cpp:36-64 [...] | Coverage: 2.52% |
---|
Function: viscosity_kernel(int, int, int, int, clover::Buffer1D<double>&, clover::Buffer1D<double>&, ... | Module: exec | Source: viscosity.cpp:36-64 [...] | Coverage: 2.52% |
---|
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/viscosity.cpp: 36 - 64 |
-------------------------------------------------------------------------------- |
36: #pragma omp parallel for simd collapse(2) |
37: for (int j = (y_min + 1); j < (y_max + 2); j++) { |
38: for (int i = (x_min + 1); i < (x_max + 2); i++) { |
39: double ugrad = (xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1)) - (xvel0(i, j) + xvel0(i + 0, j + 1)); |
40: double vgrad = (yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1)) - (yvel0(i, j) + yvel0(i + 1, j + 0)); |
41: double div = (celldx[i] * (ugrad) + celldy[j] * (vgrad)); |
42: double strain2 = 0.5 * (xvel0(i + 0, j + 1) + xvel0(i + 1, j + 1) - xvel0(i, j) - xvel0(i + 1, j + 0)) / celldy[j] + |
43: 0.5 * (yvel0(i + 1, j + 0) + yvel0(i + 1, j + 1) - yvel0(i, j) - yvel0(i + 0, j + 1)) / celldx[i]; |
44: double pgradx = (pressure(i + 1, j + 0) - pressure(i - 1, j + 0)) / (celldx[i] + celldx[i + 1]); |
45: double pgrady = (pressure(i + 0, j + 1) - pressure(i + 0, j - 1)) / (celldy[j] + celldy[j + 2]); |
46: double pgradx2 = pgradx * pgradx; |
47: double pgrady2 = pgrady * pgrady; |
48: double limiter = ((0.5 * (ugrad) / celldx[i]) * pgradx2 + (0.5 * (vgrad) / celldy[j]) * pgrady2 + strain2 * pgradx * pgrady) / |
49: std::fmax(pgradx2 + pgrady2, g_small); |
50: if ((limiter > 0.0) || (div >= 0.0)) { |
51: viscosity(i, j) = 0.0; |
52: } else { |
53: double dirx = 1.0; |
54: if (pgradx < 0.0) dirx = -1.0; |
55: pgradx = dirx * std::fmax(g_small, std::fabs(pgradx)); |
56: double diry = 1.0; |
57: if (pgradx < 0.0) diry = -1.0; |
58: pgrady = diry * std::fmax(g_small, std::fabs(pgrady)); |
59: double pgrad = std::sqrt(pgradx * pgradx + pgrady * pgrady); |
60: double xgrad = std::fabs(celldx[i] * pgrad / pgradx); |
61: double ygrad = std::fabs(celldy[j] * pgrad / pgrady); |
62: double grad = std::fmin(xgrad, ygrad); |
63: double grad2 = grad * grad; |
64: viscosity(i, j) = 2.0 * density0(i, j) * grad2 * limiter * limiter; |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x449200 STP X29, X30, [SP, #-112]!
0x449204 ADD X29, SP, #0 |
0x449208 STP X19, X20, [SP, #16] |
0x44920c STP X25, X26, [SP, #64] |
0x449210 LDP W25, W20, [X0, #64] |
0x449214 STP X21, X22, [SP, #32] |
0x449218 ORR X22, XZR, X0 |
0x44921c LDR W21, [X0, #56] |
0x449220 ADD W25, W25, #1 |
0x449224 ADD W20, W20, #2 |
0x449228 LDR W0, [X0, #60] |
0x44922c CMP W25, W20 |
0x449230 B.GE 4494fc |
0x449234 ADD W21, W21, #1 |
0x449238 ADD W19, W0, #2 |
0x44923c SUB W26, W20, W25 |
0x449240 CMP W21, W19 |
0x449244 B.GE 4494fc |
0x449248 STP X23, X24, [SP, #48] |
0x44924c SUB W24, W19, W21 |
0x449250 MADD W26, W26, W24, WZR |
0x449254 BL 403530 |
0x449258 ORR W23, WZR, W0 |
0x44925c BL 4033c0 |
0x449260 UDIV W2, W26, W23 |
0x449264 ORR W3, WZR, W0 |
0x449268 MSUB W1, W2, W23, W26 |
0x44926c CMP W0, W1 |
0x449270 B.CC 449558 |
0x449274 MADD W3, W2, W3, W1 |
0x449278 ADD W4, W2, W3 |
0x44927c STR W4, [SP, #100] |
0x449280 CMP W3, W4 |
0x449284 B.CS 449510 |
0x449288 UDIV W9, W3, W24 |
0x44928c ADRP X5, <44a28c>
0x449290 STP X27, X28, [SP, #80] |
0x449294 FMOV D6, #0.5000000 |
0x449298 MOVI D7, #0 |
0x44929c LDR D5, [X5, #256] |
0x4492a0 LDP X7, X28, [X22] |
0x4492a4 LDP X15, X27, [X22, #16] |
0x4492a8 MSUB W6, W9, W24, W3 |
0x4492ac ADD W4, W9, W25 |
0x4492b0 LDP X26, X25, [X22, #32] |
0x4492b4 SBFM X13, X4, #0, #31 |
0x4492b8 ADD W30, W6, W21 |
0x4492bc STR X7, [SP, #104] |
0x4492c0 LDR X23, [X22, #48] |
0x4492c4 SUB W16, W19, W30 |
(867) 0x4492c8 CMP W2, W16 |
(867) 0x4492cc CSEL W8, W2, W16, #9 |
(867) 0x4492d0 ADD W16, W3, W8 |
(867) 0x4492d4 ADD W17, W4, #1 |
(867) 0x4492d8 CMP W3, W16 |
(867) 0x4492dc B.CS 449528 |
(867) 0x4492e0 LDR X22, [X23] |
(867) 0x4492e4 ADD W14, W30, #1 |
(867) 0x4492e8 SUB W10, W4, #1 |
(867) 0x4492ec SBFM X1, X14, #0, #31 |
(867) 0x4492f0 ADD W9, W4, #2 |
(867) 0x4492f4 LDR X14, [X26] |
(867) 0x4492f8 SBFM X12, X10, #0, #31 |
(867) 0x4492fc SBFM X0, X30, #0, #31 |
(867) 0x449300 SUB W11, W8, #1 |
(867) 0x449304 ADD X3, X0, #1 |
(867) 0x449308 LDR X19, [X23, #16] |
(867) 0x44930c ADD X8, X11, X3 |
(867) 0x449310 ADD X30, X13, #1 |
(867) 0x449314 MADD X18, X13, X22, X22 |
(867) 0x449318 LDR X5, [X25] |
(867) 0x44931c SUB X2, X18, X22 |
(867) 0x449320 LDR X7, [X25, #16] |
(867) 0x449324 ADD X4, X19, X2,LSL #3 |
(867) 0x449328 ADD X6, X19, X18,LSL #3 |
(867) 0x44932c LDR X22, [X26, #16] |
(867) 0x449330 MADD X19, X13, X14, XZR |
(867) 0x449334 LDR X14, [X27] |
(867) 0x449338 MADD X10, X5, X13, XZR |
(867) 0x44933c ADD X11, X5, X10 |
(867) 0x449340 ADD X5, X7, X11,LSL #3 |
(867) 0x449344 MADD X18, X14, X13, XZR |
(867) 0x449348 MADD X12, X12, X14, XZR |
(867) 0x44934c ADD X11, X22, X19,LSL #3 |
(867) 0x449350 LDR X19, [X28, #8] |
(867) 0x449354 ADD X9, X19, W9,SXTW #3 |
(867) 0x449358 ADD X2, X0, X18 |
(867) 0x44935c ADD X7, X7, X10,LSL #3 |
(867) 0x449360 SUB X10, X12, X18 |
(867) 0x449364 LDR X22, [X27, #16] |
(867) 0x449368 LDR X18, [SP, #104] |
(867) 0x44936c ADD X12, X19, X13,LSL #3 |
(867) 0x449370 ADD X2, X22, X2,LSL #3 |
(867) 0x449374 LDR X18, [X18, #8] |
(867) 0x449378 B 4493ec |
(868) 0x44937c FABS D20, D29 |
(868) 0x449380 FMAXNM D29, D20, D5 |
(868) 0x449384 FMUL D25, D29, D31 |
(868) 0x449388 LDR X19, [X15] |
(868) 0x44938c LDR X22, [X15, #16] |
(868) 0x449390 FMUL D1, D25, D25 |
(868) 0x449394 MADD X19, X13, X19, X0 |
(868) 0x449398 LDR D3, [X22, X19,LSL #3] |
(868) 0x44939c FMADD D0, D21, D21, D1 |
(868) 0x4493a0 FADD D23, D3, D3 |
(868) 0x4493a4 FSQRT D22, D0 |
(868) 0x4493a8 FMUL D4, D22, D4 |
(868) 0x4493ac FMUL D18, D22, D18 |
(868) 0x4493b0 FDIV D26, D4, D25 |
(868) 0x4493b4 FDIV D30, D18, D21 |
(868) 0x4493b8 FABS D2, D26 |
(868) 0x4493bc FABS D19, D30 |
(868) 0x4493c0 FMINNM D28, D2, D19 |
(868) 0x4493c4 FMUL D27, D28, D17 |
(868) 0x4493c8 FMUL D16, D27, D27 |
(868) 0x4493cc FMUL D20, D23, D16 |
(868) 0x4493d0 STR D20, [X11, X0,LSL #3] |
(868) 0x4493d4 ADD X1, X1, #1 |
(868) 0x4493d8 ADD X2, X2, #8 |
(868) 0x4493dc ORR X0, XZR, X3 |
(868) 0x4493e0 CMP X3, X8 |
(868) 0x4493e4 B.EQ 449530 |
(868) 0x4493e8 ADD X3, X3, #1 |
(868) 0x4493ec LDR D21, [X7, X1,LSL #3] |
(868) 0x4493f0 MOVI D20, #0 |
(868) 0x4493f4 LDR D24, [X5, X1,LSL #3] |
(868) 0x4493f8 LDUR D3, [X2, #504] |
(868) 0x4493fc LDR D18, [X18, X0,LSL #3] |
(868) 0x449400 FADD D19, D24, D21 |
(868) 0x449404 LDR D25, [X7, X0,LSL #3] |
(868) 0x449408 LDR D2, [X2, X14,LSL #3] |
(868) 0x44940c LDR D1, [X2, X10,LSL #3] |
(868) 0x449410 FDIV D23, D6, D18 |
(868) 0x449414 LDR D17, [X18, X1,LSL #3] |
(868) 0x449418 LDR D16, [X2, #8] |
(868) 0x44941c FSUB D29, D2, D1
(868) 0x449420 FSUB D1, D19, D25
(868) 0x449424 LDR D26, [X5, X0,LSL #3] |
(868) 0x449428 FADD D0, D18, D17 |
(868) 0x44942c LDR D22, [X6, X0,LSL #3] |
(868) 0x449430 FSUB D4, D16, D3
(868) 0x449434 LDR D30, [X6, X1,LSL #3] |
(868) 0x449438 FADD D2, D26, D24 |
(868) 0x44943c FSUB D16, D1, D26
(868) 0x449440 LDR D26, [X9] |
(868) 0x449444 LDR D28, [X4, X0,LSL #3] |
(868) 0x449448 FDIV D31, D4, D0 |
(868) 0x44944c LDR D4, [X12] |
(868) 0x449450 FADD D19, D30, D22 |
(868) 0x449454 FSUB D17, D2, D25
(868) 0x449458 LDR D27, [X4, X1,LSL #3] |
(868) 0x44945c FADD D25, D4, D26 |
(868) 0x449460 FSUB D2, D19, D28
(868) 0x449464 FDIV D24, D6, D4 |
(868) 0x449468 FADD D30, D27, D30 |
(868) 0x44946c FSUB D3, D17, D21
(868) 0x449470 FSUB D27, D2, D27
(868) 0x449474 FDIV D29, D29, D25 |
(868) 0x449478 FSUB D28, D30, D28
(868) 0x44947c FABS D0, D31 |
(868) 0x449480 FMAXNM D21, D0, D5 |
(868) 0x449484 FMUL D25, D4, D27 |
(868) 0x449488 FSUB D1, D28, D22
(868) 0x44948c FMUL D22, D31, D31 |
(868) 0x449490 FMUL D30, D1, D23 |
(868) 0x449494 FMUL D17, D29, D29 |
(868) 0x449498 FMUL D26, D29, D31 |
(868) 0x44949c FMADD D3, D24, D3, D30 |
(868) 0x4494a0 FMUL D0, D17, D27 |
(868) 0x4494a4 FMUL D27, D22, D16 |
(868) 0x4494a8 FMADD D16, D18, D16, D25 |
(868) 0x4494ac FADD D19, D17, D22 |
(868) 0x4494b0 FMAXNM D28, D19, D5 |
(868) 0x4494b4 FMUL D2, D0, D24 |
(868) 0x4494b8 FMADD D23, D27, D23, D2 |
(868) 0x4494bc FCMPE D16, D20 |
(868) 0x4494c0 FMADD D24, D26, D3, D23 |
(868) 0x4494c4 FDIV D17, D24, D28 |
(868) 0x4494c8 FCCMPE D17, D7, #0, #11 |
(868) 0x4494cc B.GT 4493d0 |
(868) 0x4494d0 FCMPE D31, D20 |
(868) 0x4494d4 FMOV D31, #1.0000000 |
(868) 0x4494d8 B.GE 44937c |
(868) 0x4494dc FCMPE D21, D20 |
(868) 0x4494e0 FMOV D31, #-1.0000000 |
(868) 0x4494e4 FNEG D21, D21 |
(868) 0x4494e8 B.GT 44937c |
(868) 0x4494ec FMOV D31, #1.0000000 |
(868) 0x4494f0 B 44937c |
(865) 0x4494f4 LDP X23, X24, [SP, #48] |
(865) 0x4494f8 LDP X27, X28, [SP, #80] |
(865) 0x4494fc LDP X19, X20, [SP, #16] |
(865) 0x449500 LDP X21, X22, [SP, #32] |
(865) 0x449504 LDP X25, X26, [SP, #64] |
(865) 0x449508 LDP X29, X30, [SP], #112 |
(865) 0x44950c RET |
0x449510 LDP X19, X20, [SP, #16] |
0x449514 LDP X21, X22, [SP, #32] |
0x449518 LDP X23, X24, [SP, #48] |
0x44951c LDP X25, X26, [SP, #64] |
0x449520 LDP X29, X30, [SP], #112 |
0x449524 RET |
(866) 0x449528 ORR W16, WZR, W3 |
(866) 0x44952c ADD X30, X13, #1 |
(867) 0x449530 ORR X13, XZR, X30 |
(867) 0x449534 CMP W20, W17 |
(867) 0x449538 B.LE 4494f4 |
(867) 0x44953c ORR W4, WZR, W17 |
(867) 0x449540 LDR W17, [SP, #100] |
(867) 0x449544 ORR W3, WZR, W16 |
(867) 0x449548 ORR W30, WZR, W21 |
(867) 0x44954c SUB W2, W17, W16 |
(867) 0x449550 ORR W16, WZR, W24 |
(867) 0x449554 B 4492c8 |
0x449558 ADD W2, W2, #1 |
0x44955c MOVZ W1, #0 |
0x449560 B 449274 |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.41+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so | |
►1.59+ | GOMP_parallel | libomp.so | |
○ | viscosity(global_variables&) | viscosity.cpp:36 | exec |
○ | timestep(global_variables&, pa[...] | timestep.cpp:64 | exec |
○ | hydro(global_variables&, paral[...] | hydro.cpp:60 | exec |
○ | main | iostream:74 | exec |
○ | __libc_start_main | libc-2.31.so | |
○ | _start | iostream:74 | exec |
Path / |
Source file and lines | viscosity.cpp:36-64 |
Module | exec |
nb instructions | 59 |
loop length | 236 |
nb stack references | 0 |
front end | 7.38 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.00 | 4.00 | 7.75 | 7.75 | 7.75 | 7.75 | 0.50 | 0.50 | 0.50 | 0.50 | 7.00 | 7.00 | 7.00 | 4.00 | 4.00 |
cycles | 4.00 | 4.00 | 7.75 | 7.75 | 7.75 | 7.75 | 0.50 | 0.50 | 0.50 | 0.50 | 7.00 | 7.00 | 7.00 | 4.00 | 4.00 |
Cycles executing div or sqrt instructions | 2.00-1.00 |
Front-end | 7.38 |
Overall L1 | 7.75 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-112]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W25, W20, [X0, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X22, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W21, [X0, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W25, W25, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W20, W20, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W0, [X0, #60] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
CMP W25, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 4494fc <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x2fc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W21, W21, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W19, W0, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W26, W20, W25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W21, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 4494fc <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x2fc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
SUB W24, W19, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W26, W26, W24, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W23, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W2, W26, W23 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W3, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W1, W2, W23, W26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 449558 <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x358> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MADD W3, W2, W3, W1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD W4, W2, W3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STR W4, [SP, #100] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
CMP W3, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CS 449510 <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x310> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W9, W3, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ADRP X5, <44a28c> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
FMOV D6, #0.5000000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 |
MOVI D7, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 |
LDR D5, [X5, #256] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
LDP X7, X28, [X22] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X15, X27, [X22, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
MSUB W6, W9, W24, W3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD W4, W9, W25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP X26, X25, [X22, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
SBFM X13, X4, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W30, W6, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STR X7, [SP, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDR X23, [X22, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SUB W16, W19, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X29, X30, [SP], #112 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W2, W2, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOVZ W1, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
B 449274 <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x74> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Source file and lines | viscosity.cpp:36-64 |
Module | exec |
nb instructions | 59 |
loop length | 236 |
nb stack references | 0 |
front end | 7.38 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 4.00 | 4.00 | 7.75 | 7.75 | 7.75 | 7.75 | 0.50 | 0.50 | 0.50 | 0.50 | 7.00 | 7.00 | 7.00 | 4.00 | 4.00 |
cycles | 4.00 | 4.00 | 7.75 | 7.75 | 7.75 | 7.75 | 0.50 | 0.50 | 0.50 | 0.50 | 7.00 | 7.00 | 7.00 | 4.00 | 4.00 |
Cycles executing div or sqrt instructions | 2.00-1.00 |
Front-end | 7.38 |
Overall L1 | 7.75 |
all | 0% |
load | 0% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-112]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W25, W20, [X0, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X22, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W21, [X0, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W25, W25, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W20, W20, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDR W0, [X0, #60] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
CMP W25, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 4494fc <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x2fc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W21, W21, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W19, W0, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W26, W20, W25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W21, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 4494fc <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x2fc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
SUB W24, W19, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W26, W26, W24, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W23, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W2, W26, W23 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W3, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W1, W2, W23, W26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 449558 <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x358> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
MADD W3, W2, W3, W1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD W4, W2, W3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STR W4, [SP, #100] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
CMP W3, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CS 449510 <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x310> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W9, W3, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ADRP X5, <44a28c> | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X27, X28, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
FMOV D6, #0.5000000 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 |
MOVI D7, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 |
LDR D5, [X5, #256] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 |
LDP X7, X28, [X22] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X15, X27, [X22, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
MSUB W6, W9, W24, W3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
ADD W4, W9, W25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP X26, X25, [X22, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
SBFM X13, X4, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W30, W6, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STR X7, [SP, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDR X23, [X22, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
SUB W16, W19, W30 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
LDP X29, X30, [SP], #112 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 |
RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W2, W2, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MOVZ W1, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
B 449274 <_Z16viscosity_kerneliiiiRN6clover8Buffer1DIdEES2_RNS_8Buffer2DIdEES5_S5_S5_S5_._omp_fn.0+0x74> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼viscosity_kernel(int, int, int, int, clover::Buffer1D | 2.52 | 3.35 |
▼Loop 866 - viscosity.cpp:38-64 - exec– | 0 | 0 |
▼Loop 867 - viscosity.cpp:38-64 - exec– | 0.01 | 0.01 |
○Loop 868 - viscosity.cpp:39-64 - exec | 2.51 | 3.33 |
○Loop 865 - viscosity.cpp:36-56 - exec | 0 | 0 |