Function: PdV_kernel(bool, int, int, int, int, double, clover::Buffer2D<double>&, clover::Buffer2D<d ... | Module: exec | Source: PdV.cpp:48-63 [...] | Coverage: 6.65% |
---|
Function: PdV_kernel(bool, int, int, int, int, double, clover::Buffer2D<double>&, clover::Buffer2D<d ... | Module: exec | Source: PdV.cpp:48-63 [...] | Coverage: 6.65% |
---|
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/PdV.cpp: 48 - 63 |
-------------------------------------------------------------------------------- |
48: #pragma omp parallel for simd collapse(2) |
49: for (int j = (y_min + 1); j < (y_max + 2); j++) { |
50: for (int i = (x_min + 1); i < (x_max + 2); i++) { |
51: double left_flux = (xarea(i, j) * (xvel0(i, j) + xvel0(i + 0, j + 1) + xvel0(i, j) + xvel0(i + 0, j + 1))) * 0.25 * dt * 0.5; |
52: double right_flux = |
53: (xarea(i + 1, j + 0) * (xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1) + xvel0(i + 1, j + 0) + xvel0(i + 1, j + 1))) * 0.25 * dt * |
54: 0.5; |
55: double bottom_flux = (yarea(i, j) * (yvel0(i, j) + yvel0(i + 1, j + 0) + yvel0(i, j) + yvel0(i + 1, j + 0))) * 0.25 * dt * 0.5; |
56: double top_flux = (yarea(i + 0, j + 1) * (yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1) + yvel0(i + 0, j + 1) + yvel0(i + 1, j + 1))) * |
57: 0.25 * dt * 0.5; |
58: double total_flux = right_flux - left_flux + top_flux - bottom_flux; |
59: double volume_change_s = volume(i, j) / (volume(i, j) + total_flux); |
60: double recip_volume = 1.0 / volume(i, j); |
61: double energy_change = (pressure(i, j) / density0(i, j) + viscosity(i, j) / density0(i, j)) * total_flux * recip_volume; |
62: energy1(i, j) = energy0(i, j) - energy_change; |
63: density1(i, j) = density0(i, j) * volume_change_s; |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 69 - 69 |
-------------------------------------------------------------------------------- |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x42cae0 STP X29, X30, [SP, #-192]! |
0x42cae4 ADD X29, SP, #0 |
0x42cae8 STP X19, X20, [SP, #16] |
0x42caec ORR X20, XZR, X0 |
0x42caf0 STP X23, X24, [SP, #48] |
0x42caf4 LDP W23, W1, [X0, #104] |
0x42caf8 LDR W0, [X0, #96] |
0x42cafc LDR W2, [X20, #100] |
0x42cb00 ADD W3, W1, #2 |
0x42cb04 ADD W24, W23, #1 |
0x42cb08 ADD W4, W0, #1 |
0x42cb0c STP W3, W4, [SP, #160] |
0x42cb10 CMP W24, W3 |
0x42cb14 B.GE 42ce8c |
0x42cb18 ADD W19, W2, #2 |
0x42cb1c STP X21, X22, [SP, #32] |
0x42cb20 SUB W22, W3, W24 |
0x42cb24 CMP W4, W19 |
0x42cb28 B.GE 42ce9c |
0x42cb2c SUB W5, W19, W4 |
0x42cb30 MADD W23, W22, W5, WZR |
0x42cb34 STR W5, [SP, #168] |
0x42cb38 BL 403530 |
0x42cb3c ORR W21, WZR, W0 |
0x42cb40 BL 4033c0 |
0x42cb44 UDIV W7, W23, W21 |
0x42cb48 ORR W6, WZR, W0 |
0x42cb4c MSUB W8, W7, W21, W23 |
0x42cb50 CMP W0, W8 |
0x42cb54 B.CC 42ceb0 |
(397) 0x42cb58 MADD W6, W7, W6, W8 |
(397) 0x42cb5c ADD W9, W7, W6 |
(397) 0x42cb60 STR W9, [SP, #172] |
(397) 0x42cb64 CMP W6, W9 |
(397) 0x42cb68 B.CS 42ce9c |
(399) 0x42cb6c LDR W10, [SP, #168] |
(399) 0x42cb70 FMOV D0, #0.1250000 |
(399) 0x42cb74 CNTD X22, ALL |
(399) 0x42cb78 STP X25, X26, [SP, #64] |
(399) 0x42cb7c PTRUE P0.B, ALL |
(399) 0x42cb80 FDUP Z6.D, #1.0000000 |
(399) 0x42cb84 LDR W16, [SP, #164] |
(399) 0x42cb88 UDIV W11, W6, W10 |
(399) 0x42cb8c STP X27, X28, [SP, #80] |
(399) 0x42cb90 LDR X18, [X20, #56] |
(399) 0x42cb94 LDP X28, X27, [X20, #8] |
(399) 0x42cb98 LDR D5, [X20] |
(399) 0x42cb9c STR X18, [SP, #136] |
(399) 0x42cba0 MSUB W12, W11, W10, W6 |
(399) 0x42cba4 LDR X14, [X20, #24] |
(399) 0x42cba8 ADD W13, W11, W24 |
(399) 0x42cbac SBFM X3, X13, #0, #31 |
(399) 0x42cbb0 LDR X17, [X20, #32] |
(399) 0x42cbb4 ADD W10, W12, W16 |
(399) 0x42cbb8 LDR X25, [X20, #40] |
(399) 0x42cbbc SUB W18, W19, W10 |
(399) 0x42cbc0 FMUL D1, D5, D0 |
(399) 0x42cbc4 CMP W7, W18 |
(399) 0x42cbc8 CSEL W24, W7, W18, LS |
(399) 0x42cbcc STR X14, [SP, #104] |
(399) 0x42cbd0 LDR X15, [X20, #48] |
(399) 0x42cbd4 DUP Z16.D, Z1.D[0] |
(399) 0x42cbd8 STR X17, [SP, #112] |
(399) 0x42cbdc LDR X26, [X20, #64] |
(399) 0x42cbe0 STR X25, [SP, #144] |
(399) 0x42cbe4 LDR X30, [X20, #72] |
(399) 0x42cbe8 STR X15, [SP, #128] |
(399) 0x42cbec LDR X0, [X20, #80] |
(399) 0x42cbf0 STR X26, [SP, #184] |
(399) 0x42cbf4 LDR X20, [X20, #88] |
(399) 0x42cbf8 STR X30, [SP, #120] |
(399) 0x42cbfc ADD W30, W6, W24 |
(399) 0x42cc00 STR X0, [SP, #176] |
(399) 0x42cc04 STR X20, [SP, #96] |
(399) 0x42cc08 CMP W6, W30 |
(399) 0x42cc0c B.CS 42ce5c |
(400) 0x42cc10 LDR X4, [X27] |
(400) 0x42cc14 SBFM X1, X10, #0, #31 |
(400) 0x42cc18 ORR W2, WZR, W24 |
(400) 0x42cc1c LDP X21, X7, [SP, #96] |
(400) 0x42cc20 ADD X23, X1, #1 |
(400) 0x42cc24 WHILELO P1.D, XZR, X2 |
(400) 0x42cc28 LDP X13, X6, [SP, #176] |
(400) 0x42cc2c MADD X19, X3, X4, XZR |
(400) 0x42cc30 LDP X11, X12, [SP, #112] |
(400) 0x42cc34 ADD X5, X4, X19 |
(400) 0x42cc38 ADD X26, X19, X1 |
(400) 0x42cc3c ORR X9, XZR, X6 |
(400) 0x42cc40 ADD X8, X5, X1 |
(400) 0x42cc44 LDP X14, X15, [SP, #128] |
(400) 0x42cc48 LDR X10, [X21] |
(400) 0x42cc4c LDR X19, [X13] |
(400) 0x42cc50 LDR X4, [X9] |
(400) 0x42cc54 MADD X16, X3, X10, XZR |
(400) 0x42cc58 LDR X9, [X28] |
(400) 0x42cc5c MADD X5, X3, X19, XZR |
(400) 0x42cc60 LDR X25, [X7] |
(400) 0x42cc64 ADD X17, X10, X16 |
(400) 0x42cc68 ADD X18, X16, X23 |
(400) 0x42cc6c ADD X20, X17, X1 |
(400) 0x42cc70 ADD X24, X16, X1 |
(400) 0x42cc74 MADD X16, X3, X4, X1 |
(400) 0x42cc78 ADD X7, X19, X5 |
(400) 0x42cc7c LDR X11, [X11] |
(400) 0x42cc80 ADD X19, X5, X1 |
(400) 0x42cc84 MADD X10, X3, X9, XZR |
(400) 0x42cc88 ADD X21, X7, X1 |
(400) 0x42cc8c ADD X7, X7, X23 |
(400) 0x42cc90 LDR X12, [X12] |
(400) 0x42cc94 ADD X9, X10, X1 |
(400) 0x42cc98 LDR X0, [X6, #16] |
(400) 0x42cc9c ADD X6, X17, X23 |
(400) 0x42cca0 MADD X17, X3, X25, X1 |
(400) 0x42cca4 ADD X25, X5, X23 |
(400) 0x42cca8 ADD X23, X10, X23 |
(400) 0x42ccac LDR X10, [SP, #96] |
(400) 0x42ccb0 MADD X5, X3, X11, X1 |
(400) 0x42ccb4 LDR X4, [X15] |
(400) 0x42ccb8 MADD X15, X3, X12, X1 |
(400) 0x42ccbc STR X0, [SP, #152] |
(400) 0x42ccc0 LDR X13, [X13, #16] |
(400) 0x42ccc4 LDR X12, [X10, #16] |
(400) 0x42ccc8 MADD X4, X3, X4, X1 |
(400) 0x42cccc LDR X11, [SP, #144] |
(400) 0x42ccd0 ADD X21, X13, X21,LSL #3 |
(400) 0x42ccd4 ADD X7, X13, X7,LSL #3 |
(400) 0x42ccd8 LDR X14, [X14] |
(400) 0x42ccdc ADD X10, X12, X24,LSL #3 |
(400) 0x42cce0 ADD X19, X13, X19,LSL #3 |
(400) 0x42cce4 LDR X0, [SP, #112] |
(400) 0x42cce8 ADD X20, X12, X20,LSL #3 |
(400) 0x42ccec LDR X24, [X11] |
(400) 0x42ccf0 ADD X11, X13, X25,LSL #3 |
(400) 0x42ccf4 ADD X6, X12, X6,LSL #3 |
(400) 0x42ccf8 LDR X25, [X27, #16] |
(400) 0x42ccfc ADD X18, X12, X18,LSL #3 |
(400) 0x42cd00 MADD X14, X3, X14, X1 |
(400) 0x42cd04 MADD X1, X3, X24, X1 |
(400) 0x42cd08 ADD X13, X25, X26,LSL #3 |
(400) 0x42cd0c LDR X26, [X28, #16] |
(400) 0x42cd10 ADD X8, X25, X8,LSL #3 |
(400) 0x42cd14 LDR X25, [X0, #16] |
(400) 0x42cd18 ADD X12, X26, X23,LSL #3 |
(400) 0x42cd1c LDR X23, [SP, #104] |
(400) 0x42cd20 ADD X9, X26, X9,LSL #3 |
(400) 0x42cd24 LDR X26, [SP, #120] |
(400) 0x42cd28 ADD X5, X25, X5,LSL #3 |
(400) 0x42cd2c LDR X24, [X23, #16] |
(400) 0x42cd30 LDR X23, [X26, #16] |
(400) 0x42cd34 LDR X26, [SP, #136] |
(400) 0x42cd38 ADD X17, X24, X17,LSL #3 |
(400) 0x42cd3c LDR X24, [SP, #128] |
(400) 0x42cd40 ADD X15, X23, X15,LSL #3 |
(400) 0x42cd44 LDR X0, [SP, #152] |
(400) 0x42cd48 LDR X23, [X26, #16] |
(400) 0x42cd4c LDR X25, [X24, #16] |
(400) 0x42cd50 ADD X16, X0, X16,LSL #3 |
(400) 0x42cd54 LDR X24, [SP, #144] |
(400) 0x42cd58 ADD X4, X23, X4,LSL #3 |
(400) 0x42cd5c ADD X14, X25, X14,LSL #3 |
(400) 0x42cd60 MOVZ X25, #0 |
(400) 0x42cd64 LDR X0, [X24, #16] |
(400) 0x42cd68 ADD X1, X0, X1,LSL #3 |
(398) 0x42cd6c LD1D {Z2.D}, P1/Z, [X13, X25,LSL #3] |
(398) 0x42cd70 LD1D {Z3.D}, P1/Z, [X10, X25,LSL #3] |
(398) 0x42cd74 LD1D {Z7.D}, P1/Z, [X18, X25,LSL #3] |
(398) 0x42cd78 LD1D {Z17.D}, P1/Z, [X9, X25,LSL #3] |
(398) 0x42cd7c LD1D {Z21.D}, P1/Z, [X19, X25,LSL #3] |
(398) 0x42cd80 LD1D {Z22.D}, P1/Z, [X21, X25,LSL #3] |
(398) 0x42cd84 LD1D {Z23.D}, P1/Z, [X8, X25,LSL #3] |
(398) 0x42cd88 FADD Z18.D, Z7.D, Z3.D |
(398) 0x42cd8c FADD Z24.D, Z22.D, Z21.D |
(398) 0x42cd90 FADD Z4.D, Z2.D, Z2.D |
(398) 0x42cd94 FADD Z19.D, Z17.D, Z17.D |
(398) 0x42cd98 FMUL Z20.D, Z4.D, Z18.D |
(398) 0x42cd9c FADD Z25.D, Z23.D, Z23.D |
(398) 0x42cda0 FMLA Z20.D, P0/M, Z19.D, Z24.D |
(398) 0x42cda4 LD1D {Z26.D}, P1/Z, [X20, X25,LSL #3] |
(398) 0x42cda8 LD1D {Z19.D}, P1/Z, [X17, X25,LSL #3] |
(398) 0x42cdac LD1D {Z27.D}, P1/Z, [X6, X25,LSL #3] |
(398) 0x42cdb0 LD1D {Z28.D}, P1/Z, [X12, X25,LSL #3] |
(398) 0x42cdb4 LD1D {Z0.D}, P1/Z, [X11, X25,LSL #3] |
(398) 0x42cdb8 LD1D {Z5.D}, P1/Z, [X7, X25,LSL #3] |
(398) 0x42cdbc LD1D {Z2.D}, P1/Z, [X16, X25,LSL #3] |
(398) 0x42cdc0 LD1D {Z1.D}, P1/Z, [X15, X25,LSL #3] |
(398) 0x42cdc4 LD1D {Z7.D}, P1/Z, [X5, X25,LSL #3] |
(398) 0x42cdc8 MOVPRFX Z21, Z6 |
(398) 0x42cdcc FDIV Z21.D, P0/M, Z21.D, Z19.D |
(398) 0x42cdd0 FADD Z29.D, Z27.D, Z26.D |
(398) 0x42cdd4 FADD Z30.D, Z28.D, Z28.D |
(398) 0x42cdd8 FMUL Z31.D, Z25.D, Z29.D |
(398) 0x42cddc FADD Z3.D, Z5.D, Z0.D |
(398) 0x42cde0 FADD Z17.D, Z1.D, Z2.D |
(398) 0x42cde4 FMAD Z30.D, P0/M, Z3.D, Z31.D |
(398) 0x42cde8 FDIV Z17.D, P0/M, Z17.D, Z7.D |
(398) 0x42cdec FSUB Z4.D, Z30.D, Z20.D |
(398) 0x42cdf0 FMUL Z23.D, Z17.D, Z21.D |
(398) 0x42cdf4 LD1D {Z20.D}, P1/Z, [X14, X25,LSL #3] |
(398) 0x42cdf8 FMUL Z18.D, Z4.D, Z16.D |
(398) 0x42cdfc FADD Z22.D, Z19.D, Z18.D |
(398) 0x42ce00 FMSB Z18.D, P0/M, Z23.D, Z20.D |
(398) 0x42ce04 FDIV Z19.D, P0/M, Z19.D, Z22.D |
(398) 0x42ce08 ST1D {Z18.D}, P1, [X4, X25,LSL #3] |
(398) 0x42ce0c LD1D {Z24.D}, P1/Z, [X5, X25,LSL #3] |
(398) 0x42ce10 FMUL Z25.D, Z24.D, Z19.D |
(398) 0x42ce14 ST1D {Z25.D}, P1, [X1, X25,LSL #3] |
(398) 0x42ce18 ADD X25, X25, X22 |
(398) 0x42ce1c WHILELO P1.D, X25, X2 |
(398) 0x42ce20 B.NE 42cd6c |
(400) 0x42ce24 LDR W20, [SP, #160] |
(400) 0x42ce28 ADD X3, X3, #1 |
(400) 0x42ce2c ADD W2, W3, #0 |
(400) 0x42ce30 CMP W20, W2 |
(400) 0x42ce34 B.LE 42ce80 |
(400) 0x42ce38 LDR W21, [SP, #172] |
(400) 0x42ce3c ORR W6, WZR, W30 |
(400) 0x42ce40 LDP W10, W18, [SP, #164] |
(400) 0x42ce44 SUB W7, W21, W30 |
(400) 0x42ce48 CMP W7, W18 |
(400) 0x42ce4c CSEL W24, W7, W18, LS |
(400) 0x42ce50 ADD W30, W6, W24 |
(400) 0x42ce54 CMP W6, W30 |
(400) 0x42ce58 B.CC 42cc10 |
(401) 0x42ce5c LDR W20, [SP, #160] |
(401) 0x42ce60 ADD X3, X3, #1 |
(401) 0x42ce64 ORR W30, WZR, W6 |
(401) 0x42ce68 ADD W2, W3, #0 |
(401) 0x42ce6c CMP W20, W2 |
(401) 0x42ce70 B.GT 42ce38 |
(399) 0x42ce74 HINT #0 |
(399) 0x42ce78 HINT #0 |
(399) 0x42ce7c HINT #0 |
(399) 0x42ce80 LDP X21, X22, [SP, #32] |
(399) 0x42ce84 LDP X25, X26, [SP, #64] |
(399) 0x42ce88 LDP X27, X28, [SP, #80] |
(399) 0x42ce8c LDP X19, X20, [SP, #16] |
(399) 0x42ce90 LDP X23, X24, [SP, #48] |
(399) 0x42ce94 LDP X29, X30, [SP], #192 |
(399) 0x42ce98 RET |
(397) 0x42ce9c LDP X19, X20, [SP, #16] |
(397) 0x42cea0 LDP X21, X22, [SP, #32] |
(397) 0x42cea4 LDP X23, X24, [SP, #48] |
(397) 0x42cea8 LDP X29, X30, [SP], #192 |
(397) 0x42ceac RET |
(397) 0x42ceb0 ADD W7, W7, #1 |
(397) 0x42ceb4 MOVZ W8, #0 |
(397) 0x42ceb8 B 42cb58 |
0x42cebc HINT #0 |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.42+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so | |
►1.58+ | GOMP_parallel | libomp.so | |
○ | PdV(global_variables&, bool) | PdV.cpp:99 | exec |
○ | hydro(global_variables&, paral[...] | basic_string.h:906 | exec |
○ | main | iostream:74 | exec |
○ | __libc_start_main | libc-2.31.so | |
○ | _start | iostream:74 | exec |
Path / |
Source file and lines | PdV.cpp:48-63 |
Module | exec |
nb instructions | 31 |
loop length | 124 |
nb stack references | 0 |
front end | 3.75 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 3.00 | 3.00 |
cycles | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 3.00 | 3.00 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.75 |
Overall L1 | 4.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-192]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X20, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W23, W1, [X0, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W0, [X0, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W2, [X20, #100] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W3, W1, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W24, W23, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W4, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP W3, W4, [SP, #160] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
CMP W24, W3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 42ce8c <_Z10PdV_kernelbiiiidRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_._omp_fn.0+0x3ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W19, W2, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
SUB W22, W3, W24 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W4, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 42ce9c <_Z10PdV_kernelbiiiidRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_._omp_fn.0+0x3bc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W5, W19, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W23, W22, W5, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
STR W5, [SP, #168] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W21, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W7, W23, W21 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W6, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W8, W7, W21, W23 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 42ceb0 <_Z10PdV_kernelbiiiidRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_._omp_fn.0+0x3d0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
HINT #0 |
Source file and lines | PdV.cpp:48-63 |
Module | exec |
nb instructions | 31 |
loop length | 124 |
nb stack references | 0 |
front end | 3.75 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 3.00 | 3.00 |
cycles | 2.50 | 2.50 | 4.75 | 4.75 | 4.75 | 4.75 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 | 3.00 | 3.00 | 3.00 | 3.00 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.75 |
Overall L1 | 4.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-192]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X20, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W23, W1, [X0, #104] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W0, [X0, #96] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
LDR W2, [X20, #100] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W3, W1, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W24, W23, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W4, W0, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP W3, W4, [SP, #160] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
CMP W24, W3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 42ce8c <_Z10PdV_kernelbiiiidRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_._omp_fn.0+0x3ac> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W19, W2, #2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
SUB W22, W3, W24 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W4, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 42ce9c <_Z10PdV_kernelbiiiidRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_._omp_fn.0+0x3bc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W5, W19, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W23, W22, W5, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
STR W5, [SP, #168] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W21, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W7, W23, W21 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W6, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W8, W7, W21, W23 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W0, W8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 42ceb0 <_Z10PdV_kernelbiiiidRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_._omp_fn.0+0x3d0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
HINT #0 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼PdV_kernel(bool, int, int, int, int, double, clover::Buffer2D | 6.65 | 8.83 |
▼Loop 399 - PdV.cpp:48-63 - exec– | 0 | 0 |
▼Loop 400 - PdV.cpp:50-63 - exec– | 0.01 | 0.02 |
○Loop 398 - PdV.cpp:51-63 - exec | 6.64 | 8.79 |
○Loop 401 - PdV.cpp:50-63 - exec | 0 | 0 |
○Loop 397 - PdV.cpp:48-50 - exec | 0 | 0 |