Function: advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, ... | Module: exec | Source: advec_mom.cpp:108-139 [...] | Coverage: 3.7% |
---|
Function: advec_mom_kernel(int, int, int, int, clover::Buffer2D<double>&, clover::Buffer2D<double>&, ... | Module: exec | Source: advec_mom.cpp:108-139 [...] | Coverage: 3.7% |
---|
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/advec_mom.cpp: 108 - 139 |
-------------------------------------------------------------------------------- |
108: #pragma omp parallel for simd collapse(2) |
109: for (int j = (y_min + 1); j < (y_max + 1 + 2); j++) { |
110: for (int i = (x_min - 1 + 1); i < (x_max + 1 + 2); i++) |
111: ({ |
112: int upwind, donor, downwind, dif; |
113: double sigma, width, limiter, vdiffuw, vdiffdw, auw, adw, wind, advec_vel_s; |
114: if (node_flux(i, j) < 0.0) { |
[...] |
120: upwind = i - 1; |
121: donor = i; |
122: downwind = i + 1; |
123: dif = upwind; |
124: } |
125: sigma = std::fabs(node_flux(i, j)) / (node_mass_pre(donor, j)); |
126: width = celldx[i]; |
127: vdiffuw = vel1(donor, j) - vel1(upwind, j); |
128: vdiffdw = vel1(downwind, j) - vel1(donor, j); |
129: limiter = 0.0; |
130: if (vdiffuw * vdiffdw > 0.0) { |
131: auw = std::fabs(vdiffuw); |
132: adw = std::fabs(vdiffdw); |
133: wind = 1.0; |
134: if (vdiffdw <= 0.0) wind = -1.0; |
135: limiter = |
136: wind * std::fmin(std::fmin(width * ((2.0 - sigma) * adw / width + (1.0 + sigma) * auw / celldx[dif]) / 6.0, auw), adw); |
137: } |
138: advec_vel_s = vel1(donor, j) + (1.0 - sigma) * limiter; |
139: mom_flux(i, j) = advec_vel_s * node_flux(i, j); |
/home/hbollore/qaas-runs/170-290-5445/intel/CloverLeafCXX/build/CloverLeafCXX/src/omp/context.h: 46 - 69 |
-------------------------------------------------------------------------------- |
46: T &operator[](size_t i) const { return data[i]; } |
[...] |
69: T &operator()(size_t i, size_t j) const { return data[i + j * sizeX]; } |
0x421904 STP X29, X30, [SP, #-80]! |
0x421908 ADD X29, SP, #0 |
0x42190c STP X19, X20, [SP, #16] |
0x421910 STP X25, X26, [SP, #64] |
0x421914 LDP W25, W20, [X0, #48] |
0x421918 STP X21, X22, [SP, #32] |
0x42191c LDP W21, W2, [X0, #40] |
0x421920 ADD W25, W25, #1 |
0x421924 ADD W20, W20, #3 |
0x421928 CMP W25, W20 |
0x42192c B.GE 421c64 |
0x421930 ADD W19, W2, #3 |
0x421934 SUB W26, W20, W25 |
0x421938 CMP W21, W19 |
0x42193c B.GE 421c64 |
0x421940 SUB W22, W19, W21 |
0x421944 STP X23, X24, [SP, #48] |
0x421948 ORR X23, XZR, X0 |
0x42194c MADD W26, W26, W22, WZR |
0x421950 BL 403530 |
0x421954 ORR W24, WZR, W0 |
0x421958 BL 4033c0 |
0x42195c UDIV W3, W26, W24 |
0x421960 ORR W1, WZR, W0 |
0x421964 MSUB W0, W3, W24, W26 |
0x421968 CMP W1, W0 |
0x42196c B.CC 421c78 |
(254) 0x421970 MADD W8, W3, W1, W0 |
(254) 0x421974 ADD W15, W3, W8 |
(254) 0x421978 CMP W8, W15 |
(254) 0x42197c B.CS 421c60 |
(254) 0x421980 UDIV W5, W8, W22 |
(254) 0x421984 ADRP X16, |
(254) 0x421988 MOVZ W17, #0 |
(254) 0x42198c ADD X26, X16, #4072 |
(254) 0x421990 CNTW X10, ALL |
(254) 0x421994 PTRUE P0.B, ALL |
(254) 0x421998 DUP Z18.S, #2 |
(254) 0x42199c DUP Z17.B, #255 |
(254) 0x4219a0 FDUP Z16.D, #-1.0 |
(254) 0x4219a4 FDUP Z7.D, #2.0 |
(254) 0x4219a8 LDP X14, X13, [X23] |
(254) 0x4219ac LDP X12, X11, [X23, #16] |
(254) 0x4219b0 MSUB W4, W5, W22, W8 |
(254) 0x4219b4 ADD W6, W5, W25 |
(254) 0x4219b8 LDR X18, [X23, #32] |
(254) 0x4219bc SBFM X9, X6, #0, #31 |
(254) 0x4219c0 ADD W16, W4, W21 |
(254) 0x4219c4 SUB W19, W19, W16 |
(254) 0x4219c8 CMP W3, W19 |
(254) 0x4219cc CSEL X2, X3, X19, LS |
(254) 0x4219d0 ADD W19, W8, W2 |
(254) 0x4219d4 CMP W8, W19 |
(254) 0x4219d8 B.CS 421c48 |
(254) 0x4219dc HINT #0 |
(256) 0x4219e0 LDR X7, [X11] |
(256) 0x4219e4 SBFM X30, X16, #0, #31 |
(256) 0x4219e8 INDEX Z5.S, W16, #1 |
(256) 0x4219ec MOVZ X0, #0 |
(256) 0x4219f0 ORR X23, XZR, X2 |
(256) 0x4219f4 WHILELO P2.D, XZR, X2 |
(256) 0x4219f8 LDR X24, [X12] |
(256) 0x4219fc UQDECD X23, ALL |
(256) 0x421a00 LD1RD {Z6.D}, P0/Z, [X26] |
(256) 0x421a04 WHILELO P1.D, XZR, X23 |
(256) 0x421a08 LDR X8, [X11, #16] |
(256) 0x421a0c MADD X25, X9, X7, X30 |
(256) 0x421a10 LDR X1, [X12, #16] |
(256) 0x421a14 MADD X5, X9, X24, XZR |
(256) 0x421a18 LDR X6, [X13] |
(256) 0x421a1c ADD X3, X8, X25,LSL #3 |
(256) 0x421a20 LDR X24, [X14] |
(256) 0x421a24 ADD X16, X1, X5,LSL #3 |
(256) 0x421a28 ADDVL X4, X3, #1 |
(256) 0x421a2c LDR X5, [X18, #8] |
(256) 0x421a30 MADD X7, X9, X6, X30 |
(256) 0x421a34 LDR X8, [X13, #16] |
(256) 0x421a38 MADD X1, X9, X24, XZR |
(256) 0x421a3c LDR X25, [X14, #16] |
(256) 0x421a40 ADD X30, X5, X30,LSL #3 |
(256) 0x421a44 ADD X6, X8, X7,LSL #3 |
(256) 0x421a48 ADDVL X24, X30, #1 |
(256) 0x421a4c ADD X7, X25, X1,LSL #3 |
(256) 0x421a50 ADDVL X8, X6, #1 |
(256) 0x421a54 HINT #0 |
(256) 0x421a58 HINT #0 |
(256) 0x421a5c HINT #0 |
(255) 0x421a60 LD1D {Z23.D}, P2/Z, [X6, X0,LSL #3] |
(255) 0x421a64 LD1D {Z4.D}, P1/Z, [X8, X0,LSL #3] |
(255) 0x421a68 FCMGE P3.D, P0/Z, Z23.D, #0 |
(255) 0x421a6c FCMGE P5.D, P0/Z, Z4.D, #0 |
(255) 0x421a70 MOVPRFX Z20, Z5 |
(255) 0x421a74 SUB Z20.S, Z20.S, #1 |
(255) 0x421a78 UZP1 P6.S, P3.S, P5.S |
(255) 0x421a7c EOR P4.B, P0/Z, P6.B, P0.B |
(255) 0x421a80 MOVPRFX Z20.S, P4/M, Z5.S |
(255) 0x421a84 ADD Z20.S, P4/M, Z20.S, Z18.S |
(255) 0x421a88 MOVPRFX Z24, Z5 |
(255) 0x421a8c ADD Z24.S, Z24.S, #1 |
(255) 0x421a90 SUNPKLO Z2.D, Z5 |
(255) 0x421a94 SUNPKLO Z3.D, Z24 |
(255) 0x421a98 SUNPKLO Z19.D, Z20 |
(255) 0x421a9c SEL Z22.D, P3, Z2.D, Z3.D |
(255) 0x421aa0 LD1D {Z27.D}, P2/Z, [X7, Z19.D,LSL #3] |
(255) 0x421aa4 LD1D {Z21.D}, P2/Z, [X7, Z22.D,LSL #3] |
(255) 0x421aa8 LD1D {Z26.D}, P2/Z, [X16, Z22.D,LSL #3] |
(255) 0x421aac FSUB Z29.D, Z21.D, Z27.D |
(255) 0x421ab0 SUNPKHI Z0.D, Z20 |
(255) 0x421ab4 FCMLT P7.D, P0/Z, Z23.D, #0 |
(255) 0x421ab8 SEL Z25.D, P7, Z2.D, Z3.D |
(255) 0x421abc LD1D {Z30.D}, P2/Z, [X7, Z25.D,LSL #3] |
(255) 0x421ac0 FSUB Z20.D, Z30.D, Z21.D |
(255) 0x421ac4 LD1D {Z28.D}, P1/Z, [X7, Z0.D,LSL #3] |
(255) 0x421ac8 FMUL Z3.D, Z20.D, Z29.D |
(255) 0x421acc SEL Z19.S, P6, Z5.S, Z24.S |
(255) 0x421ad0 FCMGT P3.D, P2/Z, Z3.D, #0 |
(255) 0x421ad4 ADD Z19.S, P6/M, Z19.S, Z17.S |
(255) 0x421ad8 SUNPKHI Z1.D, Z5 |
(255) 0x421adc FABD Z27.D, P0/M, Z27.D, Z21.D |
(255) 0x421ae0 FABD Z30.D, P0/M, Z30.D, Z21.D |
(255) 0x421ae4 SUNPKLO Z0.D, Z19 |
(255) 0x421ae8 MOVPRFX Z31, Z23 |
(255) 0x421aec FABS Z31.D, P0/M, Z23.D |
(255) 0x421af0 LD1D {Z2.D}, P3/Z, [X5, Z0.D,LSL #3] |
(255) 0x421af4 FDIV Z31.D, P0/M, Z31.D, Z26.D |
(255) 0x421af8 FCMGT P4.D, P0/Z, Z20.D, #0 |
(255) 0x421afc MOVPRFX Z26, Z27 |
(255) 0x421b00 FMINNM Z26.D, P0/M, Z26.D, Z30.D |
(255) 0x421b04 LD1D {Z20.D}, P2/Z, [X30, X0,LSL #3] |
(255) 0x421b08 MOVPRFX Z25, Z31 |
(255) 0x421b0c FADD Z25.D, P0/M, Z25.D, #1 |
(255) 0x421b10 FSUB Z29.D, Z7.D, Z31.D |
(255) 0x421b14 FMUL Z27.D, Z25.D, Z27.D |
(255) 0x421b18 FMUL Z30.D, Z29.D, Z30.D |
(255) 0x421b1c FDIV Z27.D, P0/M, Z27.D, Z2.D |
(255) 0x421b20 FDIV Z30.D, P0/M, Z30.D, Z20.D |
(255) 0x421b24 SUNPKHI Z24.D, Z24 |
(255) 0x421b28 FADD Z0.D, Z27.D, Z30.D |
(255) 0x421b2c SEL Z25.D, P5, Z1.D, Z24.D |
(255) 0x421b30 SUNPKHI Z22.D, Z19 |
(255) 0x421b34 LD1D {Z30.D}, P1/Z, [X7, Z25.D,LSL #3] |
(255) 0x421b38 FMUL Z19.D, Z20.D, Z6.D |
(255) 0x421b3c FCMLT P8.D, P0/Z, Z4.D, #0 |
(255) 0x421b40 LD1D {Z20.D}, P1/Z, [X16, Z25.D,LSL #3] |
(255) 0x421b44 FMUL Z27.D, Z0.D, Z19.D |
(255) 0x421b48 FCMGT P6.D, P4/Z, Z3.D, #0 |
(255) 0x421b4c FMINNM Z27.D, P0/M, Z27.D, Z26.D |
(255) 0x421b50 MOVPRFX Z3, Z16 |
(255) 0x421b54 FCPY Z3.D, P6/M, #1.0000000 |
(255) 0x421b58 FSUBR Z31.D, P0/M, Z31.D, #1 |
(255) 0x421b5c FMUL Z31.D, Z31.D, Z3.D |
(255) 0x421b60 MOVPRFX Z2, Z21 |
(255) 0x421b64 FMLA Z2.D, P3/M, Z31.D, Z27.D |
(255) 0x421b68 SEL Z21.D, P8, Z1.D, Z24.D |
(255) 0x421b6c FMUL Z29.D, Z23.D, Z2.D |
(255) 0x421b70 LD1D {Z1.D}, P1/Z, [X7, Z21.D,LSL #3] |
(255) 0x421b74 FSUB Z23.D, Z30.D, Z28.D |
(255) 0x421b78 FSUB Z26.D, Z1.D, Z30.D |
(255) 0x421b7c FABD Z1.D, P0/M, Z1.D, Z30.D |
(255) 0x421b80 LD1D {Z24.D}, P1/Z, [X24, X0,LSL #3] |
(255) 0x421b84 FMUL Z3.D, Z23.D, Z26.D |
(255) 0x421b88 FABD Z28.D, P0/M, Z28.D, Z30.D |
(255) 0x421b8c FCMGT P5.D, P1/Z, Z3.D, #0 |
(255) 0x421b90 MOVPRFX Z31, Z28 |
(255) 0x421b94 FMINNM Z31.D, P0/M, Z31.D, Z1.D |
(255) 0x421b98 LD1D {Z22.D}, P5/Z, [X5, Z22.D,LSL #3] |
(255) 0x421b9c MOVPRFX Z19, Z4 |
(255) 0x421ba0 FABS Z19.D, P0/M, Z4.D |
(255) 0x421ba4 FCMGT P7.D, P0/Z, Z26.D, #0 |
(255) 0x421ba8 FDIV Z19.D, P0/M, Z19.D, Z20.D |
(255) 0x421bac FMUL Z21.D, Z24.D, Z6.D |
(255) 0x421bb0 MOVPRFX Z0, Z19 |
(255) 0x421bb4 FADD Z0.D, P0/M, Z0.D, #1 |
(255) 0x421bb8 FSUB Z27.D, Z7.D, Z19.D |
(255) 0x421bbc FMUL Z28.D, Z0.D, Z28.D |
(255) 0x421bc0 FMUL Z25.D, Z27.D, Z1.D |
(255) 0x421bc4 FDIV Z28.D, P0/M, Z28.D, Z22.D |
(255) 0x421bc8 FDIV Z25.D, P0/M, Z25.D, Z24.D |
(255) 0x421bcc FSUBR Z19.D, P0/M, Z19.D, #1 |
(255) 0x421bd0 FADD Z2.D, Z28.D, Z25.D |
(255) 0x421bd4 FCMGT P3.D, P7/Z, Z3.D, #0 |
(255) 0x421bd8 FMUL Z20.D, Z2.D, Z21.D |
(255) 0x421bdc MOVPRFX Z1, Z16 |
(255) 0x421be0 FCPY Z1.D, P3/M, #1.0000000 |
(255) 0x421be4 FMINNM Z20.D, P0/M, Z20.D, Z31.D |
(255) 0x421be8 FMUL Z23.D, Z19.D, Z1.D |
(255) 0x421bec MOVPRFX Z26, Z30 |
(255) 0x421bf0 FMLA Z26.D, P5/M, Z23.D, Z20.D |
(255) 0x421bf4 FMUL Z4.D, Z4.D, Z26.D |
(255) 0x421bf8 ST1D {Z29.D}, P2, [X3, X0,LSL #3] |
(255) 0x421bfc ST1D {Z4.D}, P1, [X4, X0,LSL #3] |
(255) 0x421c00 ADD X0, X0, X10 |
(255) 0x421c04 WHILELO P1.D, X0, X23 |
(255) 0x421c08 INCW Z5.S, ALL |
(255) 0x421c0c WHILELO P2.D, X0, X2 |
(255) 0x421c10 B.NE 421a60 |
(256) 0x421c14 ADD X9, X9, #1 |
(256) 0x421c18 ADD W2, W17, W9 |
(256) 0x421c1c CMP W20, W2 |
(256) 0x421c20 B.LE 421c60 |
(256) 0x421c24 SUB W3, W15, W19 |
(256) 0x421c28 ORR W8, WZR, W19 |
(256) 0x421c2c ORR W19, WZR, W22 |
(256) 0x421c30 ORR W16, WZR, W21 |
(256) 0x421c34 CMP W3, W19 |
(256) 0x421c38 CSEL X2, X3, X19, LS |
(256) 0x421c3c ADD W19, W8, W2 |
(256) 0x421c40 CMP W8, W19 |
(256) 0x421c44 B.CC 4219e0 |
(257) 0x421c48 ADD X9, X9, #1 |
(257) 0x421c4c ORR W19, WZR, W8 |
(257) 0x421c50 ADD W2, W17, W9 |
(257) 0x421c54 CMP W20, W2 |
(257) 0x421c58 B.GT 421c24 |
(254) 0x421c5c HINT #0 |
(254) 0x421c60 LDP X23, X24, [SP, #48] |
(254) 0x421c64 LDP X19, X20, [SP, #16] |
(254) 0x421c68 LDP X21, X22, [SP, #32] |
(254) 0x421c6c LDP X25, X26, [SP, #64] |
(254) 0x421c70 LDP X29, X30, [SP], #80 |
(254) 0x421c74 RET |
(254) 0x421c78 ADD W3, W3, #1 |
(254) 0x421c7c MOVZ W0, #0 |
(254) 0x421c80 B 421970 |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►98.42+ | __kmp_GOMP_microtask_wrapper(i[...] | libomp.so | |
○ | __kmp_invoke_microtask | libomp.so |
Path / |
Source file and lines | advec_mom.cpp:108-139 |
Module | exec |
nb instructions | 27 |
loop length | 108 |
nb stack references | 0 |
front end | 3.38 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 2.50 | 2.50 | 2.00 | 2.50 | 2.50 |
cycles | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 2.50 | 2.50 | 2.00 | 2.50 | 2.50 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.38 |
Overall L1 | 4.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-80]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W25, W20, [X0, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W21, W2, [X0, #40] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W25, W25, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W20, W20, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W25, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 421c64 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x360> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W19, W2, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W26, W20, W25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W21, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 421c64 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x360> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W22, W19, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X23, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W26, W26, W22, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W24, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W3, W26, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W1, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W0, W3, W24, W26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W1, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 421c78 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x374> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Source file and lines | advec_mom.cpp:108-139 |
Module | exec |
nb instructions | 27 |
loop length | 108 |
nb stack references | 0 |
front end | 3.38 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 2.50 | 2.50 | 2.00 | 2.50 | 2.50 |
cycles | 2.50 | 2.50 | 4.50 | 4.50 | 4.50 | 4.50 | 0.00 | 0.00 | 0.00 | 0.00 | 2.50 | 2.50 | 2.00 | 2.50 | 2.50 |
Cycles executing div or sqrt instructions | 1.00-0.50 |
Front-end | 3.38 |
Overall L1 | 4.50 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | 0% |
div/sqrt | 0% |
other | 0% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STP X29, X30, [SP, #-80]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
STP X25, X26, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W25, W20, [X0, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
STP X21, X22, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
LDP W21, W2, [X0, #40] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 |
ADD W25, W25, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
ADD W20, W20, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W25, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 421c64 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x360> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ADD W19, W2, #3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
SUB W26, W20, W25 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
CMP W21, W19 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.GE 421c64 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x360> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
SUB W22, W19, W21 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
STP X23, X24, [SP, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 |
ORR X23, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MADD W26, W26, W22, WZR | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
BL 403530 <@plt_start@+0x4b0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
ORR W24, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
BL 4033c0 <@plt_start@+0x340> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
UDIV W3, W26, W24 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 1-0.50 |
ORR W1, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 |
MSUB W0, W3, W24, W26 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 |
CMP W1, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 |
B.CC 421c78 <_Z16advec_mom_kerneliiiiRN6clover8Buffer2DIdEES2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_S2_RNS_8Buffer1DIdEES5_iii._omp_fn.6+0x374> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 |
Name | Coverage (%) | Time (s) |
---|---|---|
▼advec_mom_kernel(int, int, int, int, clover::Buffer2D | 3.7 | 4.91 |
▼Loop 254 - advec_mom.cpp:108-139 - exec– | 0 | 0 |
▼Loop 256 - advec_mom.cpp:110-139 - exec– | 0.02 | 0.02 |
○Loop 255 - context.h:69-69 - exec | 3.68 | 4.87 |
○Loop 257 - advec_mom.cpp:110-139 - exec | 0 | 0 |