| Loop Id: 819 | Module: exec | Source: MultiBsplineRef.hpp:226-262 [...] | Coverage: NA% |
|---|
| Loop Id: 819 | Module: exec | Source: MultiBsplineRef.hpp:226-262 [...] | Coverage: NA% |
|---|
(817) 0x43c920 ADD X10, X10, #1 |
(817) 0x43c924 ADD X3, X3, X11 |
(817) 0x43c928 ADD X2, X2, X11 |
(817) 0x43c92c ADD X9, X9, X11 |
(817) 0x43c930 ORR X25, XZR, X23 |
(817) 0x43c934 ADD X0, X0, X11 |
(817) 0x43c938 CMP X10, #4 |
(817) 0x43c93c B.EQ 43cc00 |
(817) 0x43c940 CMP W25, #1 |
(817) 0x43c944 ORR X23, XZR, X25 |
(817) 0x43c948 B.LT 43c920 |
0x43c94c ADD X14, SP, #128 |
0x43c950 ORR X26, XZR, XZR |
0x43c954 ORR X30, XZR, X0 |
0x43c958 ORR X18, XZR, X9 |
0x43c95c ORR X17, XZR, X2 |
0x43c960 LDR D21, [X14, X10,LSL #3] |
0x43c964 ADD X14, SP, #192 |
0x43c968 LDR D22, [X14, X10,LSL #3] |
0x43c96c ADD X14, SP, #256 |
0x43c970 LDR D23, [X14, X10,LSL #3] |
0x43c974 ORR X14, XZR, X3 |
0x43c978 B 43c9a0 |
0x43c980 ADD X26, X26, #1 |
0x43c984 FMOV D8, D1 |
0x43c988 ADD X14, X14, X12 |
0x43c98c ADD X17, X17, X12 |
0x43c990 ADD X18, X18, X12 |
0x43c994 ADD X30, X30, X12 |
0x43c998 CMP X26, #4 |
0x43c99c B.EQ 43c920 |
0x43c9a0 LDR D24, [X5, X26,LSL #3] |
0x43c9a4 LDR D0, [SP, #72] |
0x43c9a8 FMOV D1, D8 |
0x43c9ac ORR X4, XZR, XZR |
0x43c9b0 FMUL D10, D24, D21 |
0x43c9b4 FMUL D9, D22, D24 |
0x43c9b8 FMUL D30, D23, D24 |
0x43c9bc LDR D24, [X6, X26,LSL #3] |
0x43c9c0 FMUL D26, D0, D30 |
0x43c9c4 LDP D0, D2, [SP, #80] |
0x43c9c8 FMUL D27, D8, D30 |
0x43c9cc FMUL D25, D2, D30 |
0x43c9d0 FMUL D31, D24, D22 |
0x43c9d4 FMUL D28, D24, D23 |
0x43c9d8 LDR D24, [X7, X26,LSL #3] |
0x43c9dc FMUL D29, D24, D23 |
0x43c9e0 FMUL D24, D0, D30 |
0x43c9e4 TBZ W16, #0, 43caec |
(820) 0x43c9e8 STR X8, [SP] |
(820) 0x43c9ec SUB X8, X29, #64 |
(820) 0x43c9f0 LDR D0, [X30, X4] |
(820) 0x43c9f4 LDR D8, [X18, X4] |
(820) 0x43c9f8 ADD X25, X19, X4 |
(820) 0x43c9fc LDR D11, [X17, X4] |
(820) 0x43ca00 LDR D12, [X14, X4] |
(820) 0x43ca04 LDR Z13, [X8, #511, MUL VL] |
(820) 0x43ca08 FMUL D13, D0, D13 |
(820) 0x43ca0c LDR Z14, [X8, #510, MUL VL] |
(820) 0x43ca10 LDR Z15, [X8, #509, MUL VL] |
(820) 0x43ca14 LDR Z2, [X8, #506, MUL VL] |
(820) 0x43ca18 FMADD D13, D8, D14, D13 |
(820) 0x43ca1c LDR D14, [X25] |
(820) 0x43ca20 FMADD D13, D11, D15, D13 |
(820) 0x43ca24 LDR Z15, [X8, #508, MUL VL] |
(820) 0x43ca28 FMADD D13, D15, D12, D13 |
(820) 0x43ca2c FMUL D15, D8, D3 |
(820) 0x43ca30 FMADD D15, D12, D4, D15 |
(820) 0x43ca34 FMADD D14, D10, D13, D14 |
(820) 0x43ca38 STR D14, [X25] |
(820) 0x43ca3c ADD X25, X25, X21,LSL #3 |
(820) 0x43ca40 LDR D14, [X25] |
(820) 0x43ca44 FMADD D14, D31, D13, D14 |
(820) 0x43ca48 STR D14, [X25] |
(820) 0x43ca4c LDR Z14, [X8, #507, MUL VL] |
(820) 0x43ca50 ADD X25, X25, X1 |
(820) 0x43ca54 LDR X8, [SP] |
(820) 0x43ca58 FMUL D14, D0, D14 |
(820) 0x43ca5c FMUL D0, D0, D25 |
(820) 0x43ca60 FMADD D14, D11, D2, D14 |
(820) 0x43ca64 FMADD D0, D8, D24, D0 |
(820) 0x43ca68 FMADD D0, D11, D26, D0 |
(820) 0x43ca6c FADD D14, D14, D15 |
(820) 0x43ca70 LDR D15, [X20, X4] |
(820) 0x43ca74 FMADD D0, D12, D27, D0 |
(820) 0x43ca78 FMADD D15, D13, D9, D15 |
(820) 0x43ca7c STR D15, [X20, X4] |
(820) 0x43ca80 LDR D15, [X25] |
(820) 0x43ca84 FMADD D15, D14, D9, D15 |
(820) 0x43ca88 STR D15, [X25] |
(820) 0x43ca8c ADD X25, X25, X1 |
(820) 0x43ca90 LDR D15, [X25] |
(820) 0x43ca94 FMADD D15, D29, D13, D15 |
(820) 0x43ca98 STR D15, [X25] |
(820) 0x43ca9c LDR D15, [X27, X4] |
(820) 0x43caa0 ADD X25, X25, X1 |
(820) 0x43caa4 FMADD D15, D13, D28, D15 |
(820) 0x43caa8 STR D15, [X27, X4] |
(820) 0x43caac LDR D15, [X25] |
(820) 0x43cab0 LDR D8, [X28, X4] |
(820) 0x43cab4 FMADD D15, D14, D28, D15 |
(820) 0x43cab8 FMADD D8, D14, D30, D8 |
(820) 0x43cabc STR D15, [X25] |
(820) 0x43cac0 LDR D11, [X25, X1] |
(820) 0x43cac4 FADD D0, D11, D0 |
(820) 0x43cac8 STR D8, [X28, X4] |
(820) 0x43cacc STR D0, [X25, X1] |
(820) 0x43cad0 LDR D0, [X22, X4] |
(820) 0x43cad4 FMADD D0, D13, D30, D0 |
(820) 0x43cad8 STR D0, [X22, X4] |
(820) 0x43cadc ADD X4, X4, #8 |
(820) 0x43cae0 CMP X13, X4 |
(820) 0x43cae4 B.NE 43c9e8 |
0x43cae8 B 43c980 |
0x43caec DUP Z10.D, Z10.D[0] |
0x43caf0 DUP Z9.D, Z9.D[0] |
0x43caf4 ORR P2.B, P0/Z, P0.B, P0.B |
0x43caf8 DUP Z30.D, Z30.D[0] |
0x43cafc DUP Z31.D, Z31.D[0] |
0x43cb00 DUP Z28.D, Z28.D[0] |
0x43cb04 DUP Z29.D, Z29.D[0] |
0x43cb08 DUP Z27.D, Z27.D[0] |
0x43cb0c DUP Z26.D, Z26.D[0] |
0x43cb10 DUP Z25.D, Z25.D[0] |
0x43cb14 DUP Z24.D, Z24.D[0] |
0x43cb18 HINT #0 |
0x43cb1c HINT #0 |
(818) 0x43cb20 LD1D {Z11.D}, P2/Z, [X30, X4,LSL #3] |
(818) 0x43cb24 LD1D {Z12.D}, P2/Z, [X18, X4,LSL #3] |
(818) 0x43cb28 ADD X25, X19, X4,LSL #3 |
(818) 0x43cb2c ADD X25, X25, X1 |
(818) 0x43cb30 LD1D {Z13.D}, P2/Z, [X17, X4,LSL #3] |
(818) 0x43cb34 LD1D {Z14.D}, P2/Z, [X14, X4,LSL #3] |
(818) 0x43cb38 FMUL Z8.D, Z11.D, Z5.D |
(818) 0x43cb3c LD1D {Z15.D}, P2/Z, [X25, MUL VL] |
(818) 0x43cb40 FMUL Z0.D, Z12.D, Z19.D |
(818) 0x43cb44 FMLA Z8.D, P1/M, Z12.D, Z6.D |
(818) 0x43cb48 FMLA Z0.D, P1/M, Z14.D, Z20.D |
(818) 0x43cb4c FMLA Z8.D, P1/M, Z13.D, Z7.D |
(818) 0x43cb50 FMLA Z0.D, P1/M, Z11.D, Z17.D |
(818) 0x43cb54 FMLA Z8.D, P1/M, Z16.D, Z14.D |
(818) 0x43cb58 FMLA Z0.D, P1/M, Z13.D, Z18.D |
(818) 0x43cb5c FMLA Z15.D, P1/M, Z31.D, Z8.D |
(818) 0x43cb60 ST1D {Z15.D}, P2, [X25, MUL VL] |
(818) 0x43cb64 ADD X25, X25, X1 |
(818) 0x43cb68 LD1D {Z15.D}, P2/Z, [X25, MUL VL] |
(818) 0x43cb6c FMLA Z15.D, P1/M, Z0.D, Z9.D |
(818) 0x43cb70 ST1D {Z15.D}, P2, [X25, MUL VL] |
(818) 0x43cb74 ADD X25, X25, X1 |
(818) 0x43cb78 LD1D {Z15.D}, P2/Z, [X25, MUL VL] |
(818) 0x43cb7c FMLA Z15.D, P1/M, Z29.D, Z8.D |
(818) 0x43cb80 ST1D {Z15.D}, P2, [X25, MUL VL] |
(818) 0x43cb84 ADD X25, X25, X1 |
(818) 0x43cb88 LD1D {Z15.D}, P2/Z, [X25, MUL VL] |
(818) 0x43cb8c FMLA Z15.D, P1/M, Z0.D, Z28.D |
(818) 0x43cb90 ST1D {Z15.D}, P2, [X25, MUL VL] |
(818) 0x43cb94 ADD X25, X25, X1 |
(818) 0x43cb98 LD1D {Z15.D}, P2/Z, [X25, MUL VL] |
(818) 0x43cb9c FMAD Z11.D, P1/M, Z25.D, Z15.D |
(818) 0x43cba0 LD1D {Z15.D}, P2/Z, [X19, X4,LSL #3] |
(818) 0x43cba4 FMLA Z11.D, P1/M, Z12.D, Z24.D |
(818) 0x43cba8 LD1D {Z12.D}, P2/Z, [X20, X4,LSL #3] |
(818) 0x43cbac FMLA Z11.D, P1/M, Z13.D, Z26.D |
(818) 0x43cbb0 FMLA Z11.D, P1/M, Z14.D, Z27.D |
(818) 0x43cbb4 FMLA Z15.D, P1/M, Z10.D, Z8.D |
(818) 0x43cbb8 FMLA Z12.D, P1/M, Z8.D, Z9.D |
(818) 0x43cbbc ST1D {Z11.D}, P2, [X25, MUL VL] |
(818) 0x43cbc0 LD1D {Z11.D}, P2/Z, [X27, X4,LSL #3] |
(818) 0x43cbc4 FMLA Z11.D, P1/M, Z8.D, Z28.D |
(818) 0x43cbc8 ST1D {Z15.D}, P2, [X19, X4,LSL #3] |
(818) 0x43cbcc ST1D {Z12.D}, P2, [X20, X4,LSL #3] |
(818) 0x43cbd0 ST1D {Z11.D}, P2, [X27, X4,LSL #3] |
(818) 0x43cbd4 LD1D {Z11.D}, P2/Z, [X28, X4,LSL #3] |
(818) 0x43cbd8 FMAD Z0.D, P1/M, Z30.D, Z11.D |
(818) 0x43cbdc ST1D {Z0.D}, P2, [X28, X4,LSL #3] |
(818) 0x43cbe0 LD1D {Z0.D}, P2/Z, [X22, X4,LSL #3] |
(818) 0x43cbe4 FMLA Z0.D, P1/M, Z8.D, Z30.D |
(818) 0x43cbe8 ST1D {Z0.D}, P2, [X22, X4,LSL #3] |
(818) 0x43cbec ADD X4, X4, X8 |
(818) 0x43cbf0 SUB X25, X4, X8 |
(818) 0x43cbf4 WHILELO P2.D, X25, X15 |
(818) 0x43cbf8 B.MI 43cb20 |
0x43cbfc B 43c980 |
/home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/build/miniqmc/src/Numerics/Spline2/MultiBsplineRef.hpp: 226 - 262 |
-------------------------------------------------------------------------------- |
226: for (int i = 0; i < 4; i++) |
227: for (int j = 0; j < 4; j++) |
[...] |
234: const T pre20 = d2a[i] * b[j]; |
235: const T pre10 = da[i] * b[j]; |
236: const T pre00 = a[i] * b[j]; |
237: const T pre11 = da[i] * db[j]; |
238: const T pre01 = a[i] * db[j]; |
239: const T pre02 = a[i] * d2b[j]; |
240: |
241: const int iSplitPoint = num_splines; |
242: for (int n = 0; n < iSplitPoint; n++) |
243: { |
244: T coefsv = coefs[n]; |
245: T coefsvzs = coefszs[n]; |
246: T coefsv2zs = coefs2zs[n]; |
247: T coefsv3zs = coefs3zs[n]; |
248: |
249: T sum0 = c[0] * coefsv + c[1] * coefsvzs + c[2] * coefsv2zs + c[3] * coefsv3zs; |
250: T sum1 = dc[0] * coefsv + dc[1] * coefsvzs + dc[2] * coefsv2zs + dc[3] * coefsv3zs; |
251: T sum2 = d2c[0] * coefsv + d2c[1] * coefsvzs + d2c[2] * coefsv2zs + d2c[3] * coefsv3zs; |
252: |
253: hxx[n] += pre20 * sum0; |
254: hxy[n] += pre11 * sum0; |
255: hxz[n] += pre10 * sum1; |
256: hyy[n] += pre02 * sum0; |
257: hyz[n] += pre01 * sum1; |
258: hzz[n] += pre00 * sum2; |
259: gx[n] += pre10 * sum0; |
260: gy[n] += pre01 * sum0; |
261: gz[n] += pre00 * sum1; |
262: vals[n] += pre00 * sum0; |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.27 |
| CQA speedup if FP arith vectorized | 2.10 |
| CQA speedup if fully vectorized | 1.27 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.16 |
| Bottlenecks | micro-operation queue |
| Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
| Source | MultiBsplineRef.hpp:226-227,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 6.38 |
| CQA cycles if no scalar integer | 5.00 |
| CQA cycles if FP arith vectorized | 3.04 |
| CQA cycles if fully vectorized | 5.00 |
| Front-end cycles | 6.38 |
| P0 cycles | 2.50 |
| P1 cycles | 2.50 |
| P2 cycles | 4.00 |
| P3 cycles | 4.00 |
| P4 cycles | 4.00 |
| P5 cycles | 4.00 |
| P6 cycles | 5.50 |
| P7 cycles | 5.50 |
| P8 cycles | 5.50 |
| P9 cycles | 5.50 |
| P10 cycles | 2.67 |
| P11 cycles | 2.67 |
| P12 cycles | 2.67 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 53.00 |
| Nb uops | 51.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 1.57 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 10.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 31.58 |
| Vectorization ratio load | 12.50 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 64.71 |
| Vector-efficiency ratio all | 45.07 |
| Vector-efficiency ratio load | 28.13 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | 25.00 |
| Vector-efficiency ratio add_sub | 25.00 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 68.38 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 1.27 |
| CQA speedup if FP arith vectorized | 2.10 |
| CQA speedup if fully vectorized | 1.27 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.16 |
| Bottlenecks | micro-operation queue |
| Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
| Source | MultiBsplineRef.hpp:226-227,MultiBsplineRef.hpp:234-239,MultiBsplineRef.hpp:242-242 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 6.38 |
| CQA cycles if no scalar integer | 5.00 |
| CQA cycles if FP arith vectorized | 3.04 |
| CQA cycles if fully vectorized | 5.00 |
| Front-end cycles | 6.38 |
| P0 cycles | 2.50 |
| P1 cycles | 2.50 |
| P2 cycles | 4.00 |
| P3 cycles | 4.00 |
| P4 cycles | 4.00 |
| P5 cycles | 4.00 |
| P6 cycles | 5.50 |
| P7 cycles | 5.50 |
| P8 cycles | 5.50 |
| P9 cycles | 5.50 |
| P10 cycles | 2.67 |
| P11 cycles | 2.67 |
| P12 cycles | 2.67 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 53.00 |
| Nb uops | 51.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 1.57 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 10.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 31.58 |
| Vectorization ratio load | 12.50 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | 0.00 |
| Vectorization ratio add_sub | 0.00 |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 64.71 |
| Vector-efficiency ratio all | 45.07 |
| Vector-efficiency ratio load | 28.13 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | 25.00 |
| Vector-efficiency ratio add_sub | 25.00 |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 68.38 |
| Path / |
| Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
| Source file and lines | MultiBsplineRef.hpp:226-262 |
| Module | exec |
| nb instructions | 53 |
| nb uops | 51 |
| loop length | 212 |
| used w registers | 1 |
| used x registers | 16 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 17 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 10 |
| nb stack references | 2 |
| micro-operation queue | 6.38 cycles |
| front end | 6.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 4.00 | 4.00 | 4.00 | 4.00 | 5.50 | 5.50 | 5.50 | 5.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 |
| cycles | 2.50 | 2.50 | 4.00 | 4.00 | 4.00 | 4.00 | 5.50 | 5.50 | 5.50 | 5.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 6.38 |
| Dispatch | 5.50 |
| Overall L1 | 6.38 |
| all | 46% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 73% |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 31% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 64% |
| all | 54% |
| load | 28% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 74% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 25% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| all | 45% |
| load | 28% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 25% |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 68% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ADD X14, SP, #128 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X26, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X30, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X18, XZR, X9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X17, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR D21, [X14, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X14, SP, #192 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR D22, [X14, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X14, SP, #256 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR D23, [X14, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ORR X14, XZR, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| B 43c9a0 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x700> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD X26, X26, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| FMOV D8, D1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ADD X14, X14, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ADD X17, X17, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X18, X18, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X30, X30, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CMP X26, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.EQ 43c920 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x680> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR D24, [X5, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| LDR D0, [SP, #72] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| FMOV D1, D8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ORR X4, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| FMUL D10, D24, D21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D9, D22, D24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D30, D23, D24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| LDR D24, [X6, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| FMUL D26, D0, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| LDP D0, D2, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | vect (50.0%) |
| FMUL D27, D8, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D25, D2, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D31, D24, D22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D28, D24, D23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| LDR D24, [X7, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| FMUL D29, D24, D23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D24, D0, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| TBZ W16, #0, 43caec <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x84c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| B 43c980 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x6e0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| DUP Z10.D, Z10.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z9.D, Z9.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| ORR P2.B, P0/Z, P0.B, P0.B | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | vect (12.5%) |
| DUP Z30.D, Z30.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z31.D, Z31.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z28.D, Z28.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z29.D, Z29.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z27.D, Z27.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z26.D, Z26.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z25.D, Z25.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z24.D, Z24.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| HINT #0 | N/A | ||||||||||||||||||
| HINT #0 | N/A | ||||||||||||||||||
| B 43c980 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x6e0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Function | void miniqmcreference::MultiBsplineEvalRef::evaluate_vgh |
| Source file and lines | MultiBsplineRef.hpp:226-262 |
| Module | exec |
| nb instructions | 53 |
| nb uops | 51 |
| loop length | 212 |
| used w registers | 1 |
| used x registers | 16 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 17 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 10 |
| nb stack references | 2 |
| micro-operation queue | 6.38 cycles |
| front end | 6.38 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 2.50 | 2.50 | 4.00 | 4.00 | 4.00 | 4.00 | 5.50 | 5.50 | 5.50 | 5.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 |
| cycles | 2.50 | 2.50 | 4.00 | 4.00 | 4.00 | 4.00 | 5.50 | 5.50 | 5.50 | 5.50 | 2.67 | 2.67 | 2.67 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 6.38 |
| Dispatch | 5.50 |
| Overall L1 | 6.38 |
| all | 46% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 73% |
| all | 0% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 31% |
| load | 12% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 0% |
| add-sub | 0% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 64% |
| all | 54% |
| load | 28% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| other | 74% |
| all | 25% |
| load | NA (no load vectorizable/vectorized instructions) |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 25% |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| all | 45% |
| load | 28% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | 25% |
| add-sub | 25% |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 68% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ADD X14, SP, #128 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X26, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ORR X30, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X18, XZR, X9 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ORR X17, XZR, X2 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR D21, [X14, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X14, SP, #192 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR D22, [X14, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X14, SP, #256 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR D23, [X14, X10,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ORR X14, XZR, X3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| B 43c9a0 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x700> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD X26, X26, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| FMOV D8, D1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ADD X14, X14, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| ADD X17, X17, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X18, X18, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| ADD X30, X30, X12 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| CMP X26, #4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.EQ 43c920 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x680> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR D24, [X5, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| LDR D0, [SP, #72] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| FMOV D1, D8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ORR X4, XZR, XZR | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| FMUL D10, D24, D21 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D9, D22, D24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D30, D23, D24 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| LDR D24, [X6, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| FMUL D26, D0, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| LDP D0, D2, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | vect (50.0%) |
| FMUL D27, D8, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D25, D2, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D31, D24, D22 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D28, D24, D23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| LDR D24, [X7, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| FMUL D29, D24, D23 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| FMUL D24, D0, D30 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 3 | 0.25 | scal (25.0%) |
| TBZ W16, #0, 43caec <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x84c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| B 43c980 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x6e0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| DUP Z10.D, Z10.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z9.D, Z9.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| ORR P2.B, P0/Z, P0.B, P0.B | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | vect (12.5%) |
| DUP Z30.D, Z30.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z31.D, Z31.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z28.D, Z28.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z29.D, Z29.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z27.D, Z27.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z26.D, Z26.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z25.D, Z25.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| DUP Z24.D, Z24.D[0] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | vect (100.0%) |
| HINT #0 | N/A | ||||||||||||||||||
| HINT #0 | N/A | ||||||||||||||||||
| B 43c980 <_ZN16miniqmcreference19MultiBsplineEvalRef12evaluate_vghIdEEvPKN11qmcplusplus14bspline_traitsIT_Lj3EE10SplineTypeES4_S4_S4_PS4_S9_S9_m+0x6e0> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Run 1x1 | Number processes: 1<br>Number nodes: NA<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_NUM_THREADS: 1<br>OMP_PLACES: threads |
|---|---|
| Run 1x2 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 2<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x4 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 4<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x8 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 8<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x16 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 16<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x24 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 24<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x32 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 32<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x40 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 40<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x48 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 48<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x56 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 56<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| Run 1x64 | Number processes: 1<br>Run Command: <executable> -g "4 2 2" -b<br>MPI Command: mpirun -n <number_processes> --bind-to core --map-by package:PE=64 --rank-by fill --report-bindings<br>Dataset:<br>Run Directory: /home/eoseret/qaas/qaas_runs/178-212-9071/intel/miniqmc/run/oneview_runs/multicore/armclang/oneview_run_1782144418<br>OMP_NUM_THREADS: 64<br>OMP_PROC_BIND: spread<br>OMP_DISPLAY_AFFINITY: TRUE<br>OMP_AFFINITY_FORMAT: 'OMP: pid %P tid %i thread %n bound to OS proc set {%A}'<br>OMP_DISPLAY_ENV: TRUE<br>OMP_PLACES: threads |
| (1x1) Efficiency | (1x1) Potential Speed-Up (%) | (1x2) Efficiency | (1x2) Potential Speed-Up (%) | (1x4) Efficiency | (1x4) Potential Speed-Up (%) | (1x8) Efficiency | (1x8) Potential Speed-Up (%) | (1x16) Efficiency | (1x16) Potential Speed-Up (%) | (1x24) Efficiency | (1x24) Potential Speed-Up (%) | (1x32) Efficiency | (1x32) Potential Speed-Up (%) | (1x40) Efficiency | (1x40) Potential Speed-Up (%) | (1x48) Efficiency | (1x48) Potential Speed-Up (%) | (1x56) Efficiency | (1x56) Potential Speed-Up (%) | (1x64) Efficiency | (1x64) Potential Speed-Up (%) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| Run | Number of threads | Efficiency (ideal is 1) | Speedup | Ideal Speedup | Time (s) | Coverage (%) |
|---|---|---|---|---|---|---|
| 1x1 | ||||||
| 1x2 | 1 | 1 | 1 | 2 | 0.0099999997764826 | 0.011293084360659 |
| 1x4 | 4 | 1 | 1 | 4 | 0.0099999997764826 | 0.019353108480573 |
| 1x8 | 5 | 1 | 1 | 8 | 0.025000000372529 | 0.012852004729211 |
| 1x16 | 5 | 1 | 1 | 16 | 0.014999999664724 | 0.0053843995556235 |
| 1x24 | 6 | 1 | 1 | 24 | 0.015000000596046 | 0.0037153046578169 |
| 1x32 | 12 | 1 | 1 | 32 | 0.014999999664724 | 0.004243163857609 |
| 1x40 | 6 | 1 | 1 | 40 | 0.025000000372529 | 0.0024658960755914 |
| 1x48 | 9 | 1 | 1 | 48 | 0.019999999552965 | 0.0024838463868946 |
| 1x56 | 11 | 1 | 1 | 56 | 0.029999999329448 | 0.0026210828218609 |
| 1x64 | 10 | 1 | 1 | 64 | 0.02000000141561 | 0.001812873291783 |
