| Function: cg_calc_ur(int, int, int, double, double*, double*, double const*, double*, double const*) ... | Module: exec | Source: cg.cpp:105-113 | Coverage (incl. loops): 43.51% | (excl. loops): 0.02% |
|---|
| Function: cg_calc_ur(int, int, int, double, double*, double*, double const*, double*, double const*) ... | Module: exec | Source: cg.cpp:105-113 | Coverage (incl. loops): 43.51% | (excl. loops): 0.02% |
|---|
/home/eoseret/qaas/qaas_runs/178-237-4322/intel/TeaLeaf/build/TeaLeaf/src/omp/cg.cpp: 105 - 113 |
-------------------------------------------------------------------------------- |
105: #pragma omp parallel for reduction(+ : rrn_temp) |
106: #endif |
107: for (int jj = halo_depth; jj < y - halo_depth; ++jj) { |
108: for (int kk = halo_depth; kk < x - halo_depth; ++kk) { |
109: const int index = kk + jj * x; |
110: |
111: u[index] += alpha * p[index]; |
112: r[index] -= alpha * w[index]; |
113: rrn_temp += r[index] * r[index]; |
0x411880 STP X29, X30, [SP, #-48]! |
0x411884 ADD X29, SP, #0 |
0x411888 STP X19, X20, [SP, #16] |
0x41188c ORR X19, XZR, X0 |
0x411890 LDR W20, [X0, #56] |
0x411894 STR X21, [SP, #32] |
0x411898 BL 410100 |
0x41189c ORR W21, WZR, W0 |
0x4118a0 BL 410110 |
0x4118a4 LDR W1, [X19, #52] |
0x4118a8 ORR W10, WZR, W0 |
0x4118ac SUB W0, W1, W20,LSL #1 |
0x4118b0 SDIV W2, W0, W21 |
0x4118b4 MSUB W3, W2, W21, W0 |
0x4118b8 CMP W10, W3 |
0x4118bc B.LT 411ab0 |
0x4118c0 MADD W4, W2, W10, W3 |
0x4118c4 MOVI D31, #0 |
0x4118c8 ADD W5, W2, W4 |
0x4118cc CMP W4, W5 |
0x4118d0 B.GE 411a7c |
0x4118d4 LDR W12, [X19, #48] |
0x4118d8 ADD W10, W20, W4 |
0x4118dc SBFM X15, X20, #0, #31 |
0x4118e0 LDP X18, X8, [X19, #8] |
0x4118e4 ADD W16, W20, W5 |
0x4118e8 LDP X30, X7, [X19, #24] |
0x4118ec SUB W13, W12, W20,LSL #1 |
0x4118f0 SUB W14, W12, W20 |
0x4118f4 MUL W11, W10, W12 |
0x4118f8 ADD X17, X13, X15 |
0x4118fc LDR D30, [X19] |
(18) 0x411900 CMP W20, W14 |
(18) 0x411904 B.GE 411a6c |
(18) 0x411908 SBFM X6, X11, #0, #31 |
(18) 0x41190c ADD X9, X17, X6 |
(18) 0x411910 ADD X21, X6, X15 |
(18) 0x411914 UBFM X0, X9, #61, #60 |
(18) 0x411918 UBFM X1, X21, #61, #60 |
(18) 0x41191c SUB X2, X0, X21,LSL #3 |
(18) 0x411920 SUB X3, X2, #8 |
(18) 0x411924 UBFM X4, X3, #3, #63 |
(18) 0x411928 ADD X5, X4, #1 |
(18) 0x41192c ANDS X13, X5, #0x3 |
(18) 0x411930 B.EQ 4119c4 |
(18) 0x411934 CMP X13, #1 |
(18) 0x411938 B.EQ 411994 |
(18) 0x41193c CMP X13, #2 |
(18) 0x411940 B.EQ 41196c |
(18) 0x411944 LDR D28, [X8, X1] |
(18) 0x411948 LDR D29, [X18, X1] |
(18) 0x41194c FMADD D0, D30, D28, D29 |
(18) 0x411950 STR D0, [X18, X1] |
(18) 0x411954 LDR D1, [X7, X1] |
(18) 0x411958 LDR D2, [X30, X1] |
(18) 0x41195c FMSUB D3, D30, D1, D2 |
(18) 0x411960 FMADD D31, D3, D3, D31 |
(18) 0x411964 STR D3, [X30, X1] |
(18) 0x411968 ADD X1, X1, #8 |
(18) 0x41196c LDR D4, [X8, X1] |
(18) 0x411970 LDR D5, [X18, X1] |
(18) 0x411974 FMADD D6, D30, D4, D5 |
(18) 0x411978 STR D6, [X18, X1] |
(18) 0x41197c LDR D7, [X7, X1] |
(18) 0x411980 LDR D16, [X30, X1] |
(18) 0x411984 FMSUB D17, D30, D7, D16 |
(18) 0x411988 FMADD D31, D17, D17, D31 |
(18) 0x41198c STR D17, [X30, X1] |
(18) 0x411990 ADD X1, X1, #8 |
(18) 0x411994 LDR D18, [X8, X1] |
(18) 0x411998 LDR D19, [X18, X1] |
(18) 0x41199c FMADD D20, D30, D18, D19 |
(18) 0x4119a0 STR D20, [X18, X1] |
(18) 0x4119a4 LDR D21, [X7, X1] |
(18) 0x4119a8 LDR D22, [X30, X1] |
(18) 0x4119ac FMSUB D23, D30, D21, D22 |
(18) 0x4119b0 FMADD D31, D23, D23, D31 |
(18) 0x4119b4 STR D23, [X30, X1] |
(18) 0x4119b8 ADD X1, X1, #8 |
(18) 0x4119bc CMP X0, X1 |
(18) 0x4119c0 B.EQ 411a6c |
(19) 0x4119c4 LDR D24, [X8, X1] |
(19) 0x4119c8 ADD X9, X1, #8 |
(19) 0x4119cc ADD X6, X1, #16 |
(19) 0x4119d0 ADD X21, X1, #24 |
(19) 0x4119d4 LDR D25, [X18, X1] |
(19) 0x4119d8 FMADD D26, D30, D24, D25 |
(19) 0x4119dc STR D26, [X18, X1] |
(19) 0x4119e0 LDR D27, [X7, X1] |
(19) 0x4119e4 LDR D28, [X30, X1] |
(19) 0x4119e8 FMSUB D29, D30, D27, D28 |
(19) 0x4119ec FMADD D0, D29, D29, D31 |
(19) 0x4119f0 STR D29, [X30, X1] |
(19) 0x4119f4 ADD X1, X1, #32 |
(19) 0x4119f8 LDR D1, [X8, X9] |
(19) 0x4119fc LDR D2, [X18, X9] |
(19) 0x411a00 FMADD D3, D30, D1, D2 |
(19) 0x411a04 STR D3, [X18, X9] |
(19) 0x411a08 LDR D4, [X7, X9] |
(19) 0x411a0c LDR D5, [X30, X9] |
(19) 0x411a10 FMSUB D6, D30, D4, D5 |
(19) 0x411a14 FMADD D7, D6, D6, D0 |
(19) 0x411a18 STR D6, [X30, X9] |
(19) 0x411a1c LDR D16, [X8, X6] |
(19) 0x411a20 LDR D17, [X18, X6] |
(19) 0x411a24 FMADD D18, D30, D16, D17 |
(19) 0x411a28 STR D18, [X18, X6] |
(19) 0x411a2c LDR D19, [X7, X6] |
(19) 0x411a30 LDR D20, [X30, X6] |
(19) 0x411a34 FMSUB D21, D30, D19, D20 |
(19) 0x411a38 FMADD D22, D21, D21, D7 |
(19) 0x411a3c STR D21, [X30, X6] |
(19) 0x411a40 LDR D31, [X18, X21] |
(19) 0x411a44 LDR D23, [X8, X21] |
(19) 0x411a48 FMADD D24, D30, D23, D31 |
(19) 0x411a4c STR D24, [X18, X21] |
(19) 0x411a50 LDR D25, [X7, X21] |
(19) 0x411a54 LDR D26, [X30, X21] |
(19) 0x411a58 FMSUB D27, D30, D25, D26 |
(19) 0x411a5c FMADD D31, D27, D27, D22 |
(19) 0x411a60 STR D27, [X30, X21] |
(19) 0x411a64 CMP X0, X1 |
(19) 0x411a68 B.NE 4119c4 |
(18) 0x411a6c ADD W10, W10, #1 |
(18) 0x411a70 ADD W11, W11, W12 |
(18) 0x411a74 CMP W16, W10 |
(18) 0x411a78 B.GT 411900 |
0x411a7c ADD X19, X19, #40 |
0x411a80 LDR X1, [X19] |
(17) 0x411a84 FMOV D30, X1 |
(17) 0x411a88 ORR X20, XZR, X1 |
(17) 0x411a8c FADD D28, D31, D30 |
(17) 0x411a90 FMOV X15, D28 |
(17) 0x411a94 CAS X20, X15, [X19] |
(17) 0x411a98 CMP X1, X20 |
(17) 0x411a9c B.NE 411abc |
0x411aa0 LDR X21, [SP, #32] |
0x411aa4 LDP X19, X20, [SP, #16] |
0x411aa8 LDP X29, X30, [SP], #48 |
0x411aac RET |
0x411ab0 ADD W2, W2, #1 |
0x411ab4 MOVZ W3, #0 |
0x411ab8 B 4118c0 |
(17) 0x411abc ORR X1, XZR, X20 |
(17) 0x411ac0 B 411a84 |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►98.42+ | omp_fulfill_event | libgomp.so.1.0.0 | |
| ○ | start_thread | libc.so.6 | |
| ○ | thread_start | libc.so.6 | |
| ►1.58+ | GOMP_parallel | libgomp.so.1.0.0 | |
| ○ | run_cg_calc_ur(Chunk*, Setting[...] | cg.cpp:105 | exec |
| ○ | cg_main_step_driver(Chunk*, Se[...] | cg_driver.cpp:69 | exec |
| ○ | cg_driver(Chunk*, Settings&, d[...] | cg_driver.cpp:18 | exec |
| ○ | solve(Chunk*, Settings&, int, [...] | diffuse.cpp:51 | exec |
| ○ | diffuse(Chunk*, Settings&) | diffuse.cpp:12 | exec |
| ○ | main | main.cpp:179 | exec |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | basic_string.h:809 | exec |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.02% of the application time for run gcc_5.
| Source file and lines | cg.cpp:105-113 |
| Module | exec |
| nb instructions | 41 |
| nb uops | 41 |
| loop length | 164 |
| used w registers | 15 |
| used x registers | 14 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 5.13 cycles |
| front end | 5.13 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 3.00 | 3.00 | 6.00 | 6.00 | 6.00 | 6.00 | 0.25 | 0.25 | 0.25 | 0.25 | 4.50 | 4.17 | 4.33 | 1.50 | 1.50 |
| cycles | 3.00 | 3.00 | 6.00 | 6.00 | 6.00 | 6.00 | 0.25 | 0.25 | 0.25 | 0.25 | 4.50 | 4.17 | 4.33 | 1.50 | 1.50 |
| Cycles executing div or sqrt instructions | 5.00-12.50 |
| Front-end | 5.13 |
| Dispatch | 6.00 |
| DIV/SQRT | 5.00-12.50 |
| Overall L1 | 6.00-12.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 29% |
| store | 41% |
| mul | 12% |
| add-sub | 16% |
| fma | 12% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-48]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ORR X19, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR W20, [X0, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| STR X21, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| BL 410100 <@plt_start@+0xe0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR W21, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| BL 410110 <@plt_start@+0xf0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR W1, [X19, #52] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| ORR W10, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SUB W0, W1, W20,LSL #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SDIV W2, W0, W21 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 5-12.50 | N/A |
| MSUB W3, W2, W21, W0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| CMP W10, W3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LT 411ab0 <_Z10cg_calc_uriiidPdS_PKdS_S1_._omp_fn.0+0x230> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| MADD W4, W2, W10, W3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| MOVI D31, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ADD W5, W2, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W4, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.GE 411a7c <_Z10cg_calc_uriiidPdS_PKdS_S1_._omp_fn.0+0x1fc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR W12, [X19, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| ADD W10, W20, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SBFM X15, X20, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| LDP X18, X8, [X19, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| ADD W16, W20, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| LDP X30, X7, [X19, #24] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| SUB W13, W12, W20,LSL #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SUB W14, W12, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| MUL W11, W10, W12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| ADD X17, X13, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR D30, [X19] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X19, X19, #40 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR X1, [X19] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X21, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDP X29, X30, [SP], #48 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD W2, W2, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ W3, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| B 4118c0 <_Z10cg_calc_uriiidPdS_PKdS_S1_._omp_fn.0+0x40> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.02% of the application time for run gcc_5.
| Source file and lines | cg.cpp:105-113 |
| Module | exec |
| nb instructions | 41 |
| nb uops | 41 |
| loop length | 164 |
| used w registers | 15 |
| used x registers | 14 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 2 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 6 |
| micro-operation queue | 5.13 cycles |
| front end | 5.13 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 3.00 | 3.00 | 6.00 | 6.00 | 6.00 | 6.00 | 0.25 | 0.25 | 0.25 | 0.25 | 4.50 | 4.17 | 4.33 | 1.50 | 1.50 |
| cycles | 3.00 | 3.00 | 6.00 | 6.00 | 6.00 | 6.00 | 0.25 | 0.25 | 0.25 | 0.25 | 4.50 | 4.17 | 4.33 | 1.50 | 1.50 |
| Cycles executing div or sqrt instructions | 5.00-12.50 |
| Front-end | 5.13 |
| Dispatch | 6.00 |
| DIV/SQRT | 5.00-12.50 |
| Overall L1 | 6.00-12.50 |
| all | 0% |
| load | 0% |
| store | 0% |
| mul | 0% |
| add-sub | 0% |
| fma | 0% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 29% |
| store | 41% |
| mul | 12% |
| add-sub | 16% |
| fma | 12% |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 26% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| STP X29, X30, [SP, #-48]! | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ADD X29, SP, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| STP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (50.0%) |
| ORR X19, XZR, X0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR W20, [X0, #56] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| STR X21, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0.50 | 0.50 | 1 | 0.50 | scal (25.0%) |
| BL 410100 <@plt_start@+0xe0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ORR W21, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| BL 410110 <@plt_start@+0xf0> | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR W1, [X19, #52] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| ORR W10, WZR, W0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SUB W0, W1, W20,LSL #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| SDIV W2, W0, W21 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5-12 | 5-12.50 | N/A |
| MSUB W3, W2, W21, W0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| CMP W10, W3 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.LT 411ab0 <_Z10cg_calc_uriiidPdS_PKdS_S1_._omp_fn.0+0x230> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| MADD W4, W2, W10, W3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | scal (12.5%) |
| MOVI D31, #0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 2 | 0.25 | scal (25.0%) |
| ADD W5, W2, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| CMP W4, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (12.5%) |
| B.GE 411a7c <_Z10cg_calc_uriiidPdS_PKdS_S1_._omp_fn.0+0x1fc> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR W12, [X19, #48] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (12.5%) |
| ADD W10, W20, W4 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SBFM X15, X20, #0, #31 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (100.0%) |
| LDP X18, X8, [X19, #8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| ADD W16, W20, W5 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| LDP X30, X7, [X19, #24] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| SUB W13, W12, W20,LSL #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| SUB W14, W12, W20 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| MUL W11, W10, W12 | 1 | 0 | 0 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0.50 | scal (12.5%) |
| ADD X17, X13, X15 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (25.0%) |
| LDR D30, [X19] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X19, X19, #40 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR X1, [X19] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X21, [SP, #32] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDP X19, X20, [SP, #16] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDP X29, X30, [SP], #48 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | scal (50.0%) |
| RET | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| ADD W2, W2, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| MOVZ W3, #0 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | scal (12.5%) |
| B 4118c0 <_Z10cg_calc_uriiidPdS_PKdS_S1_._omp_fn.0+0x40> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Name | Coverage (%) | Time (s) |
|---|---|---|
| ▼cg_calc_ur(int, int, int, double, double*, double*, double const*, double*, double const*) [clone ._omp_fn.0]– | 43.51 | 115.92 |
| ▼Loop 18 - cg.cpp:108-113 - exec– | 0.03 | 0.07 |
| ○Loop 19 - cg.cpp:108-113 - exec | 43.45 | 115.71 |
| ○Loop 17 - cg.cpp:105-105 - exec | 0.02 | 0.05 |
