| Loop Id: 2943 | Module: exec | Source: par_csr_matop.c:865-989 [...] | Coverage: 0.21% |
|---|
| Loop Id: 2943 | Module: exec | Source: par_csr_matop.c:865-989 [...] | Coverage: 0.21% |
|---|
(2942) 0x488240 LDUR X8, [X29, #432] |
(2942) 0x488244 ORR X19, XZR, X2 |
(2942) 0x488248 ORR X22, XZR, X1 |
(2942) 0x48824c CMP X26, X8 |
(2942) 0x488250 B.GE 488590 |
(2942) 0x488254 LDUR X8, [X29, #424] |
(2942) 0x488258 LDR X8, [X8] |
(2942) 0x48825c CBZ X8, 48827c |
(2942) 0x488260 LDUR X8, [X29, #408] |
(2942) 0x488264 ADD X1, X22, #1 |
(2942) 0x488268 STR X22, [X0, X26,LSL #3] |
(2942) 0x48826c LDR D0, [X8] |
(2942) 0x488270 LDP X8, X9, [SP, #104] |
(2942) 0x488274 STR X26, [X8, X22,LSL #3] |
(2942) 0x488278 STR D0, [X9, X22,LSL #3] |
(2942) 0x48827c LDUR X8, [X29, #416] |
(2942) 0x488280 LDR X8, [X8] |
(2942) 0x488284 CBZ X8, 488400 |
(2942) 0x488288 LDR X8, [SP, #96] |
(2942) 0x48828c ADD X3, X8, X26,LSL #3 |
(2942) 0x488290 LDP X4, X8, [X3] |
(2942) 0x488294 CMP X4, X8 |
(2942) 0x488298 B.GE 488420 |
(2942) 0x48829c LDP X9, X8, [SP, #48] |
(2942) 0x4882a0 ORR X2, XZR, X19 |
(2942) 0x4882a4 LDR X5, [X9] |
(2942) 0x4882a8 LDR X6, [X8] |
(2942) 0x4882ac B 4882d0 |
(2946) 0x4882c0 LDR X8, [X3, #8] |
(2946) 0x4882c4 ADD X4, X4, #1 |
(2946) 0x4882c8 CMP X4, X8 |
(2946) 0x4882cc B.GE 488404 |
(2946) 0x4882d0 LDP X8, X9, [X29, #984] |
(2946) 0x4882d4 LDR X30, [X9, X4,LSL #3] |
(2946) 0x4882d8 LDR D0, [X8, X4,LSL #3] |
(2946) 0x4882dc ADD X7, X30, #1 |
(2946) 0x4882e0 LDR X8, [X13, X30,LSL #3] |
(2946) 0x4882e4 LDR X23, [X13, X7,LSL #3] |
(2946) 0x4882e8 CMP X8, X23 |
(2946) 0x4882ec B.GE 48836c |
(2946) 0x4882f0 LDUR X9, [X29, #440] |
(2946) 0x4882f4 LDP X10, X11, [X29, #1000] |
(2946) 0x4882f8 LDR X9, [X9] |
(2946) 0x4882fc LDR X10, [X10] |
(2946) 0x488300 LDR X27, [X11] |
(2946) 0x488304 B 488324 |
(2948) 0x488308 LDR D1, [X9, X8,LSL #3] |
(2948) 0x48830c LDR D2, [X10, X11,LSL #3] |
(2948) 0x488310 FMADD D1, D1, D0, D2 |
(2948) 0x488314 STR D1, [X10, X11,LSL #3] |
(2948) 0x488318 ADD X8, X8, #1 |
(2948) 0x48831c CMP X8, X23 |
(2948) 0x488320 B.GE 48836c |
(2948) 0x488324 LDR X11, [X20] |
(2948) 0x488328 LDR X12, [X5, X8,LSL #3] |
(2948) 0x48832c ADD X24, X12, X11 |
(2948) 0x488330 LDR X11, [X0, X24,LSL #3] |
(2948) 0x488334 CMP X11, X19 |
(2948) 0x488338 B.GE 488308 |
(2948) 0x48833c STR X2, [X0, X24,LSL #3] |
(2948) 0x488340 LDR D1, [X9, X8,LSL #3] |
(2948) 0x488344 LDR X11, [X20] |
(2948) 0x488348 SUB X11, X24, X11 |
(2948) 0x48834c STR X11, [X27, X2,LSL #3] |
(2948) 0x488350 FMUL D1, D1, D0 |
(2948) 0x488354 LDR X23, [X13, X7,LSL #3] |
(2948) 0x488358 STR D1, [X10, X2,LSL #3] |
(2948) 0x48835c ADD X2, X2, #1 |
(2948) 0x488360 ADD X8, X8, #1 |
(2948) 0x488364 CMP X8, X23 |
(2948) 0x488368 B.LT 488324 |
(2946) 0x48836c LDR X8, [X14, X30,LSL #3] |
(2946) 0x488370 LDR X23, [X14, X7,LSL #3] |
(2946) 0x488374 CMP X8, X23 |
(2946) 0x488378 B.GE 4882c0 |
(2946) 0x48837c LDUR X9, [X29, #448] |
(2946) 0x488380 LDR X10, [X25] |
(2946) 0x488384 LDR X27, [X21] |
(2946) 0x488388 LDR X9, [X9] |
(2946) 0x48838c B 4883bc |
(2947) 0x4883a0 LDR D1, [X9, X8,LSL #3] |
(2947) 0x4883a4 LDR D2, [X10, X11,LSL #3] |
(2947) 0x4883a8 FMADD D1, D1, D0, D2 |
(2947) 0x4883ac STR D1, [X10, X11,LSL #3] |
(2947) 0x4883b0 ADD X8, X8, #1 |
(2947) 0x4883b4 CMP X8, X23 |
(2947) 0x4883b8 B.GE 4882c0 |
(2947) 0x4883bc LDR X24, [X6, X8,LSL #3] |
(2947) 0x4883c0 LDR X11, [X0, X24,LSL #3] |
(2947) 0x4883c4 CMP X11, X22 |
(2947) 0x4883c8 B.GE 4883a0 |
(2947) 0x4883cc LDR D1, [X9, X8,LSL #3] |
(2947) 0x4883d0 STR X1, [X0, X24,LSL #3] |
(2947) 0x4883d4 STR X24, [X27, X1,LSL #3] |
(2947) 0x4883d8 LDR X23, [X14, X7,LSL #3] |
(2947) 0x4883dc FMUL D1, D1, D0 |
(2947) 0x4883e0 STR D1, [X10, X1,LSL #3] |
(2947) 0x4883e4 ADD X1, X1, #1 |
(2947) 0x4883e8 ADD X8, X8, #1 |
(2947) 0x4883ec CMP X8, X23 |
(2947) 0x4883f0 B.LT 4883bc |
(2946) 0x4883f4 B 4882c0 |
(2942) 0x488400 ORR X2, XZR, X19 |
(2942) 0x488404 LDR X3, [X15, X26,LSL #3] |
(2942) 0x488408 ADD X26, X26, #1 |
(2942) 0x48840c LDR X8, [X15, X26,LSL #3] |
(2942) 0x488410 CMP X3, X8 |
(2942) 0x488414 B.GE 488240 |
0x488418 B 488438 |
(2942) 0x488420 ORR X2, XZR, X19 |
(2942) 0x488424 LDR X3, [X15, X26,LSL #3] |
(2942) 0x488428 ADD X26, X26, #1 |
(2942) 0x48842c LDR X8, [X15, X26,LSL #3] |
(2942) 0x488430 CMP X3, X8 |
(2942) 0x488434 B.GE 488240 |
0x488438 LDP X9, X8, [SP, #64] |
0x48843c LDR X4, [X9] |
0x488440 LDR X5, [X8] |
0x488444 LDP X9, X8, [SP, #80] |
0x488448 LDR X6, [X9] |
0x48844c LDR X7, [X8] |
0x488450 B 488470 |
0x488460 LDR X8, [X15, X26,LSL #3] |
0x488464 ADD X3, X3, #1 |
0x488468 CMP X3, X8 |
0x48846c B.GE 488240 |
0x488470 LDR X8, [X16, X3,LSL #3] |
0x488474 LDR D0, [X17, X3,LSL #3] |
0x488478 ADD X30, X8, #1 |
0x48847c LDR X9, [X18, X8,LSL #3] |
0x488480 LDR X27, [X18, X30,LSL #3] |
0x488484 CMP X9, X27 |
0x488488 B.GE 4884f4 |
0x48848c LDUR X10, [X29, #464] |
0x488490 LDR X23, [X25] |
0x488494 LDR X24, [X21] |
0x488498 LDR X10, [X10] |
0x48849c B 4884bc |
(2945) 0x4884a0 LDR D1, [X10, X9,LSL #3] |
(2945) 0x4884a4 LDR D2, [X23, X12,LSL #3] |
(2945) 0x4884a8 FMADD D1, D1, D0, D2 |
(2945) 0x4884ac STR D1, [X23, X12,LSL #3] |
(2945) 0x4884b0 ADD X9, X9, #1 |
(2945) 0x4884b4 CMP X9, X27 |
(2945) 0x4884b8 B.GE 4884f4 |
(2945) 0x4884bc LDR X11, [X4, X9,LSL #3] |
(2945) 0x4884c0 LDR X12, [X0, X11,LSL #3] |
(2945) 0x4884c4 CMP X12, X22 |
(2945) 0x4884c8 B.GE 4884a0 |
(2945) 0x4884cc LDR D1, [X10, X9,LSL #3] |
(2945) 0x4884d0 STR X1, [X0, X11,LSL #3] |
(2945) 0x4884d4 STR X11, [X24, X1,LSL #3] |
(2945) 0x4884d8 LDR X27, [X18, X30,LSL #3] |
(2945) 0x4884dc FMUL D1, D1, D0 |
(2945) 0x4884e0 STR D1, [X23, X1,LSL #3] |
(2945) 0x4884e4 ADD X1, X1, #1 |
(2945) 0x4884e8 ADD X9, X9, #1 |
(2945) 0x4884ec CMP X9, X27 |
(2945) 0x4884f0 B.LT 4884bc |
0x4884f4 LDR X9, [X28] |
0x4884f8 CBZ X9, 488460 |
0x4884fc LDR X8, [X5, X8,LSL #3] |
0x488500 LDR X23, [X5, X30,LSL #3] |
0x488504 CMP X8, X23 |
0x488508 B.GE 488460 |
0x48850c LDUR X9, [X29, #456] |
0x488510 LDP X10, X11, [X29, #1000] |
0x488514 LDR X9, [X9] |
0x488518 LDR X10, [X10] |
0x48851c LDR X27, [X11] |
0x488520 B 488540 |
(2944) 0x488524 LDR D1, [X9, X8,LSL #3] |
(2944) 0x488528 LDR D2, [X10, X11,LSL #3] |
(2944) 0x48852c FMADD D1, D1, D0, D2 |
(2944) 0x488530 STR D1, [X10, X11,LSL #3] |
(2944) 0x488534 ADD X8, X8, #1 |
(2944) 0x488538 CMP X8, X23 |
(2944) 0x48853c B.GE 488460 |
(2944) 0x488540 LDR X12, [X7, X8,LSL #3] |
(2944) 0x488544 LDR X11, [X20] |
(2944) 0x488548 LDR X12, [X6, X12,LSL #3] |
(2944) 0x48854c ADD X24, X12, X11 |
(2944) 0x488550 LDR X11, [X0, X24,LSL #3] |
(2944) 0x488554 CMP X11, X19 |
(2944) 0x488558 B.GE 488524 |
(2944) 0x48855c STR X2, [X0, X24,LSL #3] |
(2944) 0x488560 LDR D1, [X9, X8,LSL #3] |
(2944) 0x488564 LDR X11, [X20] |
(2944) 0x488568 SUB X11, X24, X11 |
(2944) 0x48856c STR X11, [X27, X2,LSL #3] |
(2944) 0x488570 FMUL D1, D1, D0 |
(2944) 0x488574 LDR X23, [X5, X30,LSL #3] |
(2944) 0x488578 STR D1, [X10, X2,LSL #3] |
(2944) 0x48857c ADD X2, X2, #1 |
(2944) 0x488580 ADD X8, X8, #1 |
(2944) 0x488584 CMP X8, X23 |
(2944) 0x488588 B.LT 488540 |
0x48858c B 488460 |
/home/eoseret/qaas/qaas_runs/178-188-3659/intel/AMG/build/AMG/AMG/parcsr_mv/par_csr_matop.c: 865 - 989 |
-------------------------------------------------------------------------------- |
865: for (i1 = ns; i1 < ne; i1++) |
[...] |
874: if ( allsquare ) |
875: { |
876: B_marker[i1] = jj_count_diag; |
877: C_diag_data[jj_count_diag] = zero; |
878: C_diag_j[jj_count_diag] = i1; |
879: jj_count_diag++; |
[...] |
886: if (num_cols_offd_A) |
887: { |
888: for (jj2 = A_offd_i[i1]; jj2 < A_offd_i[i1+1]; jj2++) |
889: { |
890: i2 = A_offd_j[jj2]; |
891: a_entry = A_offd_data[jj2]; |
[...] |
897: for (jj3 = B_ext_offd_i[i2]; jj3 < B_ext_offd_i[i2+1]; jj3++) |
898: { |
899: i3 = num_cols_diag_B+B_ext_offd_j[jj3]; |
[...] |
907: if (B_marker[i3] < jj_row_begin_offd) |
908: { |
909: B_marker[i3] = jj_count_offd; |
910: C_offd_data[jj_count_offd] = a_entry*B_ext_offd_data[jj3]; |
911: C_offd_j[jj_count_offd] = i3-num_cols_diag_B; |
912: jj_count_offd++; |
913: } |
914: else |
915: C_offd_data[B_marker[i3]] += a_entry*B_ext_offd_data[jj3]; |
916: } |
917: for (jj3 = B_ext_diag_i[i2]; jj3 < B_ext_diag_i[i2+1]; jj3++) |
918: { |
919: i3 = B_ext_diag_j[jj3]; |
920: if (B_marker[i3] < jj_row_begin_diag) |
921: { |
922: B_marker[i3] = jj_count_diag; |
923: C_diag_data[jj_count_diag] = a_entry*B_ext_diag_data[jj3]; |
924: C_diag_j[jj_count_diag] = i3; |
925: jj_count_diag++; |
926: } |
927: else |
928: C_diag_data[B_marker[i3]] += a_entry*B_ext_diag_data[jj3]; |
[...] |
937: for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++) |
938: { |
939: i2 = A_diag_j[jj2]; |
940: a_entry = A_diag_data[jj2]; |
[...] |
946: for (jj3 = B_diag_i[i2]; jj3 < B_diag_i[i2+1]; jj3++) |
947: { |
948: i3 = B_diag_j[jj3]; |
[...] |
956: if (B_marker[i3] < jj_row_begin_diag) |
957: { |
958: B_marker[i3] = jj_count_diag; |
959: C_diag_data[jj_count_diag] = a_entry*B_diag_data[jj3]; |
960: C_diag_j[jj_count_diag] = i3; |
961: jj_count_diag++; |
962: } |
963: else |
964: { |
965: C_diag_data[B_marker[i3]] += a_entry*B_diag_data[jj3]; |
966: } |
967: } |
968: if (num_cols_offd_B) |
969: { |
970: for (jj3 = B_offd_i[i2]; jj3 < B_offd_i[i2+1]; jj3++) |
971: { |
972: i3 = num_cols_diag_B+map_B_to_C[B_offd_j[jj3]]; |
[...] |
980: if (B_marker[i3] < jj_row_begin_offd) |
981: { |
982: B_marker[i3] = jj_count_offd; |
983: C_offd_data[jj_count_offd] = a_entry*B_offd_data[jj3]; |
984: C_offd_j[jj_count_offd] = i3-num_cols_diag_B; |
985: jj_count_offd++; |
986: } |
987: else |
988: { |
989: C_offd_data[B_marker[i3]] += a_entry*B_offd_data[jj3]; |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►98.57+ | __kmp_invoke_microtask | | libomp.so |
| ○ | __kmp_invoke_task_func | | libomp.so |
| ○ | __kmp_launch_thread | | libomp.so |
| ○ | __kmp_launch_worker(void*) | | libomp.so |
| ○ | start_thread | | libc.so.6 |
| ○ | thread_start | | libc.so.6 |
| ►1.43+ | __kmp_invoke_microtask | | libomp.so |
| ○ | __kmp_invoke_task_func | | libomp.so |
| ○ | __kmp_fork_call | | libomp.so |
| ○ | __kmpc_fork_call | | libomp.so |
| ○ | hypre_ParMatmul | par_csr_matop.c:999 | exec |
| ○ | hypre_BoomerAMGSetup | par_amg_setup.c:1226 | exec |
| ○ | hypre_PCGSetup | pcg.c:234 | exec |
| ○ | main | amg.c:398 | exec |
| ○ | __libc_start_call_main | | libc.so.6 |
| ○ | __libc_start_main | | libc.so.6 |
| ○ | _start | | exec |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.87 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 4.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.66 |
| Bottlenecks | P10, P11, P12 |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source | par_csr_matop.c:937-940,par_csr_matop.c:946-946,par_csr_matop.c:968-970 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.67 |
| CQA cycles if no scalar integer | 2.67 |
| CQA cycles if FP arith vectorized | 7.67 |
| CQA cycles if fully vectorized | 1.92 |
| Front-end cycles | 4.63 |
| P0 cycles | 4.50 |
| P1 cycles | 4.50 |
| P2 cycles | 1.25 |
| P3 cycles | 1.25 |
| P4 cycles | 1.25 |
| P5 cycles | 1.25 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 7.67 |
| P11 cycles | 7.67 |
| P12 cycles | 7.67 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 37.00 |
| Nb uops | 37.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 25.00 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 25.00 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.87 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 4.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.66 |
| Bottlenecks | P10, P11, P12 |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source | par_csr_matop.c:937-940,par_csr_matop.c:946-946,par_csr_matop.c:968-970 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.67 |
| CQA cycles if no scalar integer | 2.67 |
| CQA cycles if FP arith vectorized | 7.67 |
| CQA cycles if fully vectorized | 1.92 |
| Front-end cycles | 4.63 |
| P0 cycles | 4.50 |
| P1 cycles | 4.50 |
| P2 cycles | 1.25 |
| P3 cycles | 1.25 |
| P4 cycles | 1.25 |
| P5 cycles | 1.25 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 7.67 |
| P11 cycles | 7.67 |
| P12 cycles | 7.67 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 37.00 |
| Nb uops | 37.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 25.00 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 25.00 |
| Path / |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source file and lines | par_csr_matop.c:865-989 |
| Module | exec |
| nb instructions | 37 |
| nb uops | 37 |
| loop length | 148 |
| used w registers | 0 |
| used x registers | 22 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 1 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 2 |
| micro-operation queue | 4.63 cycles |
| front end | 4.63 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| cycles | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 4.63 |
| Dispatch | 7.67 |
| Overall L1 | 7.67 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B 488438 <hypre_ParMatmul.omp_outlined.6+0x408> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDP X9, X8, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X4, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X5, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X9, X8, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X6, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X7, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 488470 <hypre_ParMatmul.omp_outlined.6+0x440> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X15, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| ADD X3, X3, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP X3, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 488240 <hypre_ParMatmul.omp_outlined.6+0x210> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X16, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR D0, [X17, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X30, X8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR X9, [X18, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X18, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| CMP X9, X27 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 4884f4 <hypre_ParMatmul.omp_outlined.6+0x4c4> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X10, [X29, #464] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X25] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X24, [X21] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| B 4884bc <hypre_ParMatmul.omp_outlined.6+0x48c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X9, [X28] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CBZ X9, 488460 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X5, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X5, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CMP X8, X23 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.GE 488460 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X9, [X29, #456] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X10, X11, [X29, #1000] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X9, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 488540 <hypre_ParMatmul.omp_outlined.6+0x510> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| B 488460 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source file and lines | par_csr_matop.c:865-989 |
| Module | exec |
| nb instructions | 37 |
| nb uops | 37 |
| loop length | 148 |
| used w registers | 0 |
| used x registers | 22 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 1 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 2 |
| micro-operation queue | 4.63 cycles |
| front end | 4.63 cycles |
| | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| cycles | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 4.63 |
| Dispatch | 7.67 |
| Overall L1 | 7.67 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B 488438 <hypre_ParMatmul.omp_outlined.6+0x408> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDP X9, X8, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X4, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X5, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X9, X8, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X6, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X7, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 488470 <hypre_ParMatmul.omp_outlined.6+0x440> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X15, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| ADD X3, X3, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP X3, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 488240 <hypre_ParMatmul.omp_outlined.6+0x210> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X16, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR D0, [X17, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X30, X8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR X9, [X18, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X18, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| CMP X9, X27 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 4884f4 <hypre_ParMatmul.omp_outlined.6+0x4c4> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X10, [X29, #464] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X25] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X24, [X21] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| B 4884bc <hypre_ParMatmul.omp_outlined.6+0x48c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X9, [X28] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CBZ X9, 488460 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X5, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X5, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CMP X8, X23 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.GE 488460 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X9, [X29, #456] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X10, X11, [X29, #1000] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X9, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 488540 <hypre_ParMatmul.omp_outlined.6+0x510> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| B 488460 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
