| Loop Id: 2782 | Module: exec | Source: par_csr_matop.c:865-989 [...] | Coverage: 0.20% |
|---|
| Loop Id: 2782 | Module: exec | Source: par_csr_matop.c:865-989 [...] | Coverage: 0.20% |
|---|
(2781) 0x486000 LDUR X8, [X29, #432] |
(2781) 0x486004 ORR X19, XZR, X2 |
(2781) 0x486008 ORR X22, XZR, X1 |
(2781) 0x48600c CMP X26, X8 |
(2781) 0x486010 B.GE 486350 |
(2781) 0x486014 LDUR X8, [X29, #424] |
(2781) 0x486018 LDR X8, [X8] |
(2781) 0x48601c CBZ X8, 48603c |
(2781) 0x486020 LDUR X8, [X29, #408] |
(2781) 0x486024 ADD X1, X22, #1 |
(2781) 0x486028 STR X22, [X0, X26,LSL #3] |
(2781) 0x48602c LDR D0, [X8] |
(2781) 0x486030 LDP X8, X9, [SP, #104] |
(2781) 0x486034 STR X26, [X8, X22,LSL #3] |
(2781) 0x486038 STR D0, [X9, X22,LSL #3] |
(2781) 0x48603c LDUR X8, [X29, #416] |
(2781) 0x486040 LDR X8, [X8] |
(2781) 0x486044 CBZ X8, 4861c0 |
(2781) 0x486048 LDR X8, [SP, #96] |
(2781) 0x48604c ADD X3, X8, X26,LSL #3 |
(2781) 0x486050 LDP X4, X8, [X3] |
(2781) 0x486054 CMP X4, X8 |
(2781) 0x486058 B.GE 4861e0 |
(2781) 0x48605c LDP X9, X8, [SP, #48] |
(2781) 0x486060 ORR X2, XZR, X19 |
(2781) 0x486064 LDR X5, [X9] |
(2781) 0x486068 LDR X6, [X8] |
(2781) 0x48606c B 486090 |
(2785) 0x486080 LDR X8, [X3, #8] |
(2785) 0x486084 ADD X4, X4, #1 |
(2785) 0x486088 CMP X4, X8 |
(2785) 0x48608c B.GE 4861c4 |
(2785) 0x486090 LDP X8, X9, [X29, #984] |
(2785) 0x486094 LDR X30, [X9, X4,LSL #3] |
(2785) 0x486098 LDR D0, [X8, X4,LSL #3] |
(2785) 0x48609c ADD X7, X30, #1 |
(2785) 0x4860a0 LDR X8, [X13, X30,LSL #3] |
(2785) 0x4860a4 LDR X23, [X13, X7,LSL #3] |
(2785) 0x4860a8 CMP X8, X23 |
(2785) 0x4860ac B.GE 48612c |
(2785) 0x4860b0 LDUR X9, [X29, #440] |
(2785) 0x4860b4 LDP X10, X11, [X29, #1000] |
(2785) 0x4860b8 LDR X9, [X9] |
(2785) 0x4860bc LDR X10, [X10] |
(2785) 0x4860c0 LDR X27, [X11] |
(2785) 0x4860c4 B 4860e4 |
(2787) 0x4860c8 LDR D1, [X9, X8,LSL #3] |
(2787) 0x4860cc LDR D2, [X10, X11,LSL #3] |
(2787) 0x4860d0 FMADD D1, D0, D1, D2 |
(2787) 0x4860d4 STR D1, [X10, X11,LSL #3] |
(2787) 0x4860d8 ADD X8, X8, #1 |
(2787) 0x4860dc CMP X8, X23 |
(2787) 0x4860e0 B.GE 48612c |
(2787) 0x4860e4 LDR X11, [X20] |
(2787) 0x4860e8 LDR X12, [X5, X8,LSL #3] |
(2787) 0x4860ec ADD X24, X12, X11 |
(2787) 0x4860f0 LDR X11, [X0, X24,LSL #3] |
(2787) 0x4860f4 CMP X11, X19 |
(2787) 0x4860f8 B.GE 4860c8 |
(2787) 0x4860fc STR X2, [X0, X24,LSL #3] |
(2787) 0x486100 LDR D1, [X9, X8,LSL #3] |
(2787) 0x486104 LDR X11, [X20] |
(2787) 0x486108 SUB X11, X24, X11 |
(2787) 0x48610c STR X11, [X27, X2,LSL #3] |
(2787) 0x486110 FMUL D1, D0, D1 |
(2787) 0x486114 LDR X23, [X13, X7,LSL #3] |
(2787) 0x486118 STR D1, [X10, X2,LSL #3] |
(2787) 0x48611c ADD X2, X2, #1 |
(2787) 0x486120 ADD X8, X8, #1 |
(2787) 0x486124 CMP X8, X23 |
(2787) 0x486128 B.LT 4860e4 |
(2785) 0x48612c LDR X8, [X14, X30,LSL #3] |
(2785) 0x486130 LDR X23, [X14, X7,LSL #3] |
(2785) 0x486134 CMP X8, X23 |
(2785) 0x486138 B.GE 486080 |
(2785) 0x48613c LDUR X9, [X29, #448] |
(2785) 0x486140 LDR X10, [X25] |
(2785) 0x486144 LDR X27, [X21] |
(2785) 0x486148 LDR X9, [X9] |
(2785) 0x48614c B 48617c |
(2786) 0x486160 LDR D1, [X9, X8,LSL #3] |
(2786) 0x486164 LDR D2, [X10, X11,LSL #3] |
(2786) 0x486168 FMADD D1, D0, D1, D2 |
(2786) 0x48616c STR D1, [X10, X11,LSL #3] |
(2786) 0x486170 ADD X8, X8, #1 |
(2786) 0x486174 CMP X8, X23 |
(2786) 0x486178 B.GE 486080 |
(2786) 0x48617c LDR X24, [X6, X8,LSL #3] |
(2786) 0x486180 LDR X11, [X0, X24,LSL #3] |
(2786) 0x486184 CMP X11, X22 |
(2786) 0x486188 B.GE 486160 |
(2786) 0x48618c LDR D1, [X9, X8,LSL #3] |
(2786) 0x486190 STR X1, [X0, X24,LSL #3] |
(2786) 0x486194 STR X24, [X27, X1,LSL #3] |
(2786) 0x486198 LDR X23, [X14, X7,LSL #3] |
(2786) 0x48619c FMUL D1, D0, D1 |
(2786) 0x4861a0 STR D1, [X10, X1,LSL #3] |
(2786) 0x4861a4 ADD X1, X1, #1 |
(2786) 0x4861a8 ADD X8, X8, #1 |
(2786) 0x4861ac CMP X8, X23 |
(2786) 0x4861b0 B.LT 48617c |
(2785) 0x4861b4 B 486080 |
(2781) 0x4861c0 ORR X2, XZR, X19 |
(2781) 0x4861c4 LDR X3, [X15, X26,LSL #3] |
(2781) 0x4861c8 ADD X26, X26, #1 |
(2781) 0x4861cc LDR X8, [X15, X26,LSL #3] |
(2781) 0x4861d0 CMP X3, X8 |
(2781) 0x4861d4 B.GE 486000 |
0x4861d8 B 4861f8 |
(2781) 0x4861e0 ORR X2, XZR, X19 |
(2781) 0x4861e4 LDR X3, [X15, X26,LSL #3] |
(2781) 0x4861e8 ADD X26, X26, #1 |
(2781) 0x4861ec LDR X8, [X15, X26,LSL #3] |
(2781) 0x4861f0 CMP X3, X8 |
(2781) 0x4861f4 B.GE 486000 |
0x4861f8 LDP X9, X8, [SP, #64] |
0x4861fc LDR X4, [X9] |
0x486200 LDR X5, [X8] |
0x486204 LDP X9, X8, [SP, #80] |
0x486208 LDR X6, [X9] |
0x48620c LDR X7, [X8] |
0x486210 B 486230 |
0x486220 LDR X8, [X15, X26,LSL #3] |
0x486224 ADD X3, X3, #1 |
0x486228 CMP X3, X8 |
0x48622c B.GE 486000 |
0x486230 LDR X8, [X16, X3,LSL #3] |
0x486234 LDR D0, [X17, X3,LSL #3] |
0x486238 ADD X30, X8, #1 |
0x48623c LDR X9, [X18, X8,LSL #3] |
0x486240 LDR X27, [X18, X30,LSL #3] |
0x486244 CMP X9, X27 |
0x486248 B.GE 4862b4 |
0x48624c LDUR X10, [X29, #464] |
0x486250 LDR X23, [X25] |
0x486254 LDR X24, [X21] |
0x486258 LDR X10, [X10] |
0x48625c B 48627c |
(2784) 0x486260 LDR D1, [X10, X9,LSL #3] |
(2784) 0x486264 LDR D2, [X23, X12,LSL #3] |
(2784) 0x486268 FMADD D1, D0, D1, D2 |
(2784) 0x48626c STR D1, [X23, X12,LSL #3] |
(2784) 0x486270 ADD X9, X9, #1 |
(2784) 0x486274 CMP X9, X27 |
(2784) 0x486278 B.GE 4862b4 |
(2784) 0x48627c LDR X11, [X4, X9,LSL #3] |
(2784) 0x486280 LDR X12, [X0, X11,LSL #3] |
(2784) 0x486284 CMP X12, X22 |
(2784) 0x486288 B.GE 486260 |
(2784) 0x48628c LDR D1, [X10, X9,LSL #3] |
(2784) 0x486290 STR X1, [X0, X11,LSL #3] |
(2784) 0x486294 STR X11, [X24, X1,LSL #3] |
(2784) 0x486298 LDR X27, [X18, X30,LSL #3] |
(2784) 0x48629c FMUL D1, D0, D1 |
(2784) 0x4862a0 STR D1, [X23, X1,LSL #3] |
(2784) 0x4862a4 ADD X1, X1, #1 |
(2784) 0x4862a8 ADD X9, X9, #1 |
(2784) 0x4862ac CMP X9, X27 |
(2784) 0x4862b0 B.LT 48627c |
0x4862b4 LDR X9, [X28] |
0x4862b8 CBZ X9, 486220 |
0x4862bc LDR X8, [X5, X8,LSL #3] |
0x4862c0 LDR X23, [X5, X30,LSL #3] |
0x4862c4 CMP X8, X23 |
0x4862c8 B.GE 486220 |
0x4862cc LDUR X9, [X29, #456] |
0x4862d0 LDP X10, X11, [X29, #1000] |
0x4862d4 LDR X9, [X9] |
0x4862d8 LDR X10, [X10] |
0x4862dc LDR X27, [X11] |
0x4862e0 B 486300 |
(2783) 0x4862e4 LDR D1, [X9, X8,LSL #3] |
(2783) 0x4862e8 LDR D2, [X10, X11,LSL #3] |
(2783) 0x4862ec FMADD D1, D0, D1, D2 |
(2783) 0x4862f0 STR D1, [X10, X11,LSL #3] |
(2783) 0x4862f4 ADD X8, X8, #1 |
(2783) 0x4862f8 CMP X8, X23 |
(2783) 0x4862fc B.GE 486220 |
(2783) 0x486300 LDR X12, [X7, X8,LSL #3] |
(2783) 0x486304 LDR X11, [X20] |
(2783) 0x486308 LDR X12, [X6, X12,LSL #3] |
(2783) 0x48630c ADD X24, X12, X11 |
(2783) 0x486310 LDR X11, [X0, X24,LSL #3] |
(2783) 0x486314 CMP X11, X19 |
(2783) 0x486318 B.GE 4862e4 |
(2783) 0x48631c STR X2, [X0, X24,LSL #3] |
(2783) 0x486320 LDR D1, [X9, X8,LSL #3] |
(2783) 0x486324 LDR X11, [X20] |
(2783) 0x486328 SUB X11, X24, X11 |
(2783) 0x48632c STR X11, [X27, X2,LSL #3] |
(2783) 0x486330 FMUL D1, D0, D1 |
(2783) 0x486334 LDR X23, [X5, X30,LSL #3] |
(2783) 0x486338 STR D1, [X10, X2,LSL #3] |
(2783) 0x48633c ADD X2, X2, #1 |
(2783) 0x486340 ADD X8, X8, #1 |
(2783) 0x486344 CMP X8, X23 |
(2783) 0x486348 B.LT 486300 |
0x48634c B 486220 |
/home/eoseret/qaas/qaas_runs/178-188-3659/intel/AMG/build/AMG/AMG/parcsr_mv/par_csr_matop.c: 865 - 989 |
-------------------------------------------------------------------------------- |
865: for (i1 = ns; i1 < ne; i1++) |
[...] |
874: if ( allsquare ) |
875: { |
876: B_marker[i1] = jj_count_diag; |
877: C_diag_data[jj_count_diag] = zero; |
878: C_diag_j[jj_count_diag] = i1; |
879: jj_count_diag++; |
[...] |
886: if (num_cols_offd_A) |
887: { |
888: for (jj2 = A_offd_i[i1]; jj2 < A_offd_i[i1+1]; jj2++) |
889: { |
890: i2 = A_offd_j[jj2]; |
891: a_entry = A_offd_data[jj2]; |
[...] |
897: for (jj3 = B_ext_offd_i[i2]; jj3 < B_ext_offd_i[i2+1]; jj3++) |
898: { |
899: i3 = num_cols_diag_B+B_ext_offd_j[jj3]; |
[...] |
907: if (B_marker[i3] < jj_row_begin_offd) |
908: { |
909: B_marker[i3] = jj_count_offd; |
910: C_offd_data[jj_count_offd] = a_entry*B_ext_offd_data[jj3]; |
911: C_offd_j[jj_count_offd] = i3-num_cols_diag_B; |
912: jj_count_offd++; |
913: } |
914: else |
915: C_offd_data[B_marker[i3]] += a_entry*B_ext_offd_data[jj3]; |
916: } |
917: for (jj3 = B_ext_diag_i[i2]; jj3 < B_ext_diag_i[i2+1]; jj3++) |
918: { |
919: i3 = B_ext_diag_j[jj3]; |
920: if (B_marker[i3] < jj_row_begin_diag) |
921: { |
922: B_marker[i3] = jj_count_diag; |
923: C_diag_data[jj_count_diag] = a_entry*B_ext_diag_data[jj3]; |
924: C_diag_j[jj_count_diag] = i3; |
925: jj_count_diag++; |
926: } |
927: else |
928: C_diag_data[B_marker[i3]] += a_entry*B_ext_diag_data[jj3]; |
[...] |
937: for (jj2 = A_diag_i[i1]; jj2 < A_diag_i[i1+1]; jj2++) |
938: { |
939: i2 = A_diag_j[jj2]; |
940: a_entry = A_diag_data[jj2]; |
[...] |
946: for (jj3 = B_diag_i[i2]; jj3 < B_diag_i[i2+1]; jj3++) |
947: { |
948: i3 = B_diag_j[jj3]; |
[...] |
956: if (B_marker[i3] < jj_row_begin_diag) |
957: { |
958: B_marker[i3] = jj_count_diag; |
959: C_diag_data[jj_count_diag] = a_entry*B_diag_data[jj3]; |
960: C_diag_j[jj_count_diag] = i3; |
961: jj_count_diag++; |
962: } |
963: else |
964: { |
965: C_diag_data[B_marker[i3]] += a_entry*B_diag_data[jj3]; |
966: } |
967: } |
968: if (num_cols_offd_B) |
969: { |
970: for (jj3 = B_offd_i[i2]; jj3 < B_offd_i[i2+1]; jj3++) |
971: { |
972: i3 = num_cols_diag_B+map_B_to_C[B_offd_j[jj3]]; |
[...] |
980: if (B_marker[i3] < jj_row_begin_offd) |
981: { |
982: B_marker[i3] = jj_count_offd; |
983: C_offd_data[jj_count_offd] = a_entry*B_offd_data[jj3]; |
984: C_offd_j[jj_count_offd] = i3-num_cols_diag_B; |
985: jj_count_offd++; |
986: } |
987: else |
988: { |
989: C_offd_data[B_marker[i3]] += a_entry*B_offd_data[jj3]; |
| Coverage (%) | Name | Source Location | Module |
|---|---|---|---|
| ►98.64+ | __kmp_invoke_microtask | libomp.so | |
| ○ | __kmp_invoke_task_func | libomp.so | |
| ○ | __kmp_launch_thread | libomp.so | |
| ○ | __kmp_launch_worker(void*) | libomp.so | |
| ○ | start_thread | libc.so.6 | |
| ○ | thread_start | libc.so.6 | |
| ►1.36+ | __kmp_invoke_microtask | libomp.so | |
| ○ | __kmp_invoke_task_func | libomp.so | |
| ○ | __kmp_fork_call | libomp.so | |
| ○ | __kmpc_fork_call | libomp.so | |
| ○ | hypre_ParMatmul | par_csr_matop.c:999 | exec |
| ○ | hypre_BoomerAMGSetup | par_amg_setup.c:1226 | exec |
| ○ | hypre_PCGSetup | pcg.c:234 | exec |
| ○ | main | amg.c:398 | exec |
| ○ | __libc_start_call_main | libc.so.6 | |
| ○ | __libc_start_main | libc.so.6 | |
| ○ | _start | exec |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| min | med | avg | max |
|---|---|---|---|
| Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
|---|---|---|---|---|---|---|---|---|---|---|
| Value |
| Path / |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.87 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 4.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.66 |
| Bottlenecks | P10, P11, P12, |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source | par_csr_matop.c:937-940,par_csr_matop.c:946-946,par_csr_matop.c:968-970 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.67 |
| CQA cycles if no scalar integer | 2.67 |
| CQA cycles if FP arith vectorized | 7.67 |
| CQA cycles if fully vectorized | 1.92 |
| Front-end cycles | 4.63 |
| P0 cycles | 4.50 |
| P1 cycles | 4.50 |
| P2 cycles | 1.25 |
| P3 cycles | 1.25 |
| P4 cycles | 1.25 |
| P5 cycles | 1.25 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 7.67 |
| P11 cycles | 7.67 |
| P12 cycles | 7.67 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 37.00 |
| Nb uops | 37.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 25.00 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 25.00 |
| Metric | Value |
|---|---|
| CQA speedup if no scalar integer | 2.87 |
| CQA speedup if FP arith vectorized | 1.00 |
| CQA speedup if fully vectorized | 4.00 |
| CQA speedup if no inter-iteration dependency | NA |
| CQA speedup if next bottleneck killed | 1.66 |
| Bottlenecks | P10, P11, P12, |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source | par_csr_matop.c:937-940,par_csr_matop.c:946-946,par_csr_matop.c:968-970 |
| Source loop unroll info | NA |
| Source loop unroll confidence level | NA |
| Unroll/vectorization loop type | NA |
| Unroll factor | NA |
| CQA cycles | 7.67 |
| CQA cycles if no scalar integer | 2.67 |
| CQA cycles if FP arith vectorized | 7.67 |
| CQA cycles if fully vectorized | 1.92 |
| Front-end cycles | 4.63 |
| P0 cycles | 4.50 |
| P1 cycles | 4.50 |
| P2 cycles | 1.25 |
| P3 cycles | 1.25 |
| P4 cycles | 1.25 |
| P5 cycles | 1.25 |
| P6 cycles | 0.00 |
| P7 cycles | 0.00 |
| P8 cycles | 0.00 |
| P9 cycles | 0.00 |
| P10 cycles | 7.67 |
| P11 cycles | 7.67 |
| P12 cycles | 7.67 |
| P13 cycles | 0.00 |
| P14 cycles | 0.00 |
| DIV/SQRT cycles | 0.00 |
| Inter-iter dependencies cycles | NA |
| FE+BE cycles (UFS) | NA |
| Stall cycles (UFS) | NA |
| Nb insns | 37.00 |
| Nb uops | 37.00 |
| Nb loads | NA |
| Nb stores | 0.00 |
| Nb stack references | 2.00 |
| FLOP/cycle | 0.00 |
| Nb FLOP add-sub | 0.00 |
| Nb FLOP mul | 0.00 |
| Nb FLOP fma | 0.00 |
| Nb FLOP div | 0.00 |
| Nb FLOP rcp | 0.00 |
| Nb FLOP sqrt | 0.00 |
| Nb FLOP rsqrt | 0.00 |
| Bytes/cycle | 0.00 |
| Bytes prefetched | 0.00 |
| Bytes loaded | 0.00 |
| Bytes stored | 0.00 |
| Stride 0 | NA |
| Stride 1 | NA |
| Stride n | NA |
| Stride unknown | NA |
| Stride indirect | NA |
| Vectorization ratio all | 0.00 |
| Vectorization ratio load | 0.00 |
| Vectorization ratio store | NA |
| Vectorization ratio mul | NA |
| Vectorization ratio add_sub | NA |
| Vectorization ratio fma | NA |
| Vectorization ratio div_sqrt | NA |
| Vectorization ratio other | 0.00 |
| Vector-efficiency ratio all | 25.00 |
| Vector-efficiency ratio load | 25.00 |
| Vector-efficiency ratio store | NA |
| Vector-efficiency ratio mul | NA |
| Vector-efficiency ratio add_sub | NA |
| Vector-efficiency ratio fma | NA |
| Vector-efficiency ratio div_sqrt | NA |
| Vector-efficiency ratio other | 25.00 |
| Path / |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source file and lines | par_csr_matop.c:865-989 |
| Module | exec |
| nb instructions | 37 |
| nb uops | 37 |
| loop length | 148 |
| used w registers | 0 |
| used x registers | 22 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 1 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 2 |
| micro-operation queue | 4.63 cycles |
| front end | 4.63 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| cycles | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 4.63 |
| Dispatch | 7.67 |
| Overall L1 | 7.67 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B 4861f8 <hypre_ParMatmul.omp_outlined.6+0x408> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDP X9, X8, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X4, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X5, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X9, X8, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X6, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X7, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 486230 <hypre_ParMatmul.omp_outlined.6+0x440> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X15, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| ADD X3, X3, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP X3, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 486000 <hypre_ParMatmul.omp_outlined.6+0x210> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X16, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR D0, [X17, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X30, X8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR X9, [X18, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X18, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| CMP X9, X27 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 4862b4 <hypre_ParMatmul.omp_outlined.6+0x4c4> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X10, [X29, #464] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X25] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X24, [X21] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| B 48627c <hypre_ParMatmul.omp_outlined.6+0x48c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X9, [X28] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CBZ X9, 486220 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X5, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X5, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CMP X8, X23 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.GE 486220 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X9, [X29, #456] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X10, X11, [X29, #1000] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X9, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 486300 <hypre_ParMatmul.omp_outlined.6+0x510> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| B 486220 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| Function | hypre_ParMatmul.omp_outlined.6 |
| Source file and lines | par_csr_matop.c:865-989 |
| Module | exec |
| nb instructions | 37 |
| nb uops | 37 |
| loop length | 148 |
| used w registers | 0 |
| used x registers | 22 |
| used b registers | 0 |
| used h registers | 0 |
| used s registers | 1 |
| used d registers | 1 |
| used q registers | 0 |
| used v registers | 0 |
| used z registers | 0 |
| nb stack references | 2 |
| micro-operation queue | 4.63 cycles |
| front end | 4.63 cycles |
| P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| uops | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| cycles | 4.50 | 4.50 | 1.25 | 1.25 | 1.25 | 1.25 | 0.00 | 0.00 | 0.00 | 0.00 | 7.67 | 7.67 | 7.67 | 0.00 | 0.00 |
| Cycles executing div or sqrt instructions | NA |
| Front-end | 4.63 |
| Dispatch | 7.67 |
| Overall L1 | 7.67 |
| all | 0% |
| load | 0% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 0% |
| all | 25% |
| load | 25% |
| store | NA (no store vectorizable/vectorized instructions) |
| mul | NA (no mul vectorizable/vectorized instructions) |
| add-sub | NA (no add-sub vectorizable/vectorized instructions) |
| fma | NA (no fma vectorizable/vectorized instructions) |
| div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
| other | 25% |
| Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | Latency | Recip. throughput | Vectorization |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| B 4861f8 <hypre_ParMatmul.omp_outlined.6+0x408> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDP X9, X8, [SP, #64] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X4, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X5, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X9, X8, [SP, #80] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X6, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X7, [X8] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 486230 <hypre_ParMatmul.omp_outlined.6+0x440> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X15, X26,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| ADD X3, X3, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| CMP X3, X8 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 486000 <hypre_ParMatmul.omp_outlined.6+0x210> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X16, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR D0, [X17, X3,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 6 | 0.33 | scal (25.0%) |
| ADD X30, X8, #1 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.25 | N/A |
| LDR X9, [X18, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X18, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| CMP X9, X27 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | scal (25.0%) |
| B.GE 4862b4 <hypre_ParMatmul.omp_outlined.6+0x4c4> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X10, [X29, #464] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X25] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X24, [X21] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| B 48627c <hypre_ParMatmul.omp_outlined.6+0x48c> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X9, [X28] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CBZ X9, 486220 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDR X8, [X5, X8,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X23, [X5, X30,LSL #3] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| CMP X8, X23 | 1 | 0 | 0 | 0.25 | 0.25 | 0.25 | 0.25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.33 | N/A |
| B.GE 486220 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| LDUR X9, [X29, #456] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDP X10, X11, [X29, #1000] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 1 | N/A |
| LDR X9, [X9] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X10, [X10] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | N/A |
| LDR X27, [X11] | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.33 | 0.33 | 0.33 | 0 | 0 | 4 | 0.33 | scal (25.0%) |
| B 486300 <hypre_ParMatmul.omp_outlined.6+0x510> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
| B 486220 <hypre_ParMatmul.omp_outlined.6+0x430> | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.50 | N/A |
