OV - exec - Loop 3147

	Loop Id: 3147	Module: exec	Source: ams.c:3532-3534	Coverage: 0.07%

0x4ce380 ADD	$0x20,%RDI

0x4ce384 DEC	%RSI

0x4ce387 JE	4ce223

0x4ce38d MOV	-0x10(%R11,%RDI,1),%R8    [1]

0x4ce392 VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce398 JBE	4ce400

0x4ce39a MOV	-0xb0(%RBP),%R8    [4]

0x4ce3a1 VMOVSD	-0x10(%R8,%RDI,1),%XMM0    [2]

0x4ce3a8 VXORPD	%XMM4,%XMM0,%XMM0

0x4ce3ac VMOVLPD	%XMM0,-0x10(%R8,%RDI,1)    [2]

0x4ce3b3 MOV	-0x8(%R11,%RDI,1),%R8    [1]

0x4ce3b8 VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce3be JA	4ce40d

0x4ce3c0 MOV	(%R11,%RDI,1),%R8    [1]

0x4ce3c4 VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce3ca JBE	4ce432

0x4ce3cc MOV	-0xb0(%RBP),%R8    [4]

0x4ce3d3 VMOVSD	(%R8,%RDI,1),%XMM0    [6]

0x4ce3d9 VXORPD	%XMM4,%XMM0,%XMM0

0x4ce3dd VMOVLPD	%XMM0,(%R8,%RDI,1)    [6]

0x4ce3e3 MOV	0x8(%R11,%RDI,1),%R8    [1]

0x4ce3e8 VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce3ee JBE	4ce380

0x4ce3f0 JMP	4ce443

0x4ce400 MOV	-0x8(%R11,%RDI,1),%R8    [1]

0x4ce405 VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce40b JBE	4ce3c0

0x4ce40d MOV	-0xb0(%RBP),%R8    [4]

0x4ce414 VMOVSD	-0x8(%R8,%RDI,1),%XMM0    [3]

0x4ce41b VXORPD	%XMM4,%XMM0,%XMM0

0x4ce41f VMOVLPD	%XMM0,-0x8(%R8,%RDI,1)    [3]

0x4ce426 MOV	(%R11,%RDI,1),%R8    [1]

0x4ce42a VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce430 JA	4ce3cc

0x4ce432 MOV	0x8(%R11,%RDI,1),%R8    [1]

0x4ce437 VUCOMISD	(%R12,%R8,8),%XMM3    [5]

0x4ce43d JBE	4ce380

0x4ce443 MOV	-0xb0(%RBP),%R8    [4]

0x4ce44a VMOVSD	0x8(%R8,%RDI,1),%XMM0    [7]

0x4ce451 VXORPD	%XMM4,%XMM0,%XMM0

0x4ce455 VMOVLPD	%XMM0,0x8(%R8,%RDI,1)    [7]

0x4ce45c JMP	4ce380

/home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c: 3532 - 3534

--------------------------------------------------------------------------------

3532:       for (i = ns; i < ne; i++)

3533:          if (A_diag_data[A_diag_I[i]] < 0)

3534:             l1_norm[i] = -l1_norm[i];

Coverage (%)	Name	Source Location	Module
►100.00+	__kmp_invoke_microtask		libiomp5.so
○	__kmp_invoke_task_func		libiomp5.so

Path /

Metric	Value
CQA speedup if no scalar integer	1.09
CQA speedup if FP arith vectorized	1.00
CQA speedup if fully vectorized	8.00
CQA speedup if no inter-iteration dependency	NA
CQA speedup if next bottleneck killed	1.09
Bottlenecks	micro-operation queue
Function	hypre_ParCSRComputeL1NormsThreads.extracted
Source	ams.c:3532-3534
Source loop unroll info	unrolled by 4
Source loop unroll confidence level	high
Unroll/vectorization loop type	main
Unroll factor	4
CQA cycles	8.17
CQA cycles if no scalar integer	7.50
CQA cycles if FP arith vectorized	8.17
CQA cycles if fully vectorized	1.02
Front-end cycles	8.17
P0 cycles	7.50
P1 cycles	2.00
P2 cycles	7.33
P3 cycles	7.33
P4 cycles	2.00
P5 cycles	2.00
P6 cycles	7.50
P7 cycles	2.00
P8 cycles	2.00
P9 cycles	2.00
P10 cycles	0.00
P11 cycles	7.33
DIV/SQRT cycles	0.00
Inter-iter dependencies cycles	NA
FE+BE cycles (UFS)	NA
Stall cycles (UFS)	NA
Nb insns	42.00
Nb uops	49.00
Nb loads	22.00
Nb stores	4.00
Nb stack references	1.00
FLOP/cycle	0.00
Nb FLOP add-sub	0.00
Nb FLOP mul	0.00
Nb FLOP fma	0.00
Nb FLOP div	0.00
Nb FLOP rcp	0.00
Nb FLOP sqrt	0.00
Nb FLOP rsqrt	0.00
Bytes/cycle	25.47
Bytes prefetched	0.00
Bytes loaded	176.00
Bytes stored	32.00
Stride 0	NA
Stride 1	NA
Stride n	NA
Stride unknown	NA
Stride indirect	NA
Vectorization ratio all	21.05
Vectorization ratio load	0.00
Vectorization ratio store	0.00
Vectorization ratio mul	NA
Vectorization ratio add_sub	NA
Vectorization ratio fma	NA
Vectorization ratio div_sqrt	NA
Vectorization ratio other	36.36
Vector-efficiency ratio all	15.13
Vector-efficiency ratio load	12.50
Vector-efficiency ratio store	12.50
Vector-efficiency ratio mul	NA
Vector-efficiency ratio add_sub	NA
Vector-efficiency ratio fma	NA
Vector-efficiency ratio div_sqrt	NA
Vector-efficiency ratio other	17.05

Metric	Value
CQA speedup if no scalar integer	1.09
CQA speedup if FP arith vectorized	1.00
CQA speedup if fully vectorized	8.00
CQA speedup if no inter-iteration dependency	NA
CQA speedup if next bottleneck killed	1.09
Bottlenecks	micro-operation queue
Function	hypre_ParCSRComputeL1NormsThreads.extracted
Source	ams.c:3532-3534
Source loop unroll info	unrolled by 4
Source loop unroll confidence level	high
Unroll/vectorization loop type	main
Unroll factor	4
CQA cycles	8.17
CQA cycles if no scalar integer	7.50
CQA cycles if FP arith vectorized	8.17
CQA cycles if fully vectorized	1.02
Front-end cycles	8.17
P0 cycles	7.50
P1 cycles	2.00
P2 cycles	7.33
P3 cycles	7.33
P4 cycles	2.00
P5 cycles	2.00
P6 cycles	7.50
P7 cycles	2.00
P8 cycles	2.00
P9 cycles	2.00
P10 cycles	0.00
P11 cycles	7.33
DIV/SQRT cycles	0.00
Inter-iter dependencies cycles	NA
FE+BE cycles (UFS)	NA
Stall cycles (UFS)	NA
Nb insns	42.00
Nb uops	49.00
Nb loads	22.00
Nb stores	4.00
Nb stack references	1.00
FLOP/cycle	0.00
Nb FLOP add-sub	0.00
Nb FLOP mul	0.00
Nb FLOP fma	0.00
Nb FLOP div	0.00
Nb FLOP rcp	0.00
Nb FLOP sqrt	0.00
Nb FLOP rsqrt	0.00
Bytes/cycle	25.47
Bytes prefetched	0.00
Bytes loaded	176.00
Bytes stored	32.00
Stride 0	NA
Stride 1	NA
Stride n	NA
Stride unknown	NA
Stride indirect	NA
Vectorization ratio all	21.05
Vectorization ratio load	0.00
Vectorization ratio store	0.00
Vectorization ratio mul	NA
Vectorization ratio add_sub	NA
Vectorization ratio fma	NA
Vectorization ratio div_sqrt	NA
Vectorization ratio other	36.36
Vector-efficiency ratio all	15.13
Vector-efficiency ratio load	12.50
Vector-efficiency ratio store	12.50
Vector-efficiency ratio mul	NA
Vector-efficiency ratio add_sub	NA
Vector-efficiency ratio fma	NA
Vector-efficiency ratio div_sqrt	NA
Vector-efficiency ratio other	17.05

Path /

Average path: Display a virtual path defined by average values of all real paths

Function	hypre_ParCSRComputeL1NormsThreads.extracted
Source file and lines	ams.c:3532-3534
Module	exec

The loop is defined in /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3532-3534.

It is the main loop of the related source loop, which is unrolled by 4 (including vectorization).
Warnings:

Ignoring paths for analysis
Too many paths. Rerun with max-paths=16
RecMII not computed since number of paths is unknown or > max_paths
Streams not analyzed since number of paths is unknown or > max_paths

Try to simplify control and/or increase the maximum number of paths per function/loop through the 'max-paths-nb' option.

This loop has 16 execution paths.

The presence of multiple execution paths is typically the main/first bottleneck.
Try to simplify control inside the loop: ideally, remove all conditional expressions, for example by (if applicable):

- hoisting them (moving them outside the loop)
- turning them into conditional moves, MIN or MAX

Ex: if (x<0) x=0 => x = (x<0 ? 0 : x) (or MAX(0,x) after defining the corresponding macro)

gain
potential
hint
expert

Code clean check

Detected a slowdown caused by scalar integer instructions (typically used for address computation). By removing them, you can lower the cost of an iteration from 8.17 to 7.50 cycles (1.09x speedup).

Workaround

Try to reorganize arrays of structures into structures of arrays
Consider permuting loops (see vectorization gain report)

Vectorization

Your loop is probably not vectorized. Only 15% of vector register length is used (average across all SSE/AVX instructions). By vectorizing your loop, you can lower the cost of an iteration from 8.17 to 1.02 cycles (8.00x speedup).

Details

Store and arithmetic SSE/AVX instructions are used in their scalar versions (each processes only one data element per vector register). Since your execution units are vector units, only a vectorized loop can use their full power.

Workaround

Try another compiler or update/tune your current one:
- recompile with -ffast-math (included in -Ofast) to extend loop vectorization to FP reductions.
Remove inter-iterations dependences from your loop and make it unit-stride:
- If your arrays have 2 or more dimensions, check whether elements are accessed contiguously and, otherwise, try to permute loops accordingly: C storage order is row-major: for(i) for(j) a[j][i] = b[j][i]; (slow, non stride 1) => for(i) for(j) a[i][j] = b[i][j]; (fast, stride 1)
- If your loop streams arrays of structures (AoS), try to use structures of arrays instead (SoA): for(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)

Execution units bottlenecks

Found no such bottlenecks but see expert reports for more complex bottlenecks.

No data for this section

Complex instructions

Detected COMPLEX INSTRUCTIONS.

Details

These instructions generate more than one micro-operation. Only one such instruction can be decoded per cycle, and the extra micro-operations increase pressure on execution units.

VUCOMISD: 7 occurrences►
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533

Type of elements and instruction set

7 SSE or AVX instructions are processing arithmetic or math operations on double precision FP elements in scalar mode (one at a time).

Matching between your loop (in the source code) and the binary loop

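The binary loop does not contain any FP arithmetic operations (the negation in the source appears to be implemented with VXORPD, a bitwise XOR). The binary loop loads 176 bytes, i.e. 22 eight-byte loads; note that only 11 of these are double-precision FP elements (7 VUCOMISD memory operands and 4 VMOVSD loads), while the other 11 are 64-bit integer loads (7 index loads through %R11 and 4 reloads of a base address from the stack slot -0xb0(%RBP)). The binary loop stores 32 bytes (4 double-precision FP elements).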

General properties

nb instructions	42
nb uops	49
loop length	211
used x86 registers	6
used mmx registers	0
used xmm registers	3
used ymm registers	0
used zmm registers	0
nb stack references	1

Front-end

MACRO FUSION NOT POSSIBLE; FIT IN UOP CACHE

micro-operation queue	8.17 cycles
front end	8.17 cycles

Back-end

	P0	P1	P2	P3	P4	P5	P6	P7	P8	P9	P10	P11
uops	7.50	2.00	7.33	7.33	2.00	2.00	7.50	2.00	2.00	2.00	0.00	7.33
cycles	7.50	2.00	7.33	7.33	2.00	2.00	7.50	2.00	2.00	2.00	0.00	7.33

Execution ports to units layout:

P0 (256 bits): VPU, BRU, ALU, DIV/SQRT
P1 (256 bits): ALU, VPU
P2 (512 bits): load
P3 (512 bits): load
P4 (512 bits): store data
P5 (512 bits): ALU, VPU
P6: ALU, BRU
P7: store address
P8: store address
P9 (512 bits): store data
P10: ALU
P11 (512 bits): load

Cycles executing div or sqrt instructions

Cycles summary

Front-end	8.17
Dispatch	7.50
Overall L1	8.17

Vectorization ratios

all	21%
load	0%
store	0%
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	36%

Vector efficiency ratios

all	15%
load	12%
store	12%
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	17%

Cycles and memory resources usage

Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.17 cycles. At this rate:

11% of peak load performance is reached (21.55 out of 192.00 bytes loaded per cycle (GB/s @ 1GHz))
3% of peak store performance is reached (3.92 out of 128.00 bytes stored per cycle (GB/s @ 1GHz))

Front-end bottlenecks

Performance is limited by instruction throughput (loading/decoding program instructions to execution core) (front-end is a bottleneck). By removing all these bottlenecks, you can lower the cost of an iteration from 8.17 to 7.50 cycles (1.09x speedup).

ASM code

In the binary file, the address of the loop is: 4ce38d

Instruction	Nb FU	P0	P1	P2	P3	P4	P5	P6	P7	P8	P9	P11	Latency	Recip. throughput	Vectorization
ADD $0x20,%RDI	1	0	0	0	0	0	0	0	0	0	0	0	1	0.17	N/A
DEC %RSI	1	0	0	0	0	0	0	0	0	0	0	0	1	0.17	N/A
JE 4ce223 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2b73>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0x10(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce400 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d50>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD -0x10(%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,-0x10(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
MOV -0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JA 4ce40d <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d5d>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV (%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce432 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d82>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD (%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
MOV 0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce380 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2cd0>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
JMP 4ce443 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d93>	1	0	0	0	0	0	0	0	0	0	0	0	0	5.84	N/A
MOV -0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce3c0 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d10>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD -0x8(%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,-0x8(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
MOV (%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JA 4ce3cc <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d1c>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV 0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce380 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2cd0>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD 0x8(%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,0x8(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
JMP 4ce380 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2cd0>	1	0	0	0	0	0	0	0	0	0	0	0	0	2.08	N/A

Function	hypre_ParCSRComputeL1NormsThreads.extracted
Source file and lines	ams.c:3532-3534
Module	exec

Ignoring paths for analysis
Too many paths. Rerun with max-paths=16
RecMII not computed since number of paths is unknown or > max_paths
Streams not analyzed since number of paths is unknown or > max_paths

hoisting them (moving them outside the loop)
turning them into conditional moves, MIN or MAX

Ex: if (x<0) x=0 => x = (x<0 ? 0 : x) (or MAX(0,x) after defining the corresponding macro)

gain
potential
hint
expert

Code clean check

Detected a slowdown caused by scalar integer instructions (typically used for address computation). By removing them, you can lower the cost of an iteration from 8.17 to 7.50 cycles (1.09x speedup).

Workaround

Try to reorganize arrays of structures into structures of arrays
Consider permuting loops (see vectorization gain report)

Vectorization

Details

Workaround

Try another compiler or update/tune your current one:
- recompile with -ffast-math (included in -Ofast) to extend loop vectorization to FP reductions.
Remove inter-iterations dependences from your loop and make it unit-stride:
- If your arrays have 2 or more dimensions, check whether elements are accessed contiguously and, otherwise, try to permute loops accordingly: C storage order is row-major: for(i) for(j) a[j][i] = b[j][i]; (slow, non stride 1) => for(i) for(j) a[i][j] = b[i][j]; (fast, stride 1)
- If your loop streams arrays of structures (AoS), try to use structures of arrays instead (SoA): for(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)

Execution units bottlenecks

Found no such bottlenecks but see expert reports for more complex bottlenecks.

No data for this section

Complex instructions

Detected COMPLEX INSTRUCTIONS.

Details

These instructions generate more than one micro-operation. Only one such instruction can be decoded per cycle, and the extra micro-operations increase pressure on execution units.

VUCOMISD: 7 occurrences►
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533
- /home/eoseret/qaas_runs_GNR/173-927-0874/intel/AMG/build/AMG/AMG/parcsr_ls/ams.c:3533

Type of elements and instruction set

7 SSE or AVX instructions are processing arithmetic or math operations on double precision FP elements in scalar mode (one at a time).

Matching between your loop (in the source code) and the binary loop

General properties

nb instructions	42
nb uops	49
loop length	211
used x86 registers	6
used mmx registers	0
used xmm registers	3
used ymm registers	0
used zmm registers	0
nb stack references	1

Front-end

MACRO FUSION NOT POSSIBLE; FIT IN UOP CACHE

micro-operation queue	8.17 cycles
front end	8.17 cycles

Back-end

	P0	P1	P2	P3	P4	P5	P6	P7	P8	P9	P10	P11
uops	7.50	2.00	7.33	7.33	2.00	2.00	7.50	2.00	2.00	2.00	0.00	7.33
cycles	7.50	2.00	7.33	7.33	2.00	2.00	7.50	2.00	2.00	2.00	0.00	7.33

Execution ports to units layout:

P0 (256 bits): VPU, BRU, ALU, DIV/SQRT
P1 (256 bits): ALU, VPU
P2 (512 bits): load
P3 (512 bits): load
P4 (512 bits): store data
P5 (512 bits): ALU, VPU
P6: ALU, BRU
P7: store address
P8: store address
P9 (512 bits): store data
P10: ALU
P11 (512 bits): load

Cycles executing div or sqrt instructions

Cycles summary

Front-end	8.17
Dispatch	7.50
Overall L1	8.17

Vectorization ratios

all	21%
load	0%
store	0%
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	36%

Vector efficiency ratios

all	15%
load	12%
store	12%
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	17%

Cycles and memory resources usage

Assuming all data fit into the L1 cache, each iteration of the binary loop takes 8.17 cycles. At this rate:

11% of peak load performance is reached (21.55 out of 192.00 bytes loaded per cycle (GB/s @ 1GHz))
3% of peak store performance is reached (3.92 out of 128.00 bytes stored per cycle (GB/s @ 1GHz))

Front-end bottlenecks

ASM code

In the binary file, the address of the loop is: 4ce38d

Instruction	Nb FU	P0	P1	P2	P3	P4	P5	P6	P7	P8	P9	P11	Latency	Recip. throughput	Vectorization
ADD $0x20,%RDI	1	0	0	0	0	0	0	0	0	0	0	0	1	0.17	N/A
DEC %RSI	1	0	0	0	0	0	0	0	0	0	0	0	1	0.17	N/A
JE 4ce223 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2b73>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0x10(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce400 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d50>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD -0x10(%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,-0x10(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
MOV -0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JA 4ce40d <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d5d>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV (%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce432 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d82>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD (%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
MOV 0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce380 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2cd0>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
JMP 4ce443 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d93>	1	0	0	0	0	0	0	0	0	0	0	0	0	5.84	N/A
MOV -0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce3c0 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d10>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD -0x8(%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,-0x8(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
MOV (%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JA 4ce3cc <hypre_ParCSRComputeL1NormsThreads.extracted+0x2d1c>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV 0x8(%R11,%RDI,1),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VUCOMISD (%R12,%R8,8),%XMM3	2	1	0	0.33	0.33	0	0	0	0	0	0	0.33	3	1	scal (12.5%)
JBE 4ce380 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2cd0>	1	0.50	0	0	0	0	0	0.50	0	0	0	0	0	0.50	N/A
MOV -0xb0(%RBP),%R8	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	N/A
VMOVSD 0x8(%R8,%RDI,1),%XMM0	1	0	0	0.33	0.33	0	0	0	0	0	0	0.33	1	0.33	scal (12.5%)
VXORPD %XMM4,%XMM0,%XMM0	1	0.33	0.33	0	0	0	0.33	0	0	0	0	0	0-1	0.33	vect (25.0%)
VMOVLPD %XMM0,0x8(%R8,%RDI,1)	1	0	0	0	0	0.50	0	0	0.50	0.50	0.50	0	4-12	0.50	scal (12.5%)
JMP 4ce380 <hypre_ParCSRComputeL1NormsThreads.extracted+0x2cd0>	1	0	0	0	0	0	0	0	0	0	0	0	0	2.08	N/A

Report Configuration

Code clean check

Workaround

Vectorization

Details

Workaround

Execution units bottlenecks

Complex instructions

Details

Type of elements and instruction set

Matching between your loop (in the source code) and the binary loop

General properties

Front-end

Back-end

Cycles summary

Vectorization ratios

Vector efficiency ratios

Cycles and memory resources usage

Front-end bottlenecks

ASM code

Code clean check

Workaround

Vectorization

Details

Workaround

Execution units bottlenecks

Complex instructions

Details

Type of elements and instruction set

Matching between your loop (in the source code) and the binary loop

General properties

Front-end

Back-end

Cycles summary

Vectorization ratios

Vector efficiency ratios

Cycles and memory resources usage

Front-end bottlenecks

ASM code