Loop Id: 1153 | Module: bench | Source: t2fv_4.c:97-114 [...] | Coverage: 34.22% |
---|
0x104da0 MOV 0x18(%RCX),%RSI [3] |
0x104da4 VMOVAPD (%RAX),%XMM5 [2] |
0x104da8 INC %R8 |
0x104dab ADD $0x60,%RDX |
0x104daf VMOVAPD (%RAX,%RSI,8),%XMM0 [2] |
0x104db4 MOV 0x10(%RCX),%RSI [3] |
0x104db8 VPERMILPD $0x1,%XMM0,%XMM3 |
0x104dbe VMOVAPD (%RAX,%RSI,8),%XMM1 [2] |
0x104dc3 MOV 0x8(%RCX),%RSI [3] |
0x104dc7 VMULPD -0x20(%RDX),%XMM0,%XMM0 [4] |
0x104dcc LEA (%RAX,%RSI,8),%RSI |
0x104dd0 VMOVAPD (%RSI),%XMM2 [1] |
0x104dd4 VFNMADD132PD -0x10(%RDX),%XMM0,%XMM3 [4] |
0x104dda VPERMILPD $0x1,%XMM1,%XMM0 |
0x104de0 VMULPD -0x40(%RDX),%XMM1,%XMM1 [4] |
0x104de5 VFNMADD132PD -0x30(%RDX),%XMM1,%XMM0 [4] |
0x104deb VPERMILPD $0x1,%XMM2,%XMM1 |
0x104df1 VMULPD -0x60(%RDX),%XMM2,%XMM2 [4] |
0x104df6 VSUBPD %XMM0,%XMM5,%XMM4 |
0x104dfa VADDPD %XMM5,%XMM0,%XMM0 |
0x104dfe VFNMADD132PD -0x50(%RDX),%XMM2,%XMM1 [4] |
0x104e04 VSUBPD %XMM3,%XMM1,%XMM2 |
0x104e08 VADDPD %XMM3,%XMM1,%XMM1 |
0x104e0c VXORPD %XMM2,%XMM7,%XMM2 |
0x104e10 VPERMILPD $0x1,%XMM2,%XMM2 |
0x104e16 VSUBPD %XMM2,%XMM4,%XMM6 |
0x104e1a VADDPD %XMM2,%XMM4,%XMM4 |
0x104e1e VSUBPD %XMM1,%XMM0,%XMM2 |
0x104e22 VADDPD %XMM1,%XMM0,%XMM0 |
0x104e26 VMOVAPD %XMM6,(%RSI) [1] |
0x104e2a MOV 0x18(%RCX),%RSI [3] |
0x104e2e VMOVAPD %XMM4,(%RAX,%RSI,8) [2] |
0x104e33 MOV 0x10(%RCX),%RSI [3] |
0x104e37 ADD %RDI,%RCX |
0x104e3a VMOVAPD %XMM2,(%RAX,%RSI,8) [2] |
0x104e3f VMOVAPD %XMM0,(%RAX) [2] |
0x104e43 ADD %R10,%RAX |
0x104e46 CMP %R8,%R9 |
0x104e49 JNE 104da0 |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 251 - 251 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 263 - 263 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 275 - 275 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 355 - 355 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 953 - 953 |
-------------------------------------------------------------------------------- |
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../../../simd-support/simd-sse2.h: 113 - 120 |
-------------------------------------------------------------------------------- |
113: return *(const V *)x; |
[...] |
120: *(V *)x = v; |
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../common/t2fv_4.c: 97 - 114 |
-------------------------------------------------------------------------------- |
97: for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) { |
98: V T1, T8, T3, T6, T7, T2, T5; |
99: T1 = LD(&(x[0]), ms, &(x[0])); |
100: T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); |
101: T8 = BYTWJ(&(W[TWVL * 4]), T7); |
102: T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0])); |
103: T3 = BYTWJ(&(W[TWVL * 2]), T2); |
104: T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); |
[...] |
111: ST(&(x[WS(rs, 3)]), VADD(T4, T9), ms, &(x[WS(rs, 1)])); |
112: Ta = VADD(T1, T3); |
113: Tb = VADD(T6, T8); |
114: ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0])); |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►99.78+ | apply#0x120c50 | dftw-direct.c:51 | bench |
○ | doit | fftw-bench.c:274 | bench |
○ | speed | | bench |
○ | bench_main | | bench |
○ | __libc_init_first | libc.so.6 | |
○ | __libc_start_main | libc.so.6 | |
○ | _start | hook.c:185 | bench |
Path / |
Metric | Value |
---|---|
CQA speedup if no scalar integer | 1.31 |
CQA speedup if FP arith vectorized | 1.69 |
CQA speedup if fully vectorized | 4.00 |
CQA speedup if no inter-iteration dependency | NA |
CQA speedup if next bottleneck killed | 1.27 |
Bottlenecks | micro-operation queue |
Function | t2fv_4 |
Source | emmintrin.h:251-251,emmintrin.h:263-263,emmintrin.h:275-275,emmintrin.h:355-355,emmintrin.h:953-953,simd-sse2.h:113-113,simd-sse2.h:120-120,t2fv_4.c:97-104,t2fv_4.c:111-114 |
Source loop unroll info | not unrolled or unrolled with no peel/tail loop |
Source loop unroll confidence level | max |
Unroll/vectorization loop type | NA |
Unroll factor | NA |
CQA cycles | 9.50 |
CQA cycles if no scalar integer | 7.25 |
CQA cycles if FP arith vectorized | 5.63 |
CQA cycles if fully vectorized | 2.38 |
Front-end cycles | 9.50 |
P0 cycles | 7.50 |
P1 cycles | 7.50 |
P2 cycles | 7.50 |
P3 cycles | 7.50 |
P4 cycles | 4.00 |
P5 cycles | 5.00 |
P6 cycles | 5.00 |
P7 cycles | 4.00 |
DIV/SQRT cycles | 0.00 |
Inter-iter dependencies cycles | 1 |
FE+BE cycles (UFS) | NA |
Stall cycles (UFS) | NA |
Nb insns | 39.00 |
Nb uops | 38.00 |
Nb loads | 15.00 |
Nb stores | 4.00 |
Nb stack references | 0.00 |
FLOP/cycle | 3.58 |
Nb FLOP add-sub | 16.00 |
Nb FLOP mul | 6.00 |
Nb FLOP fma | 6.00 |
Nb FLOP div | 0.00 |
Nb FLOP rcp | 0.00 |
Nb FLOP sqrt | 0.00 |
Nb FLOP rsqrt | 0.00 |
Bytes/cycle | 27.79 |
Bytes prefetched | 0.00 |
Bytes loaded | 200.00 |
Bytes stored | 64.00 |
Stride 0 | 0.00 |
Stride 1 | 1.00 |
Stride n | 0.00 |
Stride unknown | 3.00 |
Stride indirect | 0.00 |
Vectorization ratio all | 100.00 |
Vectorization ratio load | 100.00 |
Vectorization ratio store | 100.00 |
Vectorization ratio mul | 100.00 |
Vectorization ratio add_sub | 100.00 |
Vectorization ratio fma | 100.00 |
Vectorization ratio div_sqrt | NA |
Vectorization ratio other | 100.00 |
Vector-efficiency ratio all | 25.00 |
Vector-efficiency ratio load | 25.00 |
Vector-efficiency ratio store | 25.00 |
Vector-efficiency ratio mul | 25.00 |
Vector-efficiency ratio add_sub | 25.00 |
Vector-efficiency ratio fma | 25.00 |
Vector-efficiency ratio div_sqrt | NA |
Vector-efficiency ratio other | 25.00 |
Path / |
Function | t2fv_4 |
Source file and lines | t2fv_4.c:97-114 |
Module | bench |
nb instructions | 39 |
nb uops | 38 |
loop length | 175 |
used x86 registers | 8 |
used mmx registers | 0 |
used xmm registers | 8 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 0 |
ADD-SUB / MUL ratio | 2.67 |
micro-operation queue | 9.50 cycles |
front end | 9.50 cycles |
 | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 |
---|---|---|---|---|---|---|---|---|
uops | 7.50 | 7.50 | 7.50 | 7.50 | 4.00 | 5.00 | 5.00 | 4.00 |
cycles | 7.50 | 7.50 | 7.50 | 7.50 | 4.00 | 5.00 | 5.00 | 4.00 |
Cycles executing div or sqrt instructions | NA |
Longest recurrence chain latency (RecMII) | 1.00 |
Front-end | 9.50 |
Dispatch | 7.50 |
Data deps. | 1.00 |
Overall L1 | 9.50 |
Vectorization ratios
all | 100%
load | 100%
store | 100%
mul | 100%
add-sub | 100%
fma | 100%
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions)
other | 100%
Vector-efficiency ratios
all | 25%
load | 25%
store | 25%
mul | 25%
add-sub | 25%
fma | 25%
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions)
other | 25%
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
---|---|---|---|---|---|---|---|---|---|---|---|---|
MOV 0x18(%RCX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
VMOVAPD (%RAX),%XMM5 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
INC %R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
ADD $0x60,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
VMOVAPD (%RAX,%RSI,8),%XMM0 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
MOV 0x10(%RCX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
VPERMILPD $0x1,%XMM0,%XMM3 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
VMOVAPD (%RAX,%RSI,8),%XMM1 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
MOV 0x8(%RCX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
VMULPD -0x20(%RDX),%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
LEA (%RAX,%RSI,8),%RSI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
VMOVAPD (%RSI),%XMM2 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
VFNMADD132PD -0x10(%RDX),%XMM0,%XMM3 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VPERMILPD $0x1,%XMM1,%XMM0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
VMULPD -0x40(%RDX),%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VFNMADD132PD -0x30(%RDX),%XMM1,%XMM0 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VPERMILPD $0x1,%XMM2,%XMM1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
VMULPD -0x60(%RDX),%XMM2,%XMM2 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VSUBPD %XMM0,%XMM5,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VADDPD %XMM5,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VFNMADD132PD -0x50(%RDX),%XMM2,%XMM1 | 1 | 0.50 | 0.50 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VSUBPD %XMM3,%XMM1,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VADDPD %XMM3,%XMM1,%XMM1 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VXORPD %XMM2,%XMM7,%XMM2 | 1 | 0.33 | 0.33 | 0 | 0 | 0 | 0.33 | 0 | 0 | 1 | 0.33 | vect (25.0%) |
VPERMILPD $0x1,%XMM2,%XMM2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | vect (25.0%) |
VSUBPD %XMM2,%XMM4,%XMM6 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VADDPD %XMM2,%XMM4,%XMM4 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VSUBPD %XMM1,%XMM0,%XMM2 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VADDPD %XMM1,%XMM0,%XMM0 | 1 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0.50 | vect (25.0%) |
VMOVAPD %XMM6,(%RSI) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
MOV 0x18(%RCX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
VMOVAPD %XMM4,(%RAX,%RSI,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
MOV 0x10(%RCX),%RSI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
ADD %RDI,%RCX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
VMOVAPD %XMM2,(%RAX,%RSI,8) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
VMOVAPD %XMM0,(%RAX) | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | vect (25.0%) |
ADD %R10,%RAX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
CMP %R8,%R9 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
JNE 104da0 <t2fv_4+0x40> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |