OV - FFTW GCC-SSE-128 - Function t2fv

	Function: t2fv_4	Module: bench	Source: t2fv_4.c:92-120 [...]	Coverage (incl. loops): 34.46% | (excl. loops): 0.24%

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 251 - 251

--------------------------------------------------------------------------------

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 263 - 263

--------------------------------------------------------------------------------

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 275 - 275

--------------------------------------------------------------------------------

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 355 - 355

--------------------------------------------------------------------------------

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 953 - 953

--------------------------------------------------------------------------------

/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../../../simd-support/simd-sse2.h: 113 - 120

--------------------------------------------------------------------------------

113:      return *(const V *)x;

[...]

120:      *(V *)x = v;

/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../common/t2fv_4.c: 92 - 120

--------------------------------------------------------------------------------

92: {

93:      {

94: 	  INT m;

95: 	  R *x;

96: 	  x = ri;

97: 	  for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {

98: 	       V T1, T8, T3, T6, T7, T2, T5;

99: 	       T1 = LD(&(x[0]), ms, &(x[0]));

100: 	       T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));

101: 	       T8 = BYTWJ(&(W[TWVL * 4]), T7);

102: 	       T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));

103: 	       T3 = BYTWJ(&(W[TWVL * 2]), T2);

104: 	       T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));

[...]

111: 		    ST(&(x[WS(rs, 3)]), VADD(T4, T9), ms, &(x[WS(rs, 1)]));

112: 		    Ta = VADD(T1, T3);

113: 		    Tb = VADD(T6, T8);

114: 		    ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0]));

[...]

120: }

0x104d60 LEA	(%R8,%R8,2),%RSI

0x104d64 SAL	$0x5,%RSI

0x104d68 ADD	%RSI,%RDX

0x104d6b CMP	%R9,%R8

0x104d6e JGE	104e58

0x104d74 PUSH	%RBP

0x104d75 MOV	%RDI,%RAX

0x104d78 VMOVAPD	0x4c180(%RIP),%XMM7

0x104d80 MOV	%RSP,%RBP

0x104d83 MOV	0x10(%RBP),%RDI

0x104d87 LEA	(,%RDI,8),%R10

0x104d8f MOV	0x5004a(%RIP),%RDI

0x104d96 SAL	$0x3,%RDI

0x104d9a NOPW	(%RAX,%RAX,1)

(1153) 0x104da0 MOV	0x18(%RCX),%RSI

(1153) 0x104da4 VMOVAPD	(%RAX),%XMM5

(1153) 0x104da8 INC	%R8

(1153) 0x104dab ADD	$0x60,%RDX

(1153) 0x104daf VMOVAPD	(%RAX,%RSI,8),%XMM0

(1153) 0x104db4 MOV	0x10(%RCX),%RSI

(1153) 0x104db8 VPERMILPD	$0x1,%XMM0,%XMM3

(1153) 0x104dbe VMOVAPD	(%RAX,%RSI,8),%XMM1

(1153) 0x104dc3 MOV	0x8(%RCX),%RSI

(1153) 0x104dc7 VMULPD	-0x20(%RDX),%XMM0,%XMM0

(1153) 0x104dcc LEA	(%RAX,%RSI,8),%RSI

(1153) 0x104dd0 VMOVAPD	(%RSI),%XMM2

(1153) 0x104dd4 VFNMADD132PD	-0x10(%RDX),%XMM0,%XMM3

(1153) 0x104dda VPERMILPD	$0x1,%XMM1,%XMM0

(1153) 0x104de0 VMULPD	-0x40(%RDX),%XMM1,%XMM1

(1153) 0x104de5 VFNMADD132PD	-0x30(%RDX),%XMM1,%XMM0

(1153) 0x104deb VPERMILPD	$0x1,%XMM2,%XMM1

(1153) 0x104df1 VMULPD	-0x60(%RDX),%XMM2,%XMM2

(1153) 0x104df6 VSUBPD	%XMM0,%XMM5,%XMM4

(1153) 0x104dfa VADDPD	%XMM5,%XMM0,%XMM0

(1153) 0x104dfe VFNMADD132PD	-0x50(%RDX),%XMM2,%XMM1

(1153) 0x104e04 VSUBPD	%XMM3,%XMM1,%XMM2

(1153) 0x104e08 VADDPD	%XMM3,%XMM1,%XMM1

(1153) 0x104e0c VXORPD	%XMM2,%XMM7,%XMM2

(1153) 0x104e10 VPERMILPD	$0x1,%XMM2,%XMM2

(1153) 0x104e16 VSUBPD	%XMM2,%XMM4,%XMM6

(1153) 0x104e1a VADDPD	%XMM2,%XMM4,%XMM4

(1153) 0x104e1e VSUBPD	%XMM1,%XMM0,%XMM2

(1153) 0x104e22 VADDPD	%XMM1,%XMM0,%XMM0

(1153) 0x104e26 VMOVAPD	%XMM6,(%RSI)

(1153) 0x104e2a MOV	0x18(%RCX),%RSI

(1153) 0x104e2e VMOVAPD	%XMM4,(%RAX,%RSI,8)

(1153) 0x104e33 MOV	0x10(%RCX),%RSI

(1153) 0x104e37 ADD	%RDI,%RCX

(1153) 0x104e3a VMOVAPD	%XMM2,(%RAX,%RSI,8)

(1153) 0x104e3f VMOVAPD	%XMM0,(%RAX)

(1153) 0x104e43 ADD	%R10,%RAX

(1153) 0x104e46 CMP	%R8,%R9

(1153) 0x104e49 JNE	104da0

0x104e4f POP	%RBP

0x104e50 RET

0x104e51 NOPL	(%RAX)

0x104e58 RET

0x104e59 NOPL	(%RAX)

Coverage (%)	Name	Source Location	Module
►99.79+	apply#0x120c50	dftw-direct.c:51	bench
○	doit	fftw-bench.c:274	bench
○	speed		bench
○	bench_main		bench
○	__libc_init_first		libc.so.6
○	__libc_start_main		libc.so.6
○	_start	hook.c:185	bench

min	med	avg	max

Percentile Index	10	20	30	40	50	60	70	80	90	100
Value

min	med	avg	max

Percentile Index	10	20	30	40	50	60	70	80	90	100
Value

Path /

Average path: Display a virtual path defined by average values of all real paths

The code analyzed by CQA in that panel excludes loops and represents 0.24% of application time for run run_0

Source file and lines	t2fv_4.c:92-120
Module	bench

The function is defined in:

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h:251,263,275,355,953
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../../../simd-support/simd-sse2.h:113,120
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../common/t2fv_4.c:92-104,111-114,120

Warnings:
get_cqa_results:

Ignoring paths for analysis

gain
potential
hint
expert

Code clean check

Detected a slowdown caused by scalar integer instructions (typically used for address computation). By removing them, you can lower the cost of an iteration from 4.75 to 0.75 cycles (6.33x speedup).

Workaround

Try to reorganize arrays of structures to structures of arrays
Consider permuting loops (see vectorization gain report)

Vectorization

Your function is poorly vectorized. Only 16% of vector register length is used (average across all SSE/AVX instructions). By fully vectorizing your function, you can lower the cost of an iteration from 4.75 to 0.62 cycles (7.60x speedup).

Details

33% of SSE/AVX instructions are used in vector version (process two or more data elements in vector registers):

0% of SSE/AVX addition or subtraction instructions are used in vector version.
0% of SSE/AVX instructions that are not load, store, addition, subtraction nor multiply instructions are used in vector version.

Since your execution units are vector units, only a fully vectorized function can use their full power.

Workaround

Try another compiler or update/tune your current one
Make array accesses unit-stride:
- If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA): for(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)

Execution units bottlenecks

Found no such bottlenecks but see expert reports for more complex bottlenecks.

No data for this section

Type of elements and instruction set

No instructions are processing arithmetic or math operations on FP elements. This function is probably writing/copying data or processing integer elements.

Matching between your function (in the source code) and the binary function

The binary function does not contain any FP arithmetical operations. The binary function is loading 32 bytes.

General properties

nb instructions	19
nb uops	19
loop length	81
used x86 registers	9
used mmx registers	0
used xmm registers	1
used ymm registers	0
used zmm registers	0
nb stack references	1

Front-end

MACRO FUSION NOT POSSIBLE

FIT IN UOP CACHE

micro-operation queue	4.75 cycles
front end	4.75 cycles

Back-end

	P0	P1	P2	P3	P4	P5	P6	P7
uops	2.50	2.00	2.33	2.33	1.00	2.00	2.50	2.33
cycles	2.50	2.00	2.33	2.33	1.00	2.00	2.50	2.33

Execution ports to units layout:

P0 (256 bits): VPU, BRU, ALU, DIV/SQRT
P1 (256 bits): ALU, VPU
P2 (512 bits): store address, load
P3 (512 bits): store address, load
P4 (512 bits): store data
P5 (512 bits): ALU, VPU
P6: ALU, BRU
P7: store address

Cycles executing div or sqrt instructions

Cycles summary

Front-end	4.75
Dispatch	2.50
Overall L1	4.75

Vectorization ratios

INT

all	0%
load	NA (no load vectorizable/vectorized instructions)
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	0%
fma	NA (no fma vectorizable/vectorized instructions)
other	0%

FP

all	100%
load	100%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	NA (no other vectorizable/vectorized instructions)

INT+FP

all	33%
load	100%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	0%
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	0%

Vector efficiency ratios

INT

all	12%
load	NA (no load vectorizable/vectorized instructions)
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	12%
fma	NA (no fma vectorizable/vectorized instructions)
other	12%

FP

all	25%
load	25%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	NA (no other vectorizable/vectorized instructions)

INT+FP

all	16%
load	25%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	12%
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	12%

Cycles and memory resources usage

Assuming all data fit into the L1 cache, each call to the function takes 4.75 cycles. At this rate:

5% of peak load performance is reached (6.74 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))

Front-end bottlenecks

Performance is limited by instruction throughput (loading/decoding program instructions to execution core) (front-end is a bottleneck). By removing all these bottlenecks, you can lower the cost of an iteration from 4.75 to 2.50 cycles (1.90x speedup).

ASM code

In the binary file, the address of the function is: 104d60

Instruction	Nb FU	P0	P1	P2	P3	P4	P5	P6	P7	Latency	Recip. throughput	Vectorization
LEA (%R8,%R8,2),%RSI	1	0	0.50	0	0	0	0.50	0	0	1	0.50	N/A
SAL $0x5,%RSI	1	0.50	0	0	0	0	0	0.50	0	1	0.50	scal (12.5%)
ADD %RSI,%RDX	1	0.25	0.25	0	0	0	0.25	0.25	0	1	0.25	scal (12.5%)
CMP %R9,%R8	1	0.25	0.25	0	0	0	0.25	0.25	0	1	0.25	N/A
JGE 104e58 <t2fv_4+0xf8>	1	0.50	0	0	0	0	0	0.50	0	0	0.50-1	N/A
PUSH %RBP	1	0	0	0.33	0.33	1	0	0	0.33	3	1	N/A
MOV %RDI,%RAX	1	0	0	0	0	0	0	0	0	0	0.25	N/A
VMOVAPD 0x4c180(%RIP),%XMM7	1	0	0	0.50	0.50	0	0	0	0	4-5	0.50	vect (25.0%)
MOV %RSP,%RBP	1	0	0	0	0	0	0	0	0	0	0.25	N/A
MOV 0x10(%RBP),%RDI	1	0	0	0.50	0.50	0	0	0	0	4-5	0.50	N/A
LEA (,%RDI,8),%R10	1	0	0.50	0	0	0	0.50	0	0	1	0.50	N/A
MOV 0x5004a(%RIP),%RDI	1	0	0	0.50	0.50	0	0	0	0	4-5	0.50	N/A
SAL $0x3,%RDI	1	0.50	0	0	0	0	0	0.50	0	1	0.50	N/A
NOPW (%RAX,%RAX,1)	1	0	0	0	0	0	0	0	0	0	0.25	N/A
POP %RBP	1	0	0	0.50	0.50	0	0	0	0	2	0.50	N/A
RET	1	0	0	0.33	0.33	0	0	1	0.33	0	1	N/A
NOPL (%RAX)	1	0	0	0	0	0	0	0	0	0	0.25	N/A
RET	1	0	0	0.33	0.33	0	0	1	0.33	0	1	N/A
NOPL (%RAX)	1	0	0	0	0	0	0	0	0	0	0.25	N/A

The code analyzed by CQA in that panel excludes loops and represents 0.24% of application time for run run_0

Source file and lines	t2fv_4.c:92-120
Module	bench

The function is defined in:

/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h:251,263,275,355,953
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../../../simd-support/simd-sse2.h:113,120
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../common/t2fv_4.c:92-104,111-114,120

Warnings:
get_cqa_results:

Ignoring paths for analysis

gain
potential
hint
expert

Code clean check

Detected a slowdown caused by scalar integer instructions (typically used for address computation). By removing them, you can lower the cost of an iteration from 4.75 to 0.75 cycles (6.33x speedup).

Workaround

Try to reorganize arrays of structures to structures of arrays
Consider permuting loops (see vectorization gain report)

Vectorization

Details

33% of SSE/AVX instructions are used in vector version (process two or more data elements in vector registers):

0% of SSE/AVX addition or subtraction instructions are used in vector version.
0% of SSE/AVX instructions that are not load, store, addition, subtraction nor multiply instructions are used in vector version.

Since your execution units are vector units, only a fully vectorized function can use their full power.

Workaround

Try another compiler or update/tune your current one
Make array accesses unit-stride:
- If your function streams arrays of structures (AoS), try to use structures of arrays instead (SoA): for(i) a[i].x = b[i].x; (slow, non stride 1) => for(i) a.x[i] = b.x[i]; (fast, stride 1)

Execution units bottlenecks

Found no such bottlenecks but see expert reports for more complex bottlenecks.

No data for this section

Type of elements and instruction set

No instructions are processing arithmetic or math operations on FP elements. This function is probably writing/copying data or processing integer elements.

Matching between your function (in the source code) and the binary function

The binary function does not contain any FP arithmetical operations. The binary function is loading 32 bytes.

General properties

nb instructions	19
nb uops	19
loop length	81
used x86 registers	9
used mmx registers	0
used xmm registers	1
used ymm registers	0
used zmm registers	0
nb stack references	1

Front-end

MACRO FUSION NOT POSSIBLE

FIT IN UOP CACHE

micro-operation queue	4.75 cycles
front end	4.75 cycles

Back-end

	P0	P1	P2	P3	P4	P5	P6	P7
uops	2.50	2.00	2.33	2.33	1.00	2.00	2.50	2.33
cycles	2.50	2.00	2.33	2.33	1.00	2.00	2.50	2.33

Execution ports to units layout:

P0 (256 bits): VPU, BRU, ALU, DIV/SQRT
P1 (256 bits): ALU, VPU
P2 (512 bits): store address, load
P3 (512 bits): store address, load
P4 (512 bits): store data
P5 (512 bits): ALU, VPU
P6: ALU, BRU
P7: store address

Cycles executing div or sqrt instructions

Cycles summary

Front-end	4.75
Dispatch	2.50
Overall L1	4.75

Vectorization ratios

INT

all	0%
load	NA (no load vectorizable/vectorized instructions)
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	0%
fma	NA (no fma vectorizable/vectorized instructions)
other	0%

FP

all	100%
load	100%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	NA (no other vectorizable/vectorized instructions)

INT+FP

all	33%
load	100%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	0%
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	0%

Vector efficiency ratios

INT

all	12%
load	NA (no load vectorizable/vectorized instructions)
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	12%
fma	NA (no fma vectorizable/vectorized instructions)
other	12%

FP

all	25%
load	25%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	NA (no add-sub vectorizable/vectorized instructions)
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	NA (no other vectorizable/vectorized instructions)

INT+FP

all	16%
load	25%
store	NA (no store vectorizable/vectorized instructions)
mul	NA (no mul vectorizable/vectorized instructions)
add-sub	12%
fma	NA (no fma vectorizable/vectorized instructions)
div/sqrt	NA (no div/sqrt vectorizable/vectorized instructions)
other	12%

Cycles and memory resources usage

Assuming all data fit into the L1 cache, each call to the function takes 4.75 cycles. At this rate:

5% of peak load performance is reached (6.74 out of 128.00 bytes loaded per cycle (GB/s @ 1GHz))

Front-end bottlenecks

ASM code

In the binary file, the address of the function is: 104d60

Instruction	Nb FU	P0	P1	P2	P3	P4	P5	P6	P7	Latency	Recip. throughput	Vectorization
LEA (%R8,%R8,2),%RSI	1	0	0.50	0	0	0	0.50	0	0	1	0.50	N/A
SAL $0x5,%RSI	1	0.50	0	0	0	0	0	0.50	0	1	0.50	scal (12.5%)
ADD %RSI,%RDX	1	0.25	0.25	0	0	0	0.25	0.25	0	1	0.25	scal (12.5%)
CMP %R9,%R8	1	0.25	0.25	0	0	0	0.25	0.25	0	1	0.25	N/A
JGE 104e58 <t2fv_4+0xf8>	1	0.50	0	0	0	0	0	0.50	0	0	0.50-1	N/A
PUSH %RBP	1	0	0	0.33	0.33	1	0	0	0.33	3	1	N/A
MOV %RDI,%RAX	1	0	0	0	0	0	0	0	0	0	0.25	N/A
VMOVAPD 0x4c180(%RIP),%XMM7	1	0	0	0.50	0.50	0	0	0	0	4-5	0.50	vect (25.0%)
MOV %RSP,%RBP	1	0	0	0	0	0	0	0	0	0	0.25	N/A
MOV 0x10(%RBP),%RDI	1	0	0	0.50	0.50	0	0	0	0	4-5	0.50	N/A
LEA (,%RDI,8),%R10	1	0	0.50	0	0	0	0.50	0	0	1	0.50	N/A
MOV 0x5004a(%RIP),%RDI	1	0	0	0.50	0.50	0	0	0	0	4-5	0.50	N/A
SAL $0x3,%RDI	1	0.50	0	0	0	0	0	0.50	0	1	0.50	N/A
NOPW (%RAX,%RAX,1)	1	0	0	0	0	0	0	0	0	0	0.25	N/A
POP %RBP	1	0	0	0.50	0.50	0	0	0	0	2	0.50	N/A
RET	1	0	0	0.33	0.33	0	0	1	0.33	0	1	N/A
NOPL (%RAX)	1	0	0	0	0	0	0	0	0	0	0.25	N/A
RET	1	0	0	0.33	0.33	0	0	1	0.33	0	1	N/A
NOPL (%RAX)	1	0	0	0	0	0	0	0	0	0	0.25	N/A

Name	Coverage (%)	Time (s)
▼t2fv_4–	34.46	6.54
○Loop 1153 - t2fv_4.c:97-114 - bench	34.22	6.49

Report Configuration

Code clean check

Workaround

Vectorization

Details

Workaround

Execution units bottlenecks

Type of elements and instruction set

Matching between your function (in the source code) and the binary function

General properties

Front-end

Back-end

Cycles summary

Vectorization ratios

Vector efficiency ratios

Cycles and memory resources usage

Front-end bottlenecks

ASM code

Code clean check

Workaround

Vectorization

Details

Workaround

Execution units bottlenecks

Type of elements and instruction set

Matching between your function (in the source code) and the binary function

General properties

Front-end

Back-end

Cycles summary

Vectorization ratios

Vector efficiency ratios

Cycles and memory resources usage

Front-end bottlenecks

ASM code