Function: t2fv_4 | Module: bench | Source: t2fv_4.c:92-120 [...] | Coverage (incl. loops): 34.46% | (excl. loops): 0.24% |
---|
Function: t2fv_4 | Module: bench | Source: t2fv_4.c:92-120 [...] | Coverage (incl. loops): 34.46% | (excl. loops): 0.24% |
---|
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 251 - 251 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 263 - 263 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 275 - 275 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 355 - 355 |
-------------------------------------------------------------------------------- |
/usr/lib/gcc/x86_64-pc-linux-gnu/14.2.1/include/emmintrin.h: 953 - 953 |
-------------------------------------------------------------------------------- |
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../../../simd-support/simd-sse2.h: 113 - 120 |
-------------------------------------------------------------------------------- |
113: return *(const V *)x; |
[...] |
120: *(V *)x = v; |
/home/fmusial/FFTW_Benchmarks/fftw-3.3.10-gcc-sse-128/dft/simd/sse2/../common/t2fv_4.c: 92 - 120 |
-------------------------------------------------------------------------------- |
92: { |
93: { |
94: INT m; |
95: R *x; |
96: x = ri; |
97: for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) { |
98: V T1, T8, T3, T6, T7, T2, T5; |
99: T1 = LD(&(x[0]), ms, &(x[0])); |
100: T7 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); |
101: T8 = BYTWJ(&(W[TWVL * 4]), T7); |
102: T2 = LD(&(x[WS(rs, 2)]), ms, &(x[0])); |
103: T3 = BYTWJ(&(W[TWVL * 2]), T2); |
104: T5 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); |
[...] |
111: ST(&(x[WS(rs, 3)]), VADD(T4, T9), ms, &(x[WS(rs, 1)])); |
112: Ta = VADD(T1, T3); |
113: Tb = VADD(T6, T8); |
114: ST(&(x[WS(rs, 2)]), VSUB(Ta, Tb), ms, &(x[0])); |
[...] |
120: } |
0x104d60 LEA (%R8,%R8,2),%RSI |
0x104d64 SAL $0x5,%RSI |
0x104d68 ADD %RSI,%RDX |
0x104d6b CMP %R9,%R8 |
0x104d6e JGE 104e58 |
0x104d74 PUSH %RBP |
0x104d75 MOV %RDI,%RAX |
0x104d78 VMOVAPD 0x4c180(%RIP),%XMM7 |
0x104d80 MOV %RSP,%RBP |
0x104d83 MOV 0x10(%RBP),%RDI |
0x104d87 LEA (,%RDI,8),%R10 |
0x104d8f MOV 0x5004a(%RIP),%RDI |
0x104d96 SAL $0x3,%RDI |
0x104d9a NOPW (%RAX,%RAX,1) |
(1153) 0x104da0 MOV 0x18(%RCX),%RSI |
(1153) 0x104da4 VMOVAPD (%RAX),%XMM5 |
(1153) 0x104da8 INC %R8 |
(1153) 0x104dab ADD $0x60,%RDX |
(1153) 0x104daf VMOVAPD (%RAX,%RSI,8),%XMM0 |
(1153) 0x104db4 MOV 0x10(%RCX),%RSI |
(1153) 0x104db8 VPERMILPD $0x1,%XMM0,%XMM3 |
(1153) 0x104dbe VMOVAPD (%RAX,%RSI,8),%XMM1 |
(1153) 0x104dc3 MOV 0x8(%RCX),%RSI |
(1153) 0x104dc7 VMULPD -0x20(%RDX),%XMM0,%XMM0 |
(1153) 0x104dcc LEA (%RAX,%RSI,8),%RSI |
(1153) 0x104dd0 VMOVAPD (%RSI),%XMM2 |
(1153) 0x104dd4 VFNMADD132PD -0x10(%RDX),%XMM0,%XMM3 |
(1153) 0x104dda VPERMILPD $0x1,%XMM1,%XMM0 |
(1153) 0x104de0 VMULPD -0x40(%RDX),%XMM1,%XMM1 |
(1153) 0x104de5 VFNMADD132PD -0x30(%RDX),%XMM1,%XMM0 |
(1153) 0x104deb VPERMILPD $0x1,%XMM2,%XMM1 |
(1153) 0x104df1 VMULPD -0x60(%RDX),%XMM2,%XMM2 |
(1153) 0x104df6 VSUBPD %XMM0,%XMM5,%XMM4 |
(1153) 0x104dfa VADDPD %XMM5,%XMM0,%XMM0 |
(1153) 0x104dfe VFNMADD132PD -0x50(%RDX),%XMM2,%XMM1 |
(1153) 0x104e04 VSUBPD %XMM3,%XMM1,%XMM2 |
(1153) 0x104e08 VADDPD %XMM3,%XMM1,%XMM1 |
(1153) 0x104e0c VXORPD %XMM2,%XMM7,%XMM2 |
(1153) 0x104e10 VPERMILPD $0x1,%XMM2,%XMM2 |
(1153) 0x104e16 VSUBPD %XMM2,%XMM4,%XMM6 |
(1153) 0x104e1a VADDPD %XMM2,%XMM4,%XMM4 |
(1153) 0x104e1e VSUBPD %XMM1,%XMM0,%XMM2 |
(1153) 0x104e22 VADDPD %XMM1,%XMM0,%XMM0 |
(1153) 0x104e26 VMOVAPD %XMM6,(%RSI) |
(1153) 0x104e2a MOV 0x18(%RCX),%RSI |
(1153) 0x104e2e VMOVAPD %XMM4,(%RAX,%RSI,8) |
(1153) 0x104e33 MOV 0x10(%RCX),%RSI |
(1153) 0x104e37 ADD %RDI,%RCX |
(1153) 0x104e3a VMOVAPD %XMM2,(%RAX,%RSI,8) |
(1153) 0x104e3f VMOVAPD %XMM0,(%RAX) |
(1153) 0x104e43 ADD %R10,%RAX |
(1153) 0x104e46 CMP %R8,%R9 |
(1153) 0x104e49 JNE 104da0 |
0x104e4f POP %RBP |
0x104e50 RET |
0x104e51 NOPL (%RAX) |
0x104e58 RET |
0x104e59 NOPL (%RAX) |
Coverage (%) | Name | Source Location | Module |
---|---|---|---|
►99.79+ | apply#0x120c50 | dftw-direct.c:51 | bench |
○ | doit | fftw-bench.c:274 | bench |
○ | speed | bench | |
○ | bench_main | bench | |
○ | __libc_init_first | libc.so.6 | |
○ | __libc_start_main | libc.so.6 | |
○ | _start | hook.c:185 | bench |
min | med | avg | max |
---|---|---|---|
Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
---|---|---|---|---|---|---|---|---|---|---|
Value |
min | med | avg | max |
---|---|---|---|
Percentile Index | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
---|---|---|---|---|---|---|---|---|---|---|
Value |
Path / |
The code analyzed by CQA in that panel excludes loops and represents 0.24% of application time for run run_0
Source file and lines | t2fv_4.c:92-120 |
Module | bench |
nb instructions | 19 |
nb uops | 19 |
loop length | 81 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 4.75 cycles |
front end | 4.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.00 | 2.33 | 2.33 | 1.00 | 2.00 | 2.50 | 2.33 |
cycles | 2.50 | 2.00 | 2.33 | 2.33 | 1.00 | 2.00 | 2.50 | 2.33 |
Cycles executing div or sqrt instructions | NA |
Front-end | 4.75 |
Dispatch | 2.50 |
Overall L1 | 4.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 100% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 33% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 25% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 16% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
---|---|---|---|---|---|---|---|---|---|---|---|---|
LEA (%R8,%R8,2),%RSI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
SAL $0x5,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
ADD %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
CMP %R9,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
JGE 104e58 <t2fv_4+0xf8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
VMOVAPD 0x4c180(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
MOV 0x10(%RBP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
LEA (,%RDI,8),%R10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
MOV 0x5004a(%RIP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
SAL $0x3,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
The code analyzed by CQA in that panel excludes loops and represents 0.24% of application time for run run_0
Source file and lines | t2fv_4.c:92-120 |
Module | bench |
nb instructions | 19 |
nb uops | 19 |
loop length | 81 |
used x86 registers | 9 |
used mmx registers | 0 |
used xmm registers | 1 |
used ymm registers | 0 |
used zmm registers | 0 |
nb stack references | 1 |
micro-operation queue | 4.75 cycles |
front end | 4.75 cycles |
P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | |
---|---|---|---|---|---|---|---|---|
uops | 2.50 | 2.00 | 2.33 | 2.33 | 1.00 | 2.00 | 2.50 | 2.33 |
cycles | 2.50 | 2.00 | 2.33 | 2.33 | 1.00 | 2.00 | 2.50 | 2.33 |
Cycles executing div or sqrt instructions | NA |
Front-end | 4.75 |
Dispatch | 2.50 |
Overall L1 | 4.75 |
all | 0% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 0% |
all | 100% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 33% |
load | 100% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 0% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 0% |
all | 12% |
load | NA (no load vectorizable/vectorized instructions) |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
other | 12% |
all | 25% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | NA (no add-sub vectorizable/vectorized instructions) |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | NA (no other vectorizable/vectorized instructions) |
all | 16% |
load | 25% |
store | NA (no store vectorizable/vectorized instructions) |
mul | NA (no mul vectorizable/vectorized instructions) |
add-sub | 12% |
fma | NA (no fma vectorizable/vectorized instructions) |
div/sqrt | NA (no div/sqrt vectorizable/vectorized instructions) |
other | 12% |
Instruction | Nb FU | P0 | P1 | P2 | P3 | P4 | P5 | P6 | P7 | Latency | Recip. throughput | Vectorization |
---|---|---|---|---|---|---|---|---|---|---|---|---|
LEA (%R8,%R8,2),%RSI | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
SAL $0x5,%RSI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | scal (12.5%) |
ADD %RSI,%RDX | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | scal (12.5%) |
CMP %R9,%R8 | 1 | 0.25 | 0.25 | 0 | 0 | 0 | 0.25 | 0.25 | 0 | 1 | 0.25 | N/A |
JGE 104e58 <t2fv_4+0xf8> | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 0 | 0.50-1 | N/A |
PUSH %RBP | 1 | 0 | 0 | 0.33 | 0.33 | 1 | 0 | 0 | 0.33 | 3 | 1 | N/A |
MOV %RDI,%RAX | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
VMOVAPD 0x4c180(%RIP),%XMM7 | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | vect (25.0%) |
MOV %RSP,%RBP | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
MOV 0x10(%RBP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
LEA (,%RDI,8),%R10 | 1 | 0 | 0.50 | 0 | 0 | 0 | 0.50 | 0 | 0 | 1 | 0.50 | N/A |
MOV 0x5004a(%RIP),%RDI | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 4-5 | 0.50 | N/A |
SAL $0x3,%RDI | 1 | 0.50 | 0 | 0 | 0 | 0 | 0 | 0.50 | 0 | 1 | 0.50 | N/A |
NOPW (%RAX,%RAX,1) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
POP %RBP | 1 | 0 | 0 | 0.50 | 0.50 | 0 | 0 | 0 | 0 | 2 | 0.50 | N/A |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
RET | 1 | 0 | 0 | 0.33 | 0.33 | 0 | 0 | 1 | 0.33 | 0 | 1 | N/A |
NOPL (%RAX) | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.25 | N/A |
Name | Coverage (%) | Time (s) |
---|---|---|
▼t2fv_4– | 34.46 | 6.54 |
○Loop 1153 - t2fv_4.c:97-114 - bench | 34.22 | 6.49 |