Coverage Report

Created: 2026-01-16 07:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ffmpeg/libavutil/tx_template.c
Line
Count
Source
1
/*
2
 * Copyright (c) Lynne
3
 *
4
 * Power of two FFT:
5
 * Copyright (c) Lynne
6
 * Copyright (c) 2008 Loren Merritt
7
 * Copyright (c) 2002 Fabrice Bellard
8
 * Partly based on libdjbfft by D. J. Bernstein
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or
13
 * modify it under the terms of the GNU Lesser General Public
14
 * License as published by the Free Software Foundation; either
15
 * version 2.1 of the License, or (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 * Lesser General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU Lesser General Public
23
 * License along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 */
26
27
#include "mem.h"
28
29
#define TABLE_DEF(name, size) \
30
    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
31
32
#define SR_POW2_TABLES \
33
    SR_TABLE(8)        \
34
    SR_TABLE(16)       \
35
    SR_TABLE(32)       \
36
    SR_TABLE(64)       \
37
    SR_TABLE(128)      \
38
    SR_TABLE(256)      \
39
    SR_TABLE(512)      \
40
    SR_TABLE(1024)     \
41
    SR_TABLE(2048)     \
42
    SR_TABLE(4096)     \
43
    SR_TABLE(8192)     \
44
    SR_TABLE(16384)    \
45
    SR_TABLE(32768)    \
46
    SR_TABLE(65536)    \
47
    SR_TABLE(131072)   \
48
49
#define SR_TABLE(len) \
50
    TABLE_DEF(len, len/4 + 1);
51
/* Power of two tables */
52
SR_POW2_TABLES
53
#undef SR_TABLE
54
55
/* Other factors' tables */
56
TABLE_DEF(53, 12);
57
TABLE_DEF( 7,  6);
58
TABLE_DEF( 9,  8);
59
60
typedef struct FFTabInitData {
61
    void (*func)(void);
62
    int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
63
} FFTabInitData;
64
65
#define SR_TABLE(len)                                              \
66
221
static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void)            \
67
221
{                                                                  \
68
221
    double freq = 2*M_PI/len;                                      \
69
221
    TXSample *tab = TX_TAB(ff_tx_tab_ ##len);                      \
70
221
                                                                   \
71
13.6k
    for (int i = 0; i < len/4; i++)                                \
72
13.4k
        *tab++ = RESCALE(cos(i*freq));                             \
73
221
                                                                   \
74
221
    *tab = 0;                                                      \
75
221
}
76
853
SR_POW2_TABLES
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_8_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_16_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_32_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_64_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_128_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_256_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_512_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_1024_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_2048_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_4096_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_8192_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_16384_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_32768_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_65536_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_131072_double
tx_float.c:ff_tx_init_tab_8_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_16_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_32_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_64_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_128_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_256_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_512_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_1024_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_2048_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_4096_float
Line
Count
Source
76
SR_POW2_TABLES
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_8192_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_16384_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_32768_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_65536_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_131072_float
tx_int32.c:ff_tx_init_tab_8_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_16_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_32_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_64_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_128_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_256_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_512_int32
Line
Count
Source
76
SR_POW2_TABLES
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_1024_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_2048_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_4096_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_8192_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_16384_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_32768_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_65536_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_131072_int32
77
853
#undef SR_TABLE
78
853
79
853
static void (*const sr_tabs_init_funcs[])(void) = {
80
853
#define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
81
853
    SR_POW2_TABLES
82
853
#undef SR_TABLE
83
853
};
84
853
85
853
static AVOnce sr_tabs_init_once[] = {
86
853
#define SR_TABLE(len) AV_ONCE_INIT,
87
853
    SR_POW2_TABLES
88
853
#undef SR_TABLE
89
853
};
90
853
91
853
static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
92
853
{
93
    /* 5pt, doubled to eliminate AVX lane shuffles */
94
7
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
95
7
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
96
7
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
97
7
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
98
7
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
99
7
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
100
7
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
101
7
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
102
103
    /* 3pt */
104
7
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
105
7
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
106
7
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
107
7
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
108
7
}
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_53_double
tx_float.c:ff_tx_init_tab_53_float
Line
Count
Source
92
6
{
93
    /* 5pt, doubled to eliminate AVX lane shuffles */
94
6
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
95
6
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
96
6
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
97
6
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
98
6
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
99
6
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
100
6
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
101
6
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
102
103
    /* 3pt */
104
6
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
105
6
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
106
6
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
107
6
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
108
6
}
tx_int32.c:ff_tx_init_tab_53_int32
Line
Count
Source
92
1
{
93
    /* 5pt, doubled to eliminate AVX lane shuffles */
94
1
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
95
1
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
96
1
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
97
1
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
98
1
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
99
1
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
100
1
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
101
1
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
102
103
    /* 3pt */
104
1
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
105
1
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
106
1
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
107
1
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
108
1
}
109
110
static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
111
1
{
112
1
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
113
1
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
114
1
    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
115
1
    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
116
1
    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
117
1
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
118
1
}
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_7_double
tx_float.c:ff_tx_init_tab_7_float
Line
Count
Source
111
1
{
112
1
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
113
1
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
114
1
    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
115
1
    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
116
1
    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
117
1
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
118
1
}
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_7_int32
119
120
static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
121
1
{
122
1
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
123
1
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
124
1
    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
125
1
    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
126
1
    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
127
1
    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
128
1
    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
129
1
    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
130
1
}
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_9_double
tx_float.c:ff_tx_init_tab_9_float
Line
Count
Source
121
1
{
122
1
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
123
1
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
124
1
    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
125
1
    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
126
1
    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
127
1
    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
128
1
    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
129
1
    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
130
1
}
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_9_int32
131
132
static const FFTabInitData nptwo_tabs_init_data[] = {
133
    { TX_TAB(ff_tx_init_tab_53),      { 15, 5, 3 } },
134
    { TX_TAB(ff_tx_init_tab_9),       {  9 }       },
135
    { TX_TAB(ff_tx_init_tab_7),       {  7 }       },
136
};
137
138
static AVOnce nptwo_tabs_init_once[] = {
139
    AV_ONCE_INIT,
140
    AV_ONCE_INIT,
141
    AV_ONCE_INIT,
142
};
143
144
av_cold void TX_TAB(ff_tx_init_tabs)(int len)
145
1.37M
{
146
1.37M
    int factor_2 = ff_ctz(len);
147
1.37M
    if (factor_2) {
148
990k
        int idx = factor_2 - 3;
149
4.19M
        for (int i = 0; i <= idx; i++)
150
3.20M
            ff_thread_once(&sr_tabs_init_once[i],
151
990k
                            sr_tabs_init_funcs[i]);
152
990k
        len >>= factor_2;
153
990k
    }
154
155
1.75M
    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
156
1.75M
        int f, f_idx = 0;
157
158
1.75M
        if (len <= 1)
159
1.37M
            return;
160
161
390k
        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
162
387k
            if (f % len)
163
5.54k
                continue;
164
165
382k
            ff_thread_once(&nptwo_tabs_init_once[i],
166
382k
                            nptwo_tabs_init_data[i].func);
167
382k
            len /= f;
168
382k
            break;
169
387k
        }
170
384k
    }
171
1.37M
}
Unexecuted instantiation: ff_tx_init_tabs_double
ff_tx_init_tabs_float
Line
Count
Source
145
1.19M
{
146
1.19M
    int factor_2 = ff_ctz(len);
147
1.19M
    if (factor_2) {
148
843k
        int idx = factor_2 - 3;
149
3.53M
        for (int i = 0; i <= idx; i++)
150
2.69M
            ff_thread_once(&sr_tabs_init_once[i],
151
843k
                            sr_tabs_init_funcs[i]);
152
843k
        len >>= factor_2;
153
843k
    }
154
155
1.55M
    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
156
1.54M
        int f, f_idx = 0;
157
158
1.54M
        if (len <= 1)
159
1.19M
            return;
160
161
360k
        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
162
357k
            if (f % len)
163
5.54k
                continue;
164
165
352k
            ff_thread_once(&nptwo_tabs_init_once[i],
166
352k
                            nptwo_tabs_init_data[i].func);
167
352k
            len /= f;
168
352k
            break;
169
357k
        }
170
354k
    }
171
1.19M
}
ff_tx_init_tabs_int32
Line
Count
Source
145
176k
{
146
176k
    int factor_2 = ff_ctz(len);
147
176k
    if (factor_2) {
148
146k
        int idx = factor_2 - 3;
149
661k
        for (int i = 0; i <= idx; i++)
150
514k
            ff_thread_once(&sr_tabs_init_once[i],
151
146k
                            sr_tabs_init_funcs[i]);
152
146k
        len >>= factor_2;
153
146k
    }
154
155
206k
    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
156
206k
        int f, f_idx = 0;
157
158
206k
        if (len <= 1)
159
176k
            return;
160
161
29.8k
        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
162
29.8k
            if (f % len)
163
0
                continue;
164
165
29.8k
            ff_thread_once(&nptwo_tabs_init_once[i],
166
29.8k
                            nptwo_tabs_init_data[i].func);
167
29.8k
            len /= f;
168
29.8k
            break;
169
29.8k
        }
170
29.8k
    }
171
176k
}
172
173
static av_always_inline void fft3(TXComplex *out, TXComplex *in,
174
                                  ptrdiff_t stride)
175
95.8M
{
176
95.8M
    TXComplex tmp[3];
177
95.8M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
178
#ifdef TX_INT32
179
    int64_t mtmp[4];
180
#endif
181
182
95.8M
    tmp[0] = in[0];
183
95.8M
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
184
95.8M
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
185
186
#ifdef TX_INT32
187
    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
188
    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
189
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
190
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
191
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
192
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
193
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
194
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
195
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
196
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
197
#else
198
    out[0*stride].re = tmp[0].re + tmp[2].re;
199
    out[0*stride].im = tmp[0].im + tmp[2].im;
200
    tmp[1].re = tab[ 8] * tmp[1].re;
201
    tmp[1].im = tab[ 9] * tmp[1].im;
202
    tmp[2].re = tab[10] * tmp[2].re;
203
    tmp[2].im = tab[10] * tmp[2].im;
204
    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
205
    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
206
    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
207
    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
208
#endif
209
95.8M
}
Unexecuted instantiation: tx_double.c:fft3
tx_float.c:fft3
Line
Count
Source
175
94.8M
{
176
94.8M
    TXComplex tmp[3];
177
94.8M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
178
#ifdef TX_INT32
179
    int64_t mtmp[4];
180
#endif
181
182
94.8M
    tmp[0] = in[0];
183
94.8M
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
184
94.8M
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
185
186
#ifdef TX_INT32
187
    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
188
    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
189
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
190
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
191
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
192
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
193
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
194
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
195
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
196
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
197
#else
198
94.8M
    out[0*stride].re = tmp[0].re + tmp[2].re;
199
94.8M
    out[0*stride].im = tmp[0].im + tmp[2].im;
200
94.8M
    tmp[1].re = tab[ 8] * tmp[1].re;
201
94.8M
    tmp[1].im = tab[ 9] * tmp[1].im;
202
94.8M
    tmp[2].re = tab[10] * tmp[2].re;
203
94.8M
    tmp[2].im = tab[10] * tmp[2].im;
204
94.8M
    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
205
94.8M
    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
206
94.8M
    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
207
94.8M
    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
208
94.8M
#endif
209
94.8M
}
tx_int32.c:fft3
Line
Count
Source
175
1.06M
{
176
1.06M
    TXComplex tmp[3];
177
1.06M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
178
1.06M
#ifdef TX_INT32
179
1.06M
    int64_t mtmp[4];
180
1.06M
#endif
181
182
1.06M
    tmp[0] = in[0];
183
1.06M
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
184
1.06M
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
185
186
1.06M
#ifdef TX_INT32
187
1.06M
    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
188
1.06M
    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
189
1.06M
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
190
1.06M
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
191
1.06M
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
192
1.06M
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
193
1.06M
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
194
1.06M
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
195
1.06M
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
196
1.06M
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
197
#else
198
    out[0*stride].re = tmp[0].re + tmp[2].re;
199
    out[0*stride].im = tmp[0].im + tmp[2].im;
200
    tmp[1].re = tab[ 8] * tmp[1].re;
201
    tmp[1].im = tab[ 9] * tmp[1].im;
202
    tmp[2].re = tab[10] * tmp[2].re;
203
    tmp[2].im = tab[10] * tmp[2].im;
204
    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
205
    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
206
    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
207
    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
208
#endif
209
1.06M
}
210
211
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                         \
212
static av_always_inline void NAME(TXComplex *out, TXComplex *in,    \
213
52.8M
                                  ptrdiff_t stride)                 \
214
52.8M
{                                                                   \
215
52.8M
    TXComplex dc, z0[4], t[6];                                      \
216
52.8M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
52.8M
                                                                    \
218
52.8M
    dc = in[0];                                                     \
219
52.8M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
52.8M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
52.8M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
52.8M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
52.8M
                                                                    \
224
52.8M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
52.8M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
52.8M
                                                                    \
227
52.8M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
52.8M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
52.8M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
52.8M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
52.8M
                                                                    \
232
52.8M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
52.8M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
52.8M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
52.8M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
52.8M
                                                                    \
237
52.8M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
52.8M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
52.8M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
52.8M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
52.8M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
52.8M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
52.8M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
52.8M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
52.8M
}
Unexecuted instantiation: tx_double.c:fft5
Unexecuted instantiation: tx_double.c:fft5_m1
Unexecuted instantiation: tx_double.c:fft5_m2
Unexecuted instantiation: tx_double.c:fft5_m3
tx_float.c:fft5
Line
Count
Source
213
39.6M
                                  ptrdiff_t stride)                 \
214
39.6M
{                                                                   \
215
39.6M
    TXComplex dc, z0[4], t[6];                                      \
216
39.6M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
39.6M
                                                                    \
218
39.6M
    dc = in[0];                                                     \
219
39.6M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
39.6M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
39.6M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
39.6M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
39.6M
                                                                    \
224
39.6M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
39.6M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
39.6M
                                                                    \
227
39.6M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
39.6M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
39.6M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
39.6M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
39.6M
                                                                    \
232
39.6M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
39.6M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
39.6M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
39.6M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
39.6M
                                                                    \
237
39.6M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
39.6M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
39.6M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
39.6M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
39.6M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
39.6M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
39.6M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
39.6M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
39.6M
}
tx_float.c:fft5_m1
Line
Count
Source
213
4.17M
                                  ptrdiff_t stride)                 \
214
4.17M
{                                                                   \
215
4.17M
    TXComplex dc, z0[4], t[6];                                      \
216
4.17M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
4.17M
                                                                    \
218
4.17M
    dc = in[0];                                                     \
219
4.17M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
4.17M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
4.17M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
4.17M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
4.17M
                                                                    \
224
4.17M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
4.17M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
4.17M
                                                                    \
227
4.17M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
4.17M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
4.17M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
4.17M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
4.17M
                                                                    \
232
4.17M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
4.17M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
4.17M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
4.17M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
4.17M
                                                                    \
237
4.17M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
4.17M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
4.17M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
4.17M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
4.17M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
4.17M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
4.17M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
4.17M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
4.17M
}
tx_float.c:fft5_m2
Line
Count
Source
213
4.17M
                                  ptrdiff_t stride)                 \
214
4.17M
{                                                                   \
215
4.17M
    TXComplex dc, z0[4], t[6];                                      \
216
4.17M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
4.17M
                                                                    \
218
4.17M
    dc = in[0];                                                     \
219
4.17M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
4.17M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
4.17M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
4.17M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
4.17M
                                                                    \
224
4.17M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
4.17M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
4.17M
                                                                    \
227
4.17M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
4.17M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
4.17M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
4.17M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
4.17M
                                                                    \
232
4.17M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
4.17M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
4.17M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
4.17M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
4.17M
                                                                    \
237
4.17M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
4.17M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
4.17M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
4.17M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
4.17M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
4.17M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
4.17M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
4.17M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
4.17M
}
tx_float.c:fft5_m3
Line
Count
Source
213
4.17M
                                  ptrdiff_t stride)                 \
214
4.17M
{                                                                   \
215
4.17M
    TXComplex dc, z0[4], t[6];                                      \
216
4.17M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
4.17M
                                                                    \
218
4.17M
    dc = in[0];                                                     \
219
4.17M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
4.17M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
4.17M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
4.17M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
4.17M
                                                                    \
224
4.17M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
4.17M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
4.17M
                                                                    \
227
4.17M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
4.17M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
4.17M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
4.17M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
4.17M
                                                                    \
232
4.17M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
4.17M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
4.17M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
4.17M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
4.17M
                                                                    \
237
4.17M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
4.17M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
4.17M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
4.17M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
4.17M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
4.17M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
4.17M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
4.17M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
4.17M
}
Unexecuted instantiation: tx_int32.c:fft5
tx_int32.c:fft5_m1
Line
Count
Source
213
213k
                                  ptrdiff_t stride)                 \
214
213k
{                                                                   \
215
213k
    TXComplex dc, z0[4], t[6];                                      \
216
213k
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
213k
                                                                    \
218
213k
    dc = in[0];                                                     \
219
213k
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
213k
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
213k
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
213k
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
213k
                                                                    \
224
213k
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
213k
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
213k
                                                                    \
227
213k
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
213k
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
213k
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
213k
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
213k
                                                                    \
232
213k
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
213k
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
213k
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
213k
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
213k
                                                                    \
237
213k
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
213k
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
213k
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
213k
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
213k
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
213k
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
213k
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
213k
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
213k
}
tx_int32.c:fft5_m2
Line
Count
Source
213
213k
                                  ptrdiff_t stride)                 \
214
213k
{                                                                   \
215
213k
    TXComplex dc, z0[4], t[6];                                      \
216
213k
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
213k
                                                                    \
218
213k
    dc = in[0];                                                     \
219
213k
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
213k
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
213k
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
213k
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
213k
                                                                    \
224
213k
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
213k
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
213k
                                                                    \
227
213k
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
213k
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
213k
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
213k
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
213k
                                                                    \
232
213k
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
213k
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
213k
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
213k
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
213k
                                                                    \
237
213k
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
213k
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
213k
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
213k
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
213k
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
213k
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
213k
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
213k
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
213k
}
tx_int32.c:fft5_m3
Line
Count
Source
213
213k
                                  ptrdiff_t stride)                 \
214
213k
{                                                                   \
215
213k
    TXComplex dc, z0[4], t[6];                                      \
216
213k
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
213k
                                                                    \
218
213k
    dc = in[0];                                                     \
219
213k
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
213k
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
213k
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
213k
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
213k
                                                                    \
224
213k
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
213k
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
213k
                                                                    \
227
213k
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
213k
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
213k
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
213k
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
213k
                                                                    \
232
213k
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
213k
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
213k
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
213k
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
213k
                                                                    \
237
213k
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
213k
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
213k
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
213k
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
213k
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
213k
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
213k
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
213k
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
213k
}
246
247
DECL_FFT5(fft5,     0,  1,  2,  3,  4)
248
DECL_FFT5(fft5_m1,  0,  6, 12,  3,  9)
249
DECL_FFT5(fft5_m2, 10,  1,  7, 13,  4)
250
DECL_FFT5(fft5_m3,  5, 11,  2,  8, 14)
251
252
static av_always_inline void fft7(TXComplex *out, TXComplex *in,
253
                                  ptrdiff_t stride)
254
6.81M
{
255
6.81M
    TXComplex dc, t[6], z[3];
256
6.81M
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
257
#ifdef TX_INT32
258
    int64_t mtmp[12];
259
#endif
260
261
6.81M
    dc = in[0];
262
6.81M
    BF(t[1].re, t[0].re, in[1].re, in[6].re);
263
6.81M
    BF(t[1].im, t[0].im, in[1].im, in[6].im);
264
6.81M
    BF(t[3].re, t[2].re, in[2].re, in[5].re);
265
6.81M
    BF(t[3].im, t[2].im, in[2].im, in[5].im);
266
6.81M
    BF(t[5].re, t[4].re, in[3].re, in[4].re);
267
6.81M
    BF(t[5].im, t[4].im, in[3].im, in[4].im);
268
269
6.81M
    out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
270
6.81M
    out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
271
272
#ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
273
    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
274
    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
275
    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
276
    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
277
    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
278
    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
279
280
    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
281
    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
282
    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
283
    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
284
    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
285
    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
286
287
    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
288
    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
289
    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
290
    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
291
    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
292
    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
293
294
    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
295
    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
296
    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
297
    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
298
    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
299
    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
300
#else
301
    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
302
    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
303
    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
304
    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
305
    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
306
    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
307
308
    /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
309
     * multiplying the sum of all with the average of the twiddles */
310
311
    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
312
    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
313
    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
314
    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
315
    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
316
    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
317
#endif
318
319
6.81M
    BF(t[1].re, z[0].re, z[0].re, t[4].re);
320
6.81M
    BF(t[3].re, z[1].re, z[1].re, t[2].re);
321
6.81M
    BF(t[5].re, z[2].re, z[2].re, t[0].re);
322
6.81M
    BF(t[1].im, z[0].im, z[0].im, t[0].im);
323
6.81M
    BF(t[3].im, z[1].im, z[1].im, t[2].im);
324
6.81M
    BF(t[5].im, z[2].im, z[2].im, t[4].im);
325
326
6.81M
    out[1*stride].re = dc.re + z[0].re;
327
6.81M
    out[1*stride].im = dc.im + t[1].im;
328
6.81M
    out[2*stride].re = dc.re + t[3].re;
329
6.81M
    out[2*stride].im = dc.im + z[1].im;
330
6.81M
    out[3*stride].re = dc.re + z[2].re;
331
6.81M
    out[3*stride].im = dc.im + t[5].im;
332
6.81M
    out[4*stride].re = dc.re + t[5].re;
333
6.81M
    out[4*stride].im = dc.im + z[2].im;
334
6.81M
    out[5*stride].re = dc.re + z[1].re;
335
6.81M
    out[5*stride].im = dc.im + t[3].im;
336
6.81M
    out[6*stride].re = dc.re + t[1].re;
337
6.81M
    out[6*stride].im = dc.im + z[0].im;
338
6.81M
}
Unexecuted instantiation: tx_double.c:fft7
tx_float.c:fft7
Line
Count
Source
254
6.81M
{
255
6.81M
    TXComplex dc, t[6], z[3];
256
6.81M
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
257
#ifdef TX_INT32
258
    int64_t mtmp[12];
259
#endif
260
261
6.81M
    dc = in[0];
262
6.81M
    BF(t[1].re, t[0].re, in[1].re, in[6].re);
263
6.81M
    BF(t[1].im, t[0].im, in[1].im, in[6].im);
264
6.81M
    BF(t[3].re, t[2].re, in[2].re, in[5].re);
265
6.81M
    BF(t[3].im, t[2].im, in[2].im, in[5].im);
266
6.81M
    BF(t[5].re, t[4].re, in[3].re, in[4].re);
267
6.81M
    BF(t[5].im, t[4].im, in[3].im, in[4].im);
268
269
6.81M
    out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
270
6.81M
    out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
271
272
#ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
273
    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
274
    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
275
    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
276
    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
277
    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
278
    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
279
280
    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
281
    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
282
    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
283
    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
284
    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
285
    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
286
287
    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
288
    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
289
    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
290
    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
291
    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
292
    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
293
294
    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
295
    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
296
    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
297
    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
298
    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
299
    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
300
#else
301
6.81M
    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
302
6.81M
    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
303
6.81M
    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
304
6.81M
    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
305
6.81M
    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
306
6.81M
    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
307
308
    /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
309
     * multiplying the sum of all with the average of the twiddles */
310
311
6.81M
    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
312
6.81M
    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
313
6.81M
    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
314
6.81M
    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
315
6.81M
    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
316
6.81M
    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
317
6.81M
#endif
318
319
6.81M
    BF(t[1].re, z[0].re, z[0].re, t[4].re);
320
6.81M
    BF(t[3].re, z[1].re, z[1].re, t[2].re);
321
6.81M
    BF(t[5].re, z[2].re, z[2].re, t[0].re);
322
6.81M
    BF(t[1].im, z[0].im, z[0].im, t[0].im);
323
6.81M
    BF(t[3].im, z[1].im, z[1].im, t[2].im);
324
6.81M
    BF(t[5].im, z[2].im, z[2].im, t[4].im);
325
326
6.81M
    out[1*stride].re = dc.re + z[0].re;
327
6.81M
    out[1*stride].im = dc.im + t[1].im;
328
6.81M
    out[2*stride].re = dc.re + t[3].re;
329
6.81M
    out[2*stride].im = dc.im + z[1].im;
330
6.81M
    out[3*stride].re = dc.re + z[2].re;
331
6.81M
    out[3*stride].im = dc.im + t[5].im;
332
6.81M
    out[4*stride].re = dc.re + t[5].re;
333
6.81M
    out[4*stride].im = dc.im + z[2].im;
334
6.81M
    out[5*stride].re = dc.re + z[1].re;
335
6.81M
    out[5*stride].im = dc.im + t[3].im;
336
6.81M
    out[6*stride].re = dc.re + t[1].re;
337
6.81M
    out[6*stride].im = dc.im + z[0].im;
338
6.81M
}
Unexecuted instantiation: tx_int32.c:fft7
339
340
static av_always_inline void fft9(TXComplex *out, TXComplex *in,
341
                                  ptrdiff_t stride)
342
5.30M
{
343
5.30M
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
344
5.30M
    TXComplex dc, t[16], w[4], x[5], y[5], z[2];
345
#ifdef TX_INT32
346
    int64_t mtmp[12];
347
#endif
348
349
5.30M
    dc = in[0];
350
5.30M
    BF(t[1].re, t[0].re, in[1].re, in[8].re);
351
5.30M
    BF(t[1].im, t[0].im, in[1].im, in[8].im);
352
5.30M
    BF(t[3].re, t[2].re, in[2].re, in[7].re);
353
5.30M
    BF(t[3].im, t[2].im, in[2].im, in[7].im);
354
5.30M
    BF(t[5].re, t[4].re, in[3].re, in[6].re);
355
5.30M
    BF(t[5].im, t[4].im, in[3].im, in[6].im);
356
5.30M
    BF(t[7].re, t[6].re, in[4].re, in[5].re);
357
5.30M
    BF(t[7].im, t[6].im, in[4].im, in[5].im);
358
359
5.30M
    w[0].re = t[0].re - t[6].re;
360
5.30M
    w[0].im = t[0].im - t[6].im;
361
5.30M
    w[1].re = t[2].re - t[6].re;
362
5.30M
    w[1].im = t[2].im - t[6].im;
363
5.30M
    w[2].re = t[1].re - t[7].re;
364
5.30M
    w[2].im = t[1].im - t[7].im;
365
5.30M
    w[3].re = t[3].re + t[7].re;
366
5.30M
    w[3].im = t[3].im + t[7].im;
367
368
5.30M
    z[0].re = dc.re + t[4].re;
369
5.30M
    z[0].im = dc.im + t[4].im;
370
371
5.30M
    z[1].re = t[0].re + t[2].re + t[6].re;
372
5.30M
    z[1].im = t[0].im + t[2].im + t[6].im;
373
374
5.30M
    out[0*stride].re = z[0].re + z[1].re;
375
5.30M
    out[0*stride].im = z[0].im + z[1].im;
376
377
#ifdef TX_INT32
378
    mtmp[0] = t[1].re - t[3].re + t[7].re;
379
    mtmp[1] = t[1].im - t[3].im + t[7].im;
380
381
    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
382
    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
383
384
    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
385
    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
386
    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
387
    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
388
389
    x[3].re = z[0].re  + (int32_t)mtmp[0];
390
    x[3].im = z[0].im  + (int32_t)mtmp[1];
391
    z[0].re = in[0].re + (int32_t)mtmp[2];
392
    z[0].im = in[0].im + (int32_t)mtmp[3];
393
394
    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
395
    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
396
    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
397
    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
398
    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
399
    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
400
    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
401
    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
402
403
    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
404
    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
405
    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
406
    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
407
    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
408
    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
409
    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
410
    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
411
412
    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
413
    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
414
415
#else
416
    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
417
    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
418
419
    x[3].re = z[0].re  + tab[0].re*z[1].re;
420
    x[3].im = z[0].im  + tab[0].re*z[1].im;
421
    z[0].re = dc.re + tab[0].re*t[4].re;
422
    z[0].im = dc.im + tab[0].re*t[4].im;
423
424
    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
425
    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
426
    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
427
    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
428
    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
429
    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
430
    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
431
    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
432
433
    y[0].re = tab[0].im*t[5].re;
434
    y[0].im = tab[0].im*t[5].im;
435
#endif
436
437
5.30M
    x[4].re = x[1].re + x[2].re;
438
5.30M
    x[4].im = x[1].im + x[2].im;
439
440
5.30M
    y[4].re = y[1].re - y[2].re;
441
5.30M
    y[4].im = y[1].im - y[2].im;
442
5.30M
    x[1].re = z[0].re + x[1].re;
443
5.30M
    x[1].im = z[0].im + x[1].im;
444
5.30M
    y[1].re = y[0].re + y[1].re;
445
5.30M
    y[1].im = y[0].im + y[1].im;
446
5.30M
    x[2].re = z[0].re + x[2].re;
447
5.30M
    x[2].im = z[0].im + x[2].im;
448
5.30M
    y[2].re = y[2].re - y[0].re;
449
5.30M
    y[2].im = y[2].im - y[0].im;
450
5.30M
    x[4].re = z[0].re - x[4].re;
451
5.30M
    x[4].im = z[0].im - x[4].im;
452
5.30M
    y[4].re = y[0].re - y[4].re;
453
5.30M
    y[4].im = y[0].im - y[4].im;
454
455
5.30M
    out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
456
5.30M
    out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
457
5.30M
    out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
458
5.30M
    out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
459
5.30M
    out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
460
5.30M
    out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
461
5.30M
    out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
462
5.30M
    out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
463
5.30M
}
Unexecuted instantiation: tx_double.c:fft9
tx_float.c:fft9
Line
Count
Source
342
5.30M
{
343
5.30M
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
344
5.30M
    TXComplex dc, t[16], w[4], x[5], y[5], z[2];
345
#ifdef TX_INT32
346
    int64_t mtmp[12];
347
#endif
348
349
5.30M
    dc = in[0];
350
5.30M
    BF(t[1].re, t[0].re, in[1].re, in[8].re);
351
5.30M
    BF(t[1].im, t[0].im, in[1].im, in[8].im);
352
5.30M
    BF(t[3].re, t[2].re, in[2].re, in[7].re);
353
5.30M
    BF(t[3].im, t[2].im, in[2].im, in[7].im);
354
5.30M
    BF(t[5].re, t[4].re, in[3].re, in[6].re);
355
5.30M
    BF(t[5].im, t[4].im, in[3].im, in[6].im);
356
5.30M
    BF(t[7].re, t[6].re, in[4].re, in[5].re);
357
5.30M
    BF(t[7].im, t[6].im, in[4].im, in[5].im);
358
359
5.30M
    w[0].re = t[0].re - t[6].re;
360
5.30M
    w[0].im = t[0].im - t[6].im;
361
5.30M
    w[1].re = t[2].re - t[6].re;
362
5.30M
    w[1].im = t[2].im - t[6].im;
363
5.30M
    w[2].re = t[1].re - t[7].re;
364
5.30M
    w[2].im = t[1].im - t[7].im;
365
5.30M
    w[3].re = t[3].re + t[7].re;
366
5.30M
    w[3].im = t[3].im + t[7].im;
367
368
5.30M
    z[0].re = dc.re + t[4].re;
369
5.30M
    z[0].im = dc.im + t[4].im;
370
371
5.30M
    z[1].re = t[0].re + t[2].re + t[6].re;
372
5.30M
    z[1].im = t[0].im + t[2].im + t[6].im;
373
374
5.30M
    out[0*stride].re = z[0].re + z[1].re;
375
5.30M
    out[0*stride].im = z[0].im + z[1].im;
376
377
#ifdef TX_INT32
378
    mtmp[0] = t[1].re - t[3].re + t[7].re;
379
    mtmp[1] = t[1].im - t[3].im + t[7].im;
380
381
    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
382
    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
383
384
    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
385
    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
386
    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
387
    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
388
389
    x[3].re = z[0].re  + (int32_t)mtmp[0];
390
    x[3].im = z[0].im  + (int32_t)mtmp[1];
391
    z[0].re = in[0].re + (int32_t)mtmp[2];
392
    z[0].im = in[0].im + (int32_t)mtmp[3];
393
394
    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
395
    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
396
    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
397
    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
398
    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
399
    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
400
    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
401
    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
402
403
    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
404
    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
405
    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
406
    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
407
    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
408
    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
409
    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
410
    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
411
412
    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
413
    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
414
415
#else
416
5.30M
    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
417
5.30M
    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
418
419
5.30M
    x[3].re = z[0].re  + tab[0].re*z[1].re;
420
5.30M
    x[3].im = z[0].im  + tab[0].re*z[1].im;
421
5.30M
    z[0].re = dc.re + tab[0].re*t[4].re;
422
5.30M
    z[0].im = dc.im + tab[0].re*t[4].im;
423
424
5.30M
    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
425
5.30M
    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
426
5.30M
    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
427
5.30M
    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
428
5.30M
    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
429
5.30M
    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
430
5.30M
    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
431
5.30M
    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
432
433
5.30M
    y[0].re = tab[0].im*t[5].re;
434
5.30M
    y[0].im = tab[0].im*t[5].im;
435
5.30M
#endif
436
437
5.30M
    x[4].re = x[1].re + x[2].re;
438
5.30M
    x[4].im = x[1].im + x[2].im;
439
440
5.30M
    y[4].re = y[1].re - y[2].re;
441
5.30M
    y[4].im = y[1].im - y[2].im;
442
5.30M
    x[1].re = z[0].re + x[1].re;
443
5.30M
    x[1].im = z[0].im + x[1].im;
444
5.30M
    y[1].re = y[0].re + y[1].re;
445
5.30M
    y[1].im = y[0].im + y[1].im;
446
5.30M
    x[2].re = z[0].re + x[2].re;
447
5.30M
    x[2].im = z[0].im + x[2].im;
448
5.30M
    y[2].re = y[2].re - y[0].re;
449
5.30M
    y[2].im = y[2].im - y[0].im;
450
5.30M
    x[4].re = z[0].re - x[4].re;
451
5.30M
    x[4].im = z[0].im - x[4].im;
452
5.30M
    y[4].re = y[0].re - y[4].re;
453
5.30M
    y[4].im = y[0].im - y[4].im;
454
455
5.30M
    out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
456
5.30M
    out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
457
5.30M
    out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
458
5.30M
    out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
459
5.30M
    out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
460
5.30M
    out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
461
5.30M
    out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
462
5.30M
    out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
463
5.30M
}
Unexecuted instantiation: tx_int32.c:fft9
464
465
static av_always_inline void fft15(TXComplex *out, TXComplex *in,
466
                                   ptrdiff_t stride)
467
4.38M
{
468
4.38M
    TXComplex tmp[15];
469
470
26.3M
    for (int i = 0; i < 5; i++)
471
21.9M
        fft3(tmp + i, in + i*3, 5);
472
473
4.38M
    fft5_m1(out, tmp +  0, stride);
474
4.38M
    fft5_m2(out, tmp +  5, stride);
475
4.38M
    fft5_m3(out, tmp + 10, stride);
476
4.38M
}
Unexecuted instantiation: tx_double.c:fft15
tx_float.c:fft15
Line
Count
Source
467
4.17M
{
468
4.17M
    TXComplex tmp[15];
469
470
25.0M
    for (int i = 0; i < 5; i++)
471
20.8M
        fft3(tmp + i, in + i*3, 5);
472
473
4.17M
    fft5_m1(out, tmp +  0, stride);
474
4.17M
    fft5_m2(out, tmp +  5, stride);
475
4.17M
    fft5_m3(out, tmp + 10, stride);
476
4.17M
}
tx_int32.c:fft15
Line
Count
Source
467
213k
{
468
213k
    TXComplex tmp[15];
469
470
1.28M
    for (int i = 0; i < 5; i++)
471
1.06M
        fft3(tmp + i, in + i*3, 5);
472
473
213k
    fft5_m1(out, tmp +  0, stride);
474
213k
    fft5_m2(out, tmp +  5, stride);
475
213k
    fft5_m3(out, tmp + 10, stride);
476
213k
}
477
478
static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s,
479
                                                  const FFTXCodelet *cd,
480
                                                  uint64_t flags,
481
                                                  FFTXCodeletOptions *opts,
482
                                                  int len, int inv,
483
                                                  const void *scale)
484
2.37k
{
485
2.37k
    int ret = 0;
486
2.37k
    TX_TAB(ff_tx_init_tabs)(len);
487
488
2.37k
    if (len == 15)
489
0
        ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
490
2.37k
    else if (flags & FF_TX_PRESHUFFLE)
491
2.37k
        ret = ff_tx_gen_default_map(s, opts);
492
493
2.37k
    return ret;
494
2.37k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_factor_init_double_c
tx_float.c:ff_tx_fft_factor_init_float_c
Line
Count
Source
484
2.37k
{
485
2.37k
    int ret = 0;
486
2.37k
    TX_TAB(ff_tx_init_tabs)(len);
487
488
2.37k
    if (len == 15)
489
0
        ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
490
2.37k
    else if (flags & FF_TX_PRESHUFFLE)
491
2.37k
        ret = ff_tx_gen_default_map(s, opts);
492
493
2.37k
    return ret;
494
2.37k
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_factor_init_int32_c
495
496
#define DECL_FACTOR_S(n)                                                       \
497
static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst,                   \
498
21.9M
                                  void *src, ptrdiff_t stride)                 \
499
21.9M
{                                                                              \
500
21.9M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
21.9M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_fft3_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft5_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft7_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft9_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft15_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft3_float_c
tx_float.c:ff_tx_fft5_float_c
Line
Count
Source
498
9.84M
                                  void *src, ptrdiff_t stride)                 \
499
9.84M
{                                                                              \
500
9.84M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
9.84M
}                                                                              \
tx_float.c:ff_tx_fft7_float_c
Line
Count
Source
498
6.81M
                                  void *src, ptrdiff_t stride)                 \
499
6.81M
{                                                                              \
500
6.81M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
6.81M
}                                                                              \
tx_float.c:ff_tx_fft9_float_c
Line
Count
Source
498
5.30M
                                  void *src, ptrdiff_t stride)                 \
499
5.30M
{                                                                              \
500
5.30M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
5.30M
}                                                                              \
Unexecuted instantiation: tx_float.c:ff_tx_fft15_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft3_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft5_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft7_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft9_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft15_int32_c
502
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {                    \
503
    .name       = TX_NAME_STR("fft" #n "_ns"),                                 \
504
    .function   = TX_NAME(ff_tx_fft##n),                                       \
505
    .type       = TX_TYPE(FFT),                                                \
506
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
507
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,                          \
508
    .factors[0] = n,                                                           \
509
    .nb_factors = 1,                                                           \
510
    .min_len    = n,                                                           \
511
    .max_len    = n,                                                           \
512
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
513
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
514
    .prio       = FF_TX_PRIO_BASE,                                             \
515
};
516
517
#define DECL_FACTOR_F(n)                                                       \
518
DECL_FACTOR_S(n)                                                               \
519
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = {                   \
520
    .name       = TX_NAME_STR("fft" #n "_fwd"),                                \
521
    .function   = TX_NAME(ff_tx_fft##n),                                       \
522
    .type       = TX_TYPE(FFT),                                                \
523
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
524
                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY,                        \
525
    .factors[0] = n,                                                           \
526
    .nb_factors = 1,                                                           \
527
    .min_len    = n,                                                           \
528
    .max_len    = n,                                                           \
529
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
530
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
531
    .prio       = FF_TX_PRIO_BASE,                                             \
532
};
533
534
DECL_FACTOR_F(3)
535
DECL_FACTOR_F(5)
536
DECL_FACTOR_F(7)
537
DECL_FACTOR_F(9)
538
DECL_FACTOR_S(15)
539
540
#define BUTTERFLIES(a0, a1, a2, a3)            \
541
7.37G
    do {                                       \
542
7.37G
        r0=a0.re;                              \
543
7.37G
        i0=a0.im;                              \
544
7.37G
        r1=a1.re;                              \
545
7.37G
        i1=a1.im;                              \
546
7.37G
        BF(t3, t5, t5, t1);                    \
547
7.37G
        BF(a2.re, a0.re, r0, t5);              \
548
7.37G
        BF(a3.im, a1.im, i1, t3);              \
549
7.37G
        BF(t4, t6, t2, t6);                    \
550
7.37G
        BF(a3.re, a1.re, r1, t4);              \
551
7.37G
        BF(a2.im, a0.im, i0, t6);              \
552
7.37G
    } while (0)
553
554
#define TRANSFORM(a0, a1, a2, a3, wre, wim)    \
555
6.17G
    do {                                       \
556
6.17G
        CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
557
6.17G
        CMUL(t5, t6, a3.re, a3.im, wre,  wim); \
558
6.17G
        BUTTERFLIES(a0, a1, a2, a3);           \
559
6.17G
    } while (0)
560
561
/* z[0...8n-1], w[1...2n-1] */
562
static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
563
                                                 const TXSample *cos, int len)
564
311M
{
565
311M
    int o1 = 2*len;
566
311M
    int o2 = 4*len;
567
311M
    int o3 = 6*len;
568
311M
    const TXSample *wim = cos + o1 - 7;
569
311M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
570
571
847M
    for (int i = 0; i < len; i += 4) {
572
535M
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
573
535M
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
574
535M
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
575
535M
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
576
577
535M
        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
578
535M
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
579
535M
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
580
535M
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
581
582
535M
        z   += 2*4;
583
535M
        cos += 2*4;
584
535M
        wim -= 2*4;
585
535M
    }
586
311M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_sr_combine_double_c
tx_float.c:ff_tx_fft_sr_combine_float_c
Line
Count
Source
564
242M
{
565
242M
    int o1 = 2*len;
566
242M
    int o2 = 4*len;
567
242M
    int o3 = 6*len;
568
242M
    const TXSample *wim = cos + o1 - 7;
569
242M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
570
571
680M
    for (int i = 0; i < len; i += 4) {
572
437M
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
573
437M
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
574
437M
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
575
437M
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
576
577
437M
        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
578
437M
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
579
437M
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
580
437M
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
581
582
437M
        z   += 2*4;
583
437M
        cos += 2*4;
584
437M
        wim -= 2*4;
585
437M
    }
586
242M
}
tx_int32.c:ff_tx_fft_sr_combine_int32_c
Line
Count
Source
564
69.0M
{
565
69.0M
    int o1 = 2*len;
566
69.0M
    int o2 = 4*len;
567
69.0M
    int o3 = 6*len;
568
69.0M
    const TXSample *wim = cos + o1 - 7;
569
69.0M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
570
571
167M
    for (int i = 0; i < len; i += 4) {
572
98.1M
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
573
98.1M
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
574
98.1M
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
575
98.1M
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
576
577
98.1M
        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
578
98.1M
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
579
98.1M
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
580
98.1M
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
581
582
98.1M
        z   += 2*4;
583
98.1M
        cos += 2*4;
584
98.1M
        wim -= 2*4;
585
98.1M
    }
586
69.0M
}
587
588
static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s,
589
                                                      const FFTXCodelet *cd,
590
                                                      uint64_t flags,
591
                                                      FFTXCodeletOptions *opts,
592
                                                      int len, int inv,
593
                                                      const void *scale)
594
990k
{
595
990k
    TX_TAB(ff_tx_init_tabs)(len);
596
990k
    return ff_tx_gen_ptwo_revtab(s, opts);
597
990k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_sr_codelet_init_double_c
tx_float.c:ff_tx_fft_sr_codelet_init_float_c
Line
Count
Source
594
843k
{
595
843k
    TX_TAB(ff_tx_init_tabs)(len);
596
843k
    return ff_tx_gen_ptwo_revtab(s, opts);
597
843k
}
tx_int32.c:ff_tx_fft_sr_codelet_init_int32_c
Line
Count
Source
594
146k
{
595
146k
    TX_TAB(ff_tx_init_tabs)(len);
596
146k
    return ff_tx_gen_ptwo_revtab(s, opts);
597
146k
}
598
599
#define DECL_SR_CODELET_DEF(n)                              \
600
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
601
    .name       = TX_NAME_STR("fft" #n "_ns"),              \
602
    .function   = TX_NAME(ff_tx_fft##n##_ns),               \
603
    .type       = TX_TYPE(FFT),                             \
604
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |      \
605
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,       \
606
    .factors[0] = 2,                                        \
607
    .nb_factors = 1,                                        \
608
    .min_len    = n,                                        \
609
    .max_len    = n,                                        \
610
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),       \
611
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                      \
612
    .prio       = FF_TX_PRIO_BASE,                          \
613
};
614
615
#define DECL_SR_CODELET(n, n2, n4)                                    \
616
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,    \
617
311M
                                        void *_src, ptrdiff_t stride) \
618
311M
{                                                                     \
619
311M
    TXComplex *src = _src;                                            \
620
311M
    TXComplex *dst = _dst;                                            \
621
311M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
311M
                                                                      \
623
311M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
311M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
311M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
311M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
311M
}                                                                     \
Unexecuted instantiation: tx_double.c:ff_tx_fft32_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft64_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft128_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft256_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft512_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft1024_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft2048_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft4096_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft8192_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft16384_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft32768_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft65536_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft131072_ns_double_c
tx_float.c:ff_tx_fft32_ns_float_c
Line
Count
Source
617
162M
                                        void *_src, ptrdiff_t stride) \
618
162M
{                                                                     \
619
162M
    TXComplex *src = _src;                                            \
620
162M
    TXComplex *dst = _dst;                                            \
621
162M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
162M
                                                                      \
623
162M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
162M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
162M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
162M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
162M
}                                                                     \
tx_float.c:ff_tx_fft64_ns_float_c
Line
Count
Source
617
50.2M
                                        void *_src, ptrdiff_t stride) \
618
50.2M
{                                                                     \
619
50.2M
    TXComplex *src = _src;                                            \
620
50.2M
    TXComplex *dst = _dst;                                            \
621
50.2M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
50.2M
                                                                      \
623
50.2M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
50.2M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
50.2M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
50.2M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
50.2M
}                                                                     \
tx_float.c:ff_tx_fft128_ns_float_c
Line
Count
Source
617
22.8M
                                        void *_src, ptrdiff_t stride) \
618
22.8M
{                                                                     \
619
22.8M
    TXComplex *src = _src;                                            \
620
22.8M
    TXComplex *dst = _dst;                                            \
621
22.8M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
22.8M
                                                                      \
623
22.8M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
22.8M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
22.8M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
22.8M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
22.8M
}                                                                     \
tx_float.c:ff_tx_fft256_ns_float_c
Line
Count
Source
617
5.24M
                                        void *_src, ptrdiff_t stride) \
618
5.24M
{                                                                     \
619
5.24M
    TXComplex *src = _src;                                            \
620
5.24M
    TXComplex *dst = _dst;                                            \
621
5.24M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
5.24M
                                                                      \
623
5.24M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
5.24M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
5.24M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
5.24M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
5.24M
}                                                                     \
tx_float.c:ff_tx_fft512_ns_float_c
Line
Count
Source
617
1.55M
                                        void *_src, ptrdiff_t stride) \
618
1.55M
{                                                                     \
619
1.55M
    TXComplex *src = _src;                                            \
620
1.55M
    TXComplex *dst = _dst;                                            \
621
1.55M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
1.55M
                                                                      \
623
1.55M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
1.55M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
1.55M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
1.55M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
1.55M
}                                                                     \
tx_float.c:ff_tx_fft1024_ns_float_c
Line
Count
Source
617
303k
                                        void *_src, ptrdiff_t stride) \
618
303k
{                                                                     \
619
303k
    TXComplex *src = _src;                                            \
620
303k
    TXComplex *dst = _dst;                                            \
621
303k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
303k
                                                                      \
623
303k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
303k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
303k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
303k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
303k
}                                                                     \
tx_float.c:ff_tx_fft2048_ns_float_c
Line
Count
Source
617
39.7k
                                        void *_src, ptrdiff_t stride) \
618
39.7k
{                                                                     \
619
39.7k
    TXComplex *src = _src;                                            \
620
39.7k
    TXComplex *dst = _dst;                                            \
621
39.7k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
39.7k
                                                                      \
623
39.7k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
39.7k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
39.7k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
39.7k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
39.7k
}                                                                     \
tx_float.c:ff_tx_fft4096_ns_float_c
Line
Count
Source
617
28.6k
                                        void *_src, ptrdiff_t stride) \
618
28.6k
{                                                                     \
619
28.6k
    TXComplex *src = _src;                                            \
620
28.6k
    TXComplex *dst = _dst;                                            \
621
28.6k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
28.6k
                                                                      \
623
28.6k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
28.6k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
28.6k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
28.6k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
28.6k
}                                                                     \
Unexecuted instantiation: tx_float.c:ff_tx_fft8192_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft16384_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft32768_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft65536_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft131072_ns_float_c
tx_int32.c:ff_tx_fft32_ns_int32_c
Line
Count
Source
617
58.5M
                                        void *_src, ptrdiff_t stride) \
618
58.5M
{                                                                     \
619
58.5M
    TXComplex *src = _src;                                            \
620
58.5M
    TXComplex *dst = _dst;                                            \
621
58.5M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
58.5M
                                                                      \
623
58.5M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
58.5M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
58.5M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
58.5M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
58.5M
}                                                                     \
tx_int32.c:ff_tx_fft64_ns_int32_c
Line
Count
Source
617
5.92M
                                        void *_src, ptrdiff_t stride) \
618
5.92M
{                                                                     \
619
5.92M
    TXComplex *src = _src;                                            \
620
5.92M
    TXComplex *dst = _dst;                                            \
621
5.92M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
5.92M
                                                                      \
623
5.92M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
5.92M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
5.92M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
5.92M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
5.92M
}                                                                     \
tx_int32.c:ff_tx_fft128_ns_int32_c
Line
Count
Source
617
3.45M
                                        void *_src, ptrdiff_t stride) \
618
3.45M
{                                                                     \
619
3.45M
    TXComplex *src = _src;                                            \
620
3.45M
    TXComplex *dst = _dst;                                            \
621
3.45M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
3.45M
                                                                      \
623
3.45M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
3.45M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
3.45M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
3.45M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
3.45M
}                                                                     \
tx_int32.c:ff_tx_fft256_ns_int32_c
Line
Count
Source
617
602k
                                        void *_src, ptrdiff_t stride) \
618
602k
{                                                                     \
619
602k
    TXComplex *src = _src;                                            \
620
602k
    TXComplex *dst = _dst;                                            \
621
602k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
602k
                                                                      \
623
602k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
602k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
602k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
602k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
602k
}                                                                     \
tx_int32.c:ff_tx_fft512_ns_int32_c
Line
Count
Source
617
570k
                                        void *_src, ptrdiff_t stride) \
618
570k
{                                                                     \
619
570k
    TXComplex *src = _src;                                            \
620
570k
    TXComplex *dst = _dst;                                            \
621
570k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
570k
                                                                      \
623
570k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
570k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
570k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
570k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
570k
}                                                                     \
Unexecuted instantiation: tx_int32.c:ff_tx_fft1024_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft2048_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft4096_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft8192_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft16384_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft32768_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft65536_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft131072_ns_int32_c
628
                                                                      \
629
DECL_SR_CODELET_DEF(n)
630
631
static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
632
                                   void *_src, ptrdiff_t stride)
633
0
{
634
0
    TXComplex *src = _src;
635
0
    TXComplex *dst = _dst;
636
0
    TXComplex tmp;
637
638
0
    BF(tmp.re, dst[0].re, src[0].re, src[1].re);
639
0
    BF(tmp.im, dst[0].im, src[0].im, src[1].im);
640
0
    dst[1] = tmp;
641
0
}
Unexecuted instantiation: tx_double.c:ff_tx_fft2_ns_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft2_ns_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft2_ns_int32_c
642
643
static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
644
                                   void *_src, ptrdiff_t stride)
645
1.54G
{
646
1.54G
    TXComplex *src = _src;
647
1.54G
    TXComplex *dst = _dst;
648
1.54G
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
649
650
1.54G
    BF(t3, t1, src[0].re, src[1].re);
651
1.54G
    BF(t8, t6, src[3].re, src[2].re);
652
1.54G
    BF(dst[2].re, dst[0].re, t1, t6);
653
1.54G
    BF(t4, t2, src[0].im, src[1].im);
654
1.54G
    BF(t7, t5, src[2].im, src[3].im);
655
1.54G
    BF(dst[3].im, dst[1].im, t4, t8);
656
1.54G
    BF(dst[3].re, dst[1].re, t3, t7);
657
1.54G
    BF(dst[2].im, dst[0].im, t2, t5);
658
1.54G
}
Unexecuted instantiation: tx_double.c:ff_tx_fft4_ns_double_c
tx_float.c:ff_tx_fft4_ns_float_c
Line
Count
Source
645
1.21G
{
646
1.21G
    TXComplex *src = _src;
647
1.21G
    TXComplex *dst = _dst;
648
1.21G
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
649
650
1.21G
    BF(t3, t1, src[0].re, src[1].re);
651
1.21G
    BF(t8, t6, src[3].re, src[2].re);
652
1.21G
    BF(dst[2].re, dst[0].re, t1, t6);
653
1.21G
    BF(t4, t2, src[0].im, src[1].im);
654
1.21G
    BF(t7, t5, src[2].im, src[3].im);
655
1.21G
    BF(dst[3].im, dst[1].im, t4, t8);
656
1.21G
    BF(dst[3].re, dst[1].re, t3, t7);
657
1.21G
    BF(dst[2].im, dst[0].im, t2, t5);
658
1.21G
}
tx_int32.c:ff_tx_fft4_ns_int32_c
Line
Count
Source
645
328M
{
646
328M
    TXComplex *src = _src;
647
328M
    TXComplex *dst = _dst;
648
328M
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
649
650
328M
    BF(t3, t1, src[0].re, src[1].re);
651
328M
    BF(t8, t6, src[3].re, src[2].re);
652
328M
    BF(dst[2].re, dst[0].re, t1, t6);
653
328M
    BF(t4, t2, src[0].im, src[1].im);
654
328M
    BF(t7, t5, src[2].im, src[3].im);
655
328M
    BF(dst[3].im, dst[1].im, t4, t8);
656
328M
    BF(dst[3].re, dst[1].re, t3, t7);
657
328M
    BF(dst[2].im, dst[0].im, t2, t5);
658
328M
}
659
660
static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
661
                                   void *_src, ptrdiff_t stride)
662
844M
{
663
844M
    TXComplex *src = _src;
664
844M
    TXComplex *dst = _dst;
665
844M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
666
844M
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
667
668
844M
    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
669
670
844M
    BF(t1, dst[5].re, src[4].re, -src[5].re);
671
844M
    BF(t2, dst[5].im, src[4].im, -src[5].im);
672
844M
    BF(t5, dst[7].re, src[6].re, -src[7].re);
673
844M
    BF(t6, dst[7].im, src[6].im, -src[7].im);
674
675
844M
    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
676
844M
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
677
844M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft8_ns_double_c
tx_float.c:ff_tx_fft8_ns_float_c
Line
Count
Source
662
657M
{
663
657M
    TXComplex *src = _src;
664
657M
    TXComplex *dst = _dst;
665
657M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
666
657M
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
667
668
657M
    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
669
670
657M
    BF(t1, dst[5].re, src[4].re, -src[5].re);
671
657M
    BF(t2, dst[5].im, src[4].im, -src[5].im);
672
657M
    BF(t5, dst[7].re, src[6].re, -src[7].re);
673
657M
    BF(t6, dst[7].im, src[6].im, -src[7].im);
674
675
657M
    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
676
657M
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
677
657M
}
tx_int32.c:ff_tx_fft8_ns_int32_c
Line
Count
Source
662
187M
{
663
187M
    TXComplex *src = _src;
664
187M
    TXComplex *dst = _dst;
665
187M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
666
187M
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
667
668
187M
    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
669
670
187M
    BF(t1, dst[5].re, src[4].re, -src[5].re);
671
187M
    BF(t2, dst[5].im, src[4].im, -src[5].im);
672
187M
    BF(t5, dst[7].re, src[6].re, -src[7].re);
673
187M
    BF(t6, dst[7].im, src[6].im, -src[7].im);
674
675
187M
    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
676
187M
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
677
187M
}
678
679
/*
 * 16-point transform assembled from smaller ones: ff_tx_fft8_ns is run on
 * dst/src offset 0 and ff_tx_fft4_ns on offsets 8 and 12, after which one
 * BUTTERFLIES pass and three TRANSFORM passes combine the results using
 * entries 1..3 of the ff_tx_tab_16 table.
 *
 * NOTE(review): t3, t4, r0, i0, r1 and i1 are never named directly in this
 * body; presumably they are scratch variables used inside the BUTTERFLIES and
 * TRANSFORM macros (defined outside this excerpt) - confirm before renaming
 * or removing any of them.
 */
static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
680
                                    void *_src, ptrdiff_t stride)
681
349M
{
682
349M
    TXComplex *src = _src;
683
349M
    TXComplex *dst = _dst;
684
349M
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
685
686
349M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
687
349M
    TXSample cos_16_1 = cos[1];
688
349M
    TXSample cos_16_2 = cos[2];
689
349M
    TXSample cos_16_3 = cos[3];
690
691
349M
    /* Sub-transforms: one on elements starting at 0, two starting at 8 and 12. */
    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
692
349M
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
693
349M
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
694
695
349M
    /* Seed t1/t2/t5/t6 from dst[8] and dst[12] before the first combine pass. */
    t1 = dst[ 8].re;
696
349M
    t2 = dst[ 8].im;
697
349M
    t5 = dst[12].re;
698
349M
    t6 = dst[12].im;
699
349M
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
700
701
349M
    /* Note the swapped table entries in the last two calls (1,3) vs (3,1). */
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
702
349M
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
703
349M
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
704
349M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft16_ns_double_c
tx_float.c:ff_tx_fft16_ns_float_c
Line
Count
Source
681
279M
{
682
279M
    TXComplex *src = _src;
683
279M
    TXComplex *dst = _dst;
684
279M
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
685
686
279M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
687
279M
    TXSample cos_16_1 = cos[1];
688
279M
    TXSample cos_16_2 = cos[2];
689
279M
    TXSample cos_16_3 = cos[3];
690
691
279M
    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
692
279M
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
693
279M
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
694
695
279M
    t1 = dst[ 8].re;
696
279M
    t2 = dst[ 8].im;
697
279M
    t5 = dst[12].re;
698
279M
    t6 = dst[12].im;
699
279M
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
700
701
279M
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
702
279M
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
703
279M
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
704
279M
}
tx_int32.c:ff_tx_fft16_ns_int32_c
Line
Count
Source
681
70.4M
{
682
70.4M
    TXComplex *src = _src;
683
70.4M
    TXComplex *dst = _dst;
684
70.4M
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
685
686
70.4M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
687
70.4M
    TXSample cos_16_1 = cos[1];
688
70.4M
    TXSample cos_16_2 = cos[2];
689
70.4M
    TXSample cos_16_3 = cos[3];
690
691
70.4M
    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
692
70.4M
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
693
70.4M
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
694
695
70.4M
    t1 = dst[ 8].re;
696
70.4M
    t2 = dst[ 8].im;
697
70.4M
    t5 = dst[12].re;
698
70.4M
    t6 = dst[12].im;
699
70.4M
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
700
701
70.4M
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
702
70.4M
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
703
70.4M
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
704
70.4M
}
705
706
/*
 * Macro-generated declarations for every power-of-two length from 2 up to
 * 131072. Lengths 2..16 use DECL_SR_CODELET_DEF(n); lengths 32 and above use
 * DECL_SR_CODELET(n, n2, n4), where in every invocation below n2 == n/2 and
 * n4 == n/4.
 * NOTE(review): both macros are defined outside this excerpt; presumably the
 * three-argument form also emits the transform function for length n built
 * from the n/2 and n/4 ones - confirm against the macro definitions.
 */
DECL_SR_CODELET_DEF(2)
707
DECL_SR_CODELET_DEF(4)
708
DECL_SR_CODELET_DEF(8)
709
DECL_SR_CODELET_DEF(16)
710
DECL_SR_CODELET(32,16,8)
711
DECL_SR_CODELET(64,32,16)
712
DECL_SR_CODELET(128,64,32)
713
DECL_SR_CODELET(256,128,64)
714
DECL_SR_CODELET(512,256,128)
715
DECL_SR_CODELET(1024,512,256)
716
DECL_SR_CODELET(2048,1024,512)
717
DECL_SR_CODELET(4096,2048,1024)
718
DECL_SR_CODELET(8192,4096,2048)
719
DECL_SR_CODELET(16384,8192,4096)
720
DECL_SR_CODELET(32768,16384,8192)
721
DECL_SR_CODELET(65536,32768,16384)
722
DECL_SR_CODELET(131072,65536,32768)
723
724
static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s,
725
                                           const FFTXCodelet *cd,
726
                                           uint64_t flags,
727
                                           FFTXCodeletOptions *opts,
728
                                           int len, int inv,
729
                                           const void *scale)
730
10.9k
{
731
10.9k
    int ret;
732
10.9k
    int is_inplace = !!(flags & AV_TX_INPLACE);
733
10.9k
    FFTXCodeletOptions sub_opts = {
734
10.9k
        .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
735
10.9k
    };
736
737
10.9k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
738
10.9k
    flags |=  AV_TX_INPLACE;      /* in-place */
739
10.9k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
740
741
10.9k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
742
2.37k
        return ret;
743
744
8.55k
    if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
745
0
        return ret;
746
747
8.55k
    return 0;
748
8.55k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_init_double_c
tx_float.c:ff_tx_fft_init_float_c
Line
Count
Source
730
10.9k
{
731
10.9k
    int ret;
732
10.9k
    int is_inplace = !!(flags & AV_TX_INPLACE);
733
10.9k
    FFTXCodeletOptions sub_opts = {
734
10.9k
        .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
735
10.9k
    };
736
737
10.9k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
738
10.9k
    flags |=  AV_TX_INPLACE;      /* in-place */
739
10.9k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
740
741
10.9k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
742
2.37k
        return ret;
743
744
8.55k
    if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
745
0
        return ret;
746
747
8.55k
    return 0;
748
8.55k
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_init_int32_c
749
750
static av_cold int TX_NAME(ff_tx_fft_inplace_small_init)(AVTXContext *s,
751
                                                         const FFTXCodelet *cd,
752
                                                         uint64_t flags,
753
                                                         FFTXCodeletOptions *opts,
754
                                                         int len, int inv,
755
                                                         const void *scale)
756
792
{
757
792
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
758
0
        return AVERROR(ENOMEM);
759
792
    flags &= ~AV_TX_INPLACE;
760
792
    return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
761
792
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_inplace_small_init_double_c
tx_float.c:ff_tx_fft_inplace_small_init_float_c
Line
Count
Source
756
792
{
757
792
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
758
0
        return AVERROR(ENOMEM);
759
792
    flags &= ~AV_TX_INPLACE;
760
792
    return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
761
792
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_inplace_small_init_int32_c
762
763
/*
 * Gathers the input through the sub-transform's lookup map into a staging
 * buffer, then runs the sub-transform from that buffer into _dst.
 *
 * The staging buffer is s->tmp when AV_TX_INPLACE is set in s->flags, and the
 * destination itself otherwise.
 */
static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
                               void *_src, ptrdiff_t stride)
{
    TXComplex *in = _src;
    TXComplex *out = _dst;
    TXComplex *staging = out;
    const int *lut = s->sub[0].map;
    const int count = s->len;
    int i = 0;

    if (s->flags & AV_TX_INPLACE)
        staging = s->tmp;

    /* Compilers can't vectorize this anyway without assuming AVX2, which they
     * generally don't, at least without -march=native -mtune=native */
    while (i < count) {
        staging[i] = in[lut[i]];
        i++;
    }

    s->fn[0](&s->sub[0], out, staging, stride);
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_double_c
tx_float.c:ff_tx_fft_float_c
Line
Count
Source
765
20.1M
{
766
20.1M
    TXComplex *src = _src;
767
20.1M
    TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
768
20.1M
    TXComplex *dst2 = _dst;
769
20.1M
    int *map = s->sub[0].map;
770
20.1M
    int len = s->len;
771
772
    /* Compilers can't vectorize this anyway without assuming AVX2, which they
773
     * generally don't, at least without -march=native -mtune=native */
774
2.35G
    for (int i = 0; i < len; i++)
775
2.33G
        dst1[i] = src[map[i]];
776
777
20.1M
    s->fn[0](&s->sub[0], dst2, dst1, stride);
778
20.1M
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_int32_c
779
780
/*
 * Permutes the input buffer in place and then runs the sub-transform on it.
 *
 * s->map holds a list of cycle starting indices; the first entry is always
 * processed and the list ends at the first subsequent zero entry. For each
 * start, the cycle defined by s->sub->map is walked, rotating the elements
 * through a single carried value.
 */
static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
                                       void *_src, ptrdiff_t stride)
{
    TXComplex *buf = _src;
    TXComplex *out = _dst;
    const int *perm = s->sub->map;
    const int *next_start = s->map;
    int head = *next_start++;

    do {
        TXComplex carry = buf[head];
        int pos = perm[head];

        /* Walk the cycle until it closes back on its starting index. */
        do {
            FFSWAP(TXComplex, carry, buf[pos]);
            pos = perm[pos];
        } while (pos != head); /* Can be > as well, but was less predictable */

        buf[pos] = carry;

        head = *next_start++;
    } while (head);

    s->fn[0](&s->sub[0], out, buf, stride);
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_inplace_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft_inplace_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft_inplace_int32_c
803
804
static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
805
    .name       = TX_NAME_STR("fft"),
806
    .function   = TX_NAME(ff_tx_fft),
807
    .type       = TX_TYPE(FFT),
808
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
809
    .factors[0] = TX_FACTOR_ANY,
810
    .nb_factors = 1,
811
    .min_len    = 2,
812
    .max_len    = TX_LEN_UNLIMITED,
813
    .init       = TX_NAME(ff_tx_fft_init),
814
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
815
    .prio       = FF_TX_PRIO_BASE,
816
};
817
818
static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
819
    .name       = TX_NAME_STR("fft_inplace_small"),
820
    .function   = TX_NAME(ff_tx_fft),
821
    .type       = TX_TYPE(FFT),
822
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
823
    .factors[0] = TX_FACTOR_ANY,
824
    .nb_factors = 1,
825
    .min_len    = 2,
826
    .max_len    = 65536,
827
    .init       = TX_NAME(ff_tx_fft_inplace_small_init),
828
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
829
    .prio       = FF_TX_PRIO_BASE - 256,
830
};
831
832
static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
833
    .name       = TX_NAME_STR("fft_inplace"),
834
    .function   = TX_NAME(ff_tx_fft_inplace),
835
    .type       = TX_TYPE(FFT),
836
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
837
    .factors[0] = TX_FACTOR_ANY,
838
    .nb_factors = 1,
839
    .min_len    = 2,
840
    .max_len    = TX_LEN_UNLIMITED,
841
    .init       = TX_NAME(ff_tx_fft_init),
842
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
843
    .prio       = FF_TX_PRIO_BASE - 512,
844
};
845
846
static av_cold int TX_NAME(ff_tx_fft_init_naive_small)(AVTXContext *s,
847
                                                       const FFTXCodelet *cd,
848
                                                       uint64_t flags,
849
                                                       FFTXCodeletOptions *opts,
850
                                                       int len, int inv,
851
                                                       const void *scale)
852
792
{
853
792
    const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
854
855
792
    if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
856
0
        return AVERROR(ENOMEM);
857
858
11.0k
    for (int i = 0; i < len; i++) {
859
144k
        for (int j = 0; j < len; j++) {
860
133k
            const double factor = phase*i*j;
861
133k
            s->exp[i*j] = (TXComplex){
862
133k
                RESCALE(cos(factor)),
863
133k
                RESCALE(sin(factor)),
864
133k
            };
865
133k
        }
866
10.2k
    }
867
868
792
    return 0;
869
792
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_init_naive_small_double_c
tx_float.c:ff_tx_fft_init_naive_small_float_c
Line
Count
Source
852
792
{
853
792
    const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
854
855
792
    if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
856
0
        return AVERROR(ENOMEM);
857
858
11.0k
    for (int i = 0; i < len; i++) {
859
144k
        for (int j = 0; j < len; j++) {
860
133k
            const double factor = phase*i*j;
861
133k
            s->exp[i*j] = (TXComplex){
862
133k
                RESCALE(cos(factor)),
863
133k
                RESCALE(sin(factor)),
864
133k
            };
865
133k
        }
866
10.2k
    }
867
868
792
    return 0;
869
792
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_init_naive_small_int32_c
870
871
/*
 * Direct O(n^2) DFT: every output element is the sum over all inputs of
 * src[j] multiplied by a twiddle computed on the fly from cos/sin.
 *
 * The sign of the phase follows s->inv. The input is read contiguously; the
 * output is written with the given stride, which is supplied in bytes and
 * converted to elements here.
 */
static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
                                     ptrdiff_t stride)
{
    TXComplex *in = _src;
    TXComplex *out = _dst;
    const int count = s->len;
    const double phase = (s->inv ? 2.0 : -2.0)*M_PI/count;
    const ptrdiff_t out_step = stride / sizeof(*out);

    for (int k = 0; k < count; k++) {
        TXComplex acc = { 0 };

        for (int t = 0; t < count; t++) {
            const double angle = phase*k*t;
            const TXComplex w = {
                RESCALE(cos(angle)),
                RESCALE(sin(angle)),
            };
            TXComplex term;

            CMUL3(term, in[t], w);
            acc.re += term.re;
            acc.im += term.im;
        }

        out[k*out_step] = acc;
    }
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_naive_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft_naive_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft_naive_int32_c
897
898
/*
 * Direct O(n^2) DFT using a precomputed twiddle table: the factor for output
 * i and input j is read from s->exp[i*j].
 *
 * The input is read contiguously; the output is written with the given
 * stride, which is supplied in bytes and converted to elements here.
 */
static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
                                           ptrdiff_t stride)
{
    TXComplex *in = _src;
    TXComplex *out = _dst;
    const int count = s->len;
    const ptrdiff_t out_step = stride / sizeof(*out);

    for (int k = 0; k < count; k++) {
        TXComplex acc = { 0 };

        for (int t = 0; t < count; t++) {
            const TXComplex w = s->exp[k*t];
            TXComplex term;

            CMUL3(term, in[t], w);
            acc.re += term.re;
            acc.im += term.im;
        }

        out[k*out_step] = acc;
    }
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_naive_small_double_c
tx_float.c:ff_tx_fft_naive_small_float_c
Line
Count
Source
900
3.78M
{
901
3.78M
    TXComplex *src = _src;
902
3.78M
    TXComplex *dst = _dst;
903
3.78M
    const int n = s->len;
904
905
3.78M
    stride /= sizeof(*dst);
906
907
53.0M
    for (int i = 0; i < n; i++) {
908
49.2M
        TXComplex tmp = { 0 };
909
689M
        for (int j = 0; j < n; j++) {
910
640M
            TXComplex res;
911
640M
            const TXComplex mult = s->exp[i*j];
912
640M
            CMUL3(res, src[j], mult);
913
640M
            tmp.re += res.re;
914
640M
            tmp.im += res.im;
915
640M
        }
916
49.2M
        dst[i*stride] = tmp;
917
49.2M
    }
918
3.78M
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_naive_small_int32_c
919
920
static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
921
    .name       = TX_NAME_STR("fft_naive_small"),
922
    .function   = TX_NAME(ff_tx_fft_naive_small),
923
    .type       = TX_TYPE(FFT),
924
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
925
    .factors[0] = TX_FACTOR_ANY,
926
    .nb_factors = 1,
927
    .min_len    = 2,
928
    .max_len    = 1024,
929
    .init       = TX_NAME(ff_tx_fft_init_naive_small),
930
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
931
    .prio       = FF_TX_PRIO_MIN/2,
932
};
933
934
static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
935
    .name       = TX_NAME_STR("fft_naive"),
936
    .function   = TX_NAME(ff_tx_fft_naive),
937
    .type       = TX_TYPE(FFT),
938
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
939
    .factors[0] = TX_FACTOR_ANY,
940
    .nb_factors = 1,
941
    .min_len    = 2,
942
    .max_len    = TX_LEN_UNLIMITED,
943
    .init       = NULL,
944
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
945
    .prio       = FF_TX_PRIO_MIN,
946
};
947
948
static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
949
                                               const FFTXCodelet *cd,
950
                                               uint64_t flags,
951
                                               FFTXCodeletOptions *opts,
952
                                               int len, int inv,
953
                                               const void *scale)
954
911k
{
955
911k
    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
956
911k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
957
911k
    size_t extra_tmp_len = 0;
958
911k
    int len_list[TX_MAX_DECOMPOSITIONS];
959
960
911k
    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
961
909k
        return ret;
962
963
    /* Two iterations to test both orderings. */
964
1.58k
    for (int i = 0; i < ret; i++) {
965
1.58k
        int len1 = len_list[i];
966
1.58k
        int len2 = len / len1;
967
968
        /* Our ptwo transforms don't support striding the output. */
969
1.58k
        if (len2 & (len2 - 1))
970
1.58k
            FFSWAP(int, len1, len2);
971
972
1.58k
        ff_tx_clear_ctx(s);
973
974
        /* First transform */
975
1.58k
        sub_opts.map_dir = FF_TX_MAP_GATHER;
976
1.58k
        flags &= ~AV_TX_INPLACE;
977
1.58k
        flags |=  FF_TX_OUT_OF_PLACE;
978
1.58k
        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
979
1.58k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
980
1.58k
                               len1, inv, scale);
981
982
1.58k
        if (ret == AVERROR(ENOMEM)) {
983
0
            return ret;
984
1.58k
        } else if (ret < 0) { /* Try again without a preshuffle flag */
985
792
            flags &= ~FF_TX_PRESHUFFLE;
986
792
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
987
792
                                   len1, inv, scale);
988
792
            if (ret == AVERROR(ENOMEM))
989
0
                return ret;
990
792
            else if (ret < 0)
991
0
                continue;
992
792
        }
993
994
        /* Second transform. */
995
1.58k
        sub_opts.map_dir = FF_TX_MAP_SCATTER;
996
1.58k
        flags |=  FF_TX_PRESHUFFLE;
997
1.58k
retry:
998
1.58k
        flags &= ~FF_TX_OUT_OF_PLACE;
999
1.58k
        flags |=  AV_TX_INPLACE;
1000
1.58k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1001
1.58k
                               len2, inv, scale);
1002
1003
1.58k
        if (ret == AVERROR(ENOMEM)) {
1004
0
            return ret;
1005
1.58k
        } else if (ret < 0) { /* Try again with an out-of-place transform */
1006
0
            flags |= FF_TX_OUT_OF_PLACE;
1007
0
            flags &= ~AV_TX_INPLACE;
1008
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009
0
                                   len2, inv, scale);
1010
0
            if (ret == AVERROR(ENOMEM)) {
1011
0
                return ret;
1012
0
            } else if (ret < 0) {
1013
0
                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1014
0
                    flags &= ~FF_TX_PRESHUFFLE;
1015
0
                    goto retry;
1016
0
                } else {
1017
0
                    continue;
1018
0
                }
1019
0
            }
1020
0
        }
1021
1022
        /* Success */
1023
1.58k
        break;
1024
1.58k
    }
1025
1026
    /* If nothing was successful, error out */
1027
1.58k
    if (ret < 0)
1028
0
        return ret;
1029
1030
    /* Generate PFA map */
1031
1.58k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1032
1.58k
                                          s->sub[0].len, s->sub[1].len)))
1033
0
        return ret;
1034
1035
1.58k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1036
0
        return AVERROR(ENOMEM);
1037
1038
    /* Flatten input map */
1039
1.58k
    tmp = (int *)s->tmp;
1040
12.6k
    for (int k = 0; k < len; k += s->sub[0].len) {
1041
11.0k
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1042
112k
        for (int i = 0; i < s->sub[0].len; i++)
1043
101k
            s->map[k + i] = tmp[s->sub[0].map[i]];
1044
11.0k
    }
1045
1046
    /* Only allocate extra temporary memory if we need it */
1047
1.58k
    if (!(s->sub[1].flags & AV_TX_INPLACE))
1048
0
        extra_tmp_len = len;
1049
1.58k
    else if (!ps)
1050
1.58k
        extra_tmp_len = s->sub[0].len;
1051
1052
1.58k
    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1053
0
        return AVERROR(ENOMEM);
1054
1055
1.58k
    return 0;
1056
1.58k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_pfa_init_double_c
tx_float.c:ff_tx_fft_pfa_init_float_c
Line
Count
Source
954
770k
{
955
770k
    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
956
770k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
957
770k
    size_t extra_tmp_len = 0;
958
770k
    int len_list[TX_MAX_DECOMPOSITIONS];
959
960
770k
    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
961
768k
        return ret;
962
963
    /* Two iterations to test both orderings. */
964
1.58k
    for (int i = 0; i < ret; i++) {
965
1.58k
        int len1 = len_list[i];
966
1.58k
        int len2 = len / len1;
967
968
        /* Our ptwo transforms don't support striding the output. */
969
1.58k
        if (len2 & (len2 - 1))
970
1.58k
            FFSWAP(int, len1, len2);
971
972
1.58k
        ff_tx_clear_ctx(s);
973
974
        /* First transform */
975
1.58k
        sub_opts.map_dir = FF_TX_MAP_GATHER;
976
1.58k
        flags &= ~AV_TX_INPLACE;
977
1.58k
        flags |=  FF_TX_OUT_OF_PLACE;
978
1.58k
        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
979
1.58k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
980
1.58k
                               len1, inv, scale);
981
982
1.58k
        if (ret == AVERROR(ENOMEM)) {
983
0
            return ret;
984
1.58k
        } else if (ret < 0) { /* Try again without a preshuffle flag */
985
792
            flags &= ~FF_TX_PRESHUFFLE;
986
792
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
987
792
                                   len1, inv, scale);
988
792
            if (ret == AVERROR(ENOMEM))
989
0
                return ret;
990
792
            else if (ret < 0)
991
0
                continue;
992
792
        }
993
994
        /* Second transform. */
995
1.58k
        sub_opts.map_dir = FF_TX_MAP_SCATTER;
996
1.58k
        flags |=  FF_TX_PRESHUFFLE;
997
1.58k
retry:
998
1.58k
        flags &= ~FF_TX_OUT_OF_PLACE;
999
1.58k
        flags |=  AV_TX_INPLACE;
1000
1.58k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1001
1.58k
                               len2, inv, scale);
1002
1003
1.58k
        if (ret == AVERROR(ENOMEM)) {
1004
0
            return ret;
1005
1.58k
        } else if (ret < 0) { /* Try again with an out-of-place transform */
1006
0
            flags |= FF_TX_OUT_OF_PLACE;
1007
0
            flags &= ~AV_TX_INPLACE;
1008
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009
0
                                   len2, inv, scale);
1010
0
            if (ret == AVERROR(ENOMEM)) {
1011
0
                return ret;
1012
0
            } else if (ret < 0) {
1013
0
                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1014
0
                    flags &= ~FF_TX_PRESHUFFLE;
1015
0
                    goto retry;
1016
0
                } else {
1017
0
                    continue;
1018
0
                }
1019
0
            }
1020
0
        }
1021
1022
        /* Success */
1023
1.58k
        break;
1024
1.58k
    }
1025
1026
    /* If nothing was successful, error out */
1027
1.58k
    if (ret < 0)
1028
0
        return ret;
1029
1030
    /* Generate PFA map */
1031
1.58k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1032
1.58k
                                          s->sub[0].len, s->sub[1].len)))
1033
0
        return ret;
1034
1035
1.58k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1036
0
        return AVERROR(ENOMEM);
1037
1038
    /* Flatten input map */
1039
1.58k
    tmp = (int *)s->tmp;
1040
12.6k
    for (int k = 0; k < len; k += s->sub[0].len) {
1041
11.0k
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1042
112k
        for (int i = 0; i < s->sub[0].len; i++)
1043
101k
            s->map[k + i] = tmp[s->sub[0].map[i]];
1044
11.0k
    }
1045
1046
    /* Only allocate extra temporary memory if we need it */
1047
1.58k
    if (!(s->sub[1].flags & AV_TX_INPLACE))
1048
0
        extra_tmp_len = len;
1049
1.58k
    else if (!ps)
1050
1.58k
        extra_tmp_len = s->sub[0].len;
1051
1052
1.58k
    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1053
0
        return AVERROR(ENOMEM);
1054
1055
1.58k
    return 0;
1056
1.58k
}
tx_int32.c:ff_tx_fft_pfa_init_int32_c
Line
Count
Source
954
140k
{
955
140k
    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
956
140k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
957
140k
    size_t extra_tmp_len = 0;
958
140k
    int len_list[TX_MAX_DECOMPOSITIONS];
959
960
140k
    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
961
140k
        return ret;
962
963
    /* Two iterations to test both orderings. */
964
0
    for (int i = 0; i < ret; i++) {
965
0
        int len1 = len_list[i];
966
0
        int len2 = len / len1;
967
968
        /* Our ptwo transforms don't support striding the output. */
969
0
        if (len2 & (len2 - 1))
970
0
            FFSWAP(int, len1, len2);
971
972
0
        ff_tx_clear_ctx(s);
973
974
        /* First transform */
975
0
        sub_opts.map_dir = FF_TX_MAP_GATHER;
976
0
        flags &= ~AV_TX_INPLACE;
977
0
        flags |=  FF_TX_OUT_OF_PLACE;
978
0
        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
979
0
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
980
0
                               len1, inv, scale);
981
982
0
        if (ret == AVERROR(ENOMEM)) {
983
0
            return ret;
984
0
        } else if (ret < 0) { /* Try again without a preshuffle flag */
985
0
            flags &= ~FF_TX_PRESHUFFLE;
986
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
987
0
                                   len1, inv, scale);
988
0
            if (ret == AVERROR(ENOMEM))
989
0
                return ret;
990
0
            else if (ret < 0)
991
0
                continue;
992
0
        }
993
994
        /* Second transform. */
995
0
        sub_opts.map_dir = FF_TX_MAP_SCATTER;
996
0
        flags |=  FF_TX_PRESHUFFLE;
997
0
retry:
998
0
        flags &= ~FF_TX_OUT_OF_PLACE;
999
0
        flags |=  AV_TX_INPLACE;
1000
0
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1001
0
                               len2, inv, scale);
1002
1003
0
        if (ret == AVERROR(ENOMEM)) {
1004
0
            return ret;
1005
0
        } else if (ret < 0) { /* Try again with an out-of-place transform */
1006
0
            flags |= FF_TX_OUT_OF_PLACE;
1007
0
            flags &= ~AV_TX_INPLACE;
1008
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009
0
                                   len2, inv, scale);
1010
0
            if (ret == AVERROR(ENOMEM)) {
1011
0
                return ret;
1012
0
            } else if (ret < 0) {
1013
0
                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1014
0
                    flags &= ~FF_TX_PRESHUFFLE;
1015
0
                    goto retry;
1016
0
                } else {
1017
0
                    continue;
1018
0
                }
1019
0
            }
1020
0
        }
1021
1022
        /* Success */
1023
0
        break;
1024
0
    }
1025
1026
    /* If nothing was successful, error out */
1027
0
    if (ret < 0)
1028
0
        return ret;
1029
1030
    /* Generate PFA map */
1031
0
    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1032
0
                                          s->sub[0].len, s->sub[1].len)))
1033
0
        return ret;
1034
1035
0
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1036
0
        return AVERROR(ENOMEM);
1037
1038
    /* Flatten input map */
1039
0
    tmp = (int *)s->tmp;
1040
0
    for (int k = 0; k < len; k += s->sub[0].len) {
1041
0
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1042
0
        for (int i = 0; i < s->sub[0].len; i++)
1043
0
            s->map[k + i] = tmp[s->sub[0].map[i]];
1044
0
    }
1045
1046
    /* Only allocate extra temporary memory if we need it */
1047
0
    if (!(s->sub[1].flags & AV_TX_INPLACE))
1048
0
        extra_tmp_len = len;
1049
0
    else if (!ps)
1050
0
        extra_tmp_len = s->sub[0].len;
1051
1052
0
    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1053
0
        return AVERROR(ENOMEM);
1054
1055
0
    return 0;
1056
0
}
1057
1058
/*
 * Two-stage compound FFT with an explicit input gather.
 *
 * Stage 1: for each of the m rows, n inputs are gathered through the first
 *          half of s->map into s->exp and transformed by sub-transform 0
 *          into s->tmp at the offset given by sub-transform 1's map, with an
 *          output stride of m elements.
 * Stage 2: sub-transform 1 is run n times over consecutive m-element chunks
 *          of s->tmp, writing to s->tmp itself when it is in-place and to
 *          s->exp otherwise.
 * Finally the result is scattered to the output through the second half of
 * s->map, honouring the caller's stride (given in bytes).
 */
static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
                                   void *_in, ptrdiff_t stride)
{
    TXComplex *in = _in, *out = _out;
    const int n = s->sub[0].len;
    const int m = s->sub[1].len;
    const int total = s->len;
    const int *gather_map = s->map;
    const int *scatter_map = s->map + total;
    const int *row_offset = s->sub[1].map;
    TXComplex *stage2_out = (s->sub[1].flags & AV_TX_INPLACE) ? s->tmp : s->exp;
    const ptrdiff_t out_step = stride / sizeof(*out);

    for (int i = 0; i < m; i++) {
        const int *row = gather_map + i*n;

        for (int j = 0; j < n; j++)
            s->exp[j] = in[row[j]];

        s->fn[0](&s->sub[0], &s->tmp[row_offset[i]], s->exp,
                 m*sizeof(TXComplex));
    }

    for (int i = 0; i < n; i++) {
        const int chunk = m*i;

        s->fn[1](&s->sub[1], &stage2_out[chunk], &s->tmp[chunk],
                 sizeof(TXComplex));
    }

    for (int i = 0; i < total; i++)
        out[i*out_step] = stage2_out[scatter_map[i]];
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_pfa_double_c
tx_float.c:ff_tx_fft_pfa_float_c
Line
Count
Source
1060
1.51M
{
1061
1.51M
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1062
1.51M
    const int *in_map = s->map, *out_map = in_map + l;
1063
1.51M
    const int *sub_map = s->sub[1].map;
1064
1.51M
    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1065
1.51M
    TXComplex *in = _in, *out = _out;
1066
1067
1.51M
    stride /= sizeof(*out);
1068
1069
12.1M
    for (int i = 0; i < m; i++) {
1070
107M
        for (int j = 0; j < n; j++)
1071
96.9M
            s->exp[j] = in[in_map[i*n + j]];
1072
10.6M
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1073
10.6M
    }
1074
1075
16.6M
    for (int i = 0; i < n; i++)
1076
15.1M
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1077
1078
98.4M
    for (int i = 0; i < l; i++)
1079
96.9M
        out[i*stride] = tmp1[out_map[i]];
1080
1.51M
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_pfa_int32_c
1081
1082
static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
1083
                                      void *_in, ptrdiff_t stride)
1084
0
{
1085
0
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1086
0
    const int *in_map = s->map, *out_map = in_map + l;
1087
0
    const int *sub_map = s->sub[1].map;
1088
0
    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1089
0
    TXComplex *in = _in, *out = _out;
1090
1091
0
    stride /= sizeof(*out);
1092
1093
0
    for (int i = 0; i < m; i++)
1094
0
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
1095
1096
0
    for (int i = 0; i < n; i++)
1097
0
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1098
1099
0
    for (int i = 0; i < l; i++)
1100
0
        out[i*stride] = tmp1[out_map[i]];
1101
0
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_pfa_ns_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft_pfa_ns_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft_pfa_ns_int32_c
1102
1103
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
1104
    .name       = TX_NAME_STR("fft_pfa"),
1105
    .function   = TX_NAME(ff_tx_fft_pfa),
1106
    .type       = TX_TYPE(FFT),
1107
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
1108
    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
1109
    .nb_factors = 2,
1110
    .min_len    = 2*3,
1111
    .max_len    = TX_LEN_UNLIMITED,
1112
    .init       = TX_NAME(ff_tx_fft_pfa_init),
1113
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1114
    .prio       = FF_TX_PRIO_BASE,
1115
};
1116
1117
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
1118
    .name       = TX_NAME_STR("fft_pfa_ns"),
1119
    .function   = TX_NAME(ff_tx_fft_pfa_ns),
1120
    .type       = TX_TYPE(FFT),
1121
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |
1122
                  FF_TX_PRESHUFFLE,
1123
    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
1124
    .nb_factors = 2,
1125
    .min_len    = 2*3,
1126
    .max_len    = TX_LEN_UNLIMITED,
1127
    .init       = TX_NAME(ff_tx_fft_pfa_init),
1128
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1129
    .prio       = FF_TX_PRIO_BASE,
1130
};
1131
1132
static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s,
1133
                                                  const FFTXCodelet *cd,
1134
                                                  uint64_t flags,
1135
                                                  FFTXCodeletOptions *opts,
1136
                                                  int len, int inv,
1137
                                                  const void *scale)
1138
0
{
1139
0
    s->scale_d = *((SCALE_TYPE *)scale);
1140
0
    s->scale_f = s->scale_d;
1141
0
    return 0;
1142
0
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_naive_init_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_naive_init_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_naive_init_int32_c
1143
1144
static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
1145
                                          void *_src, ptrdiff_t stride)
1146
0
{
1147
0
    TXSample *src = _src;
1148
0
    TXSample *dst = _dst;
1149
0
    double scale = s->scale_d;
1150
0
    int len = s->len;
1151
0
    const double phase = M_PI/(4.0*len);
1152
1153
0
    stride /= sizeof(*dst);
1154
1155
0
    for (int i = 0; i < len; i++) {
1156
0
        double sum = 0.0;
1157
0
        for (int j = 0; j < len*2; j++) {
1158
0
            int a = (2*j + 1 + len) * (2*i + 1);
1159
0
            sum += UNSCALE(src[j]) * cos(a * phase);
1160
0
        }
1161
0
        dst[i*stride] = RESCALE(sum*scale);
1162
0
    }
1163
0
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_naive_fwd_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_naive_fwd_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_naive_fwd_int32_c
1164
1165
static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
1166
                                          void *_src, ptrdiff_t stride)
1167
0
{
1168
0
    TXSample *src = _src;
1169
0
    TXSample *dst = _dst;
1170
0
    double scale = s->scale_d;
1171
0
    int len = s->len >> 1;
1172
0
    int len2 = len*2;
1173
0
    const double phase = M_PI/(4.0*len2);
1174
1175
0
    stride /= sizeof(*src);
1176
1177
0
    for (int i = 0; i < len; i++) {
1178
0
        double sum_d = 0.0;
1179
0
        double sum_u = 0.0;
1180
0
        double i_d = phase * (4*len  - 2*i - 1);
1181
0
        double i_u = phase * (3*len2 + 2*i + 1);
1182
0
        for (int j = 0; j < len2; j++) {
1183
0
            double a = (2 * j + 1);
1184
0
            double a_d = cos(a * i_d);
1185
0
            double a_u = cos(a * i_u);
1186
0
            double val = UNSCALE(src[j*stride]);
1187
0
            sum_d += a_d * val;
1188
0
            sum_u += a_u * val;
1189
0
        }
1190
0
        dst[i +   0] = RESCALE( sum_d*scale);
1191
0
        dst[i + len] = RESCALE(-sum_u*scale);
1192
0
    }
1193
0
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_naive_inv_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_naive_inv_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_naive_inv_int32_c
1194
1195
static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
1196
    .name       = TX_NAME_STR("mdct_naive_fwd"),
1197
    .function   = TX_NAME(ff_tx_mdct_naive_fwd),
1198
    .type       = TX_TYPE(MDCT),
1199
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1200
    .factors    = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
1201
    .nb_factors = 2,
1202
    .min_len    = 2,
1203
    .max_len    = TX_LEN_UNLIMITED,
1204
    .init       = TX_NAME(ff_tx_mdct_naive_init),
1205
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1206
    .prio       = FF_TX_PRIO_MIN,
1207
};
1208
1209
static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
1210
    .name       = TX_NAME_STR("mdct_naive_inv"),
1211
    .function   = TX_NAME(ff_tx_mdct_naive_inv),
1212
    .type       = TX_TYPE(MDCT),
1213
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1214
    .factors    = { 2, TX_FACTOR_ANY },
1215
    .nb_factors = 2,
1216
    .min_len    = 2,
1217
    .max_len    = TX_LEN_UNLIMITED,
1218
    .init       = TX_NAME(ff_tx_mdct_naive_init),
1219
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1220
    .prio       = FF_TX_PRIO_MIN,
1221
};
1222
1223
static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
1224
                                            const FFTXCodelet *cd,
1225
                                            uint64_t flags,
1226
                                            FFTXCodeletOptions *opts,
1227
                                            int len, int inv,
1228
                                            const void *scale)
1229
602k
{
1230
602k
    int ret;
1231
602k
    FFTXCodeletOptions sub_opts = {
1232
602k
        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
1233
602k
    };
1234
1235
602k
    s->scale_d = *((SCALE_TYPE *)scale);
1236
602k
    s->scale_f = s->scale_d;
1237
1238
602k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1239
602k
    flags |=  AV_TX_INPLACE;      /* in-place */
1240
602k
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
1241
1242
602k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243
602k
                                inv, scale))) {
1244
0
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1245
0
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1246
0
                                    inv, scale)))
1247
0
            return ret;
1248
0
    }
1249
1250
602k
    s->map = av_malloc((len >> 1)*sizeof(*s->map));
1251
602k
    if (!s->map)
1252
0
        return AVERROR(ENOMEM);
1253
1254
    /* If we need to preshuffle copy the map from the subcontext */
1255
602k
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1256
602k
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1257
602k
    } else {
1258
0
        for (int i = 0; i < len >> 1; i++)
1259
0
            s->map[i] = i;
1260
0
    }
1261
1262
602k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1263
0
        return ret;
1264
1265
    /* Saves a multiply in a hot path. */
1266
602k
    if (inv)
1267
114M
        for (int i = 0; i < (s->len >> 1); i++)
1268
114M
            s->map[i] <<= 1;
1269
1270
602k
    return 0;
1271
602k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_init_double_c
tx_float.c:ff_tx_mdct_init_float_c
Line
Count
Source
1229
485k
{
1230
485k
    int ret;
1231
485k
    FFTXCodeletOptions sub_opts = {
1232
485k
        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
1233
485k
    };
1234
1235
485k
    s->scale_d = *((SCALE_TYPE *)scale);
1236
485k
    s->scale_f = s->scale_d;
1237
1238
485k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1239
485k
    flags |=  AV_TX_INPLACE;      /* in-place */
1240
485k
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
1241
1242
485k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243
485k
                                inv, scale))) {
1244
0
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1245
0
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1246
0
                                    inv, scale)))
1247
0
            return ret;
1248
0
    }
1249
1250
485k
    s->map = av_malloc((len >> 1)*sizeof(*s->map));
1251
485k
    if (!s->map)
1252
0
        return AVERROR(ENOMEM);
1253
1254
    /* If we need to preshuffle copy the map from the subcontext */
1255
485k
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1256
485k
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1257
485k
    } else {
1258
0
        for (int i = 0; i < len >> 1; i++)
1259
0
            s->map[i] = i;
1260
0
    }
1261
1262
485k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1263
0
        return ret;
1264
1265
    /* Saves a multiply in a hot path. */
1266
485k
    if (inv)
1267
105M
        for (int i = 0; i < (s->len >> 1); i++)
1268
105M
            s->map[i] <<= 1;
1269
1270
485k
    return 0;
1271
485k
}
tx_int32.c:ff_tx_mdct_init_int32_c
Line
Count
Source
1229
116k
{
1230
116k
    int ret;
1231
116k
    FFTXCodeletOptions sub_opts = {
1232
116k
        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
1233
116k
    };
1234
1235
116k
    s->scale_d = *((SCALE_TYPE *)scale);
1236
116k
    s->scale_f = s->scale_d;
1237
1238
116k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1239
116k
    flags |=  AV_TX_INPLACE;      /* in-place */
1240
116k
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
1241
1242
116k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243
116k
                                inv, scale))) {
1244
0
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1245
0
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1246
0
                                    inv, scale)))
1247
0
            return ret;
1248
0
    }
1249
1250
116k
    s->map = av_malloc((len >> 1)*sizeof(*s->map));
1251
116k
    if (!s->map)
1252
0
        return AVERROR(ENOMEM);
1253
1254
    /* If we need to preshuffle copy the map from the subcontext */
1255
116k
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1256
116k
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1257
116k
    } else {
1258
0
        for (int i = 0; i < len >> 1; i++)
1259
0
            s->map[i] = i;
1260
0
    }
1261
1262
116k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1263
0
        return ret;
1264
1265
    /* Saves a multiply in a hot path. */
1266
116k
    if (inv)
1267
8.92M
        for (int i = 0; i < (s->len >> 1); i++)
1268
8.81M
            s->map[i] <<= 1;
1269
1270
116k
    return 0;
1271
116k
}
1272
1273
static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
1274
                                    ptrdiff_t stride)
1275
51.3k
{
1276
51.3k
    TXSample *src = _src, *dst = _dst;
1277
51.3k
    TXComplex *exp = s->exp, tmp, *z = _dst;
1278
51.3k
    const int len2 = s->len >> 1;
1279
51.3k
    const int len4 = s->len >> 2;
1280
51.3k
    const int len3 = len2 * 3;
1281
51.3k
    const int *sub_map = s->map;
1282
1283
51.3k
    stride /= sizeof(*dst);
1284
1285
26.3M
    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1286
26.2M
        const int k = 2*i;
1287
26.2M
        const int idx = sub_map[i];
1288
26.2M
        if (k < len2) {
1289
13.1M
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
1290
13.1M
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1291
13.1M
        } else {
1292
13.1M
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1293
13.1M
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1294
13.1M
        }
1295
26.2M
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1296
26.2M
    }
1297
1298
51.3k
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1299
1300
13.2M
    for (int i = 0; i < len4; i++) {
1301
13.1M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1302
13.1M
        TXComplex src1 = { z[i1].re, z[i1].im };
1303
13.1M
        TXComplex src0 = { z[i0].re, z[i0].im };
1304
1305
13.1M
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1306
13.1M
             exp[i0].im, exp[i0].re);
1307
13.1M
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1308
13.1M
             exp[i1].im, exp[i1].re);
1309
13.1M
    }
1310
51.3k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_fwd_double_c
tx_float.c:ff_tx_mdct_fwd_float_c
Line
Count
Source
1275
20.7k
{
1276
20.7k
    TXSample *src = _src, *dst = _dst;
1277
20.7k
    TXComplex *exp = s->exp, tmp, *z = _dst;
1278
20.7k
    const int len2 = s->len >> 1;
1279
20.7k
    const int len4 = s->len >> 2;
1280
20.7k
    const int len3 = len2 * 3;
1281
20.7k
    const int *sub_map = s->map;
1282
1283
20.7k
    stride /= sizeof(*dst);
1284
1285
10.6M
    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1286
10.6M
        const int k = 2*i;
1287
10.6M
        const int idx = sub_map[i];
1288
10.6M
        if (k < len2) {
1289
5.31M
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
1290
5.31M
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1291
5.31M
        } else {
1292
5.31M
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1293
5.31M
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1294
5.31M
        }
1295
10.6M
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1296
10.6M
    }
1297
1298
20.7k
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1299
1300
5.33M
    for (int i = 0; i < len4; i++) {
1301
5.31M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1302
5.31M
        TXComplex src1 = { z[i1].re, z[i1].im };
1303
5.31M
        TXComplex src0 = { z[i0].re, z[i0].im };
1304
1305
5.31M
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1306
5.31M
             exp[i0].im, exp[i0].re);
1307
5.31M
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1308
5.31M
             exp[i1].im, exp[i1].re);
1309
5.31M
    }
1310
20.7k
}
tx_int32.c:ff_tx_mdct_fwd_int32_c
Line
Count
Source
1275
30.6k
{
1276
30.6k
    TXSample *src = _src, *dst = _dst;
1277
30.6k
    TXComplex *exp = s->exp, tmp, *z = _dst;
1278
30.6k
    const int len2 = s->len >> 1;
1279
30.6k
    const int len4 = s->len >> 2;
1280
30.6k
    const int len3 = len2 * 3;
1281
30.6k
    const int *sub_map = s->map;
1282
1283
30.6k
    stride /= sizeof(*dst);
1284
1285
15.7M
    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1286
15.6M
        const int k = 2*i;
1287
15.6M
        const int idx = sub_map[i];
1288
15.6M
        if (k < len2) {
1289
7.83M
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
1290
7.83M
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1291
7.83M
        } else {
1292
7.83M
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1293
7.83M
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1294
7.83M
        }
1295
15.6M
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1296
15.6M
    }
1297
1298
30.6k
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1299
1300
7.86M
    for (int i = 0; i < len4; i++) {
1301
7.83M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1302
7.83M
        TXComplex src1 = { z[i1].re, z[i1].im };
1303
7.83M
        TXComplex src0 = { z[i0].re, z[i0].im };
1304
1305
7.83M
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1306
7.83M
             exp[i0].im, exp[i0].re);
1307
7.83M
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1308
7.83M
             exp[i1].im, exp[i1].re);
1309
7.83M
    }
1310
30.6k
}
1311
1312
static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
1313
                                    ptrdiff_t stride)
1314
189M
{
1315
189M
    TXComplex *z = _dst, *exp = s->exp;
1316
189M
    const TXSample *src = _src, *in1, *in2;
1317
189M
    const int len2 = s->len >> 1;
1318
189M
    const int len4 = s->len >> 2;
1319
189M
    const int *sub_map = s->map;
1320
1321
189M
    stride /= sizeof(*src);
1322
189M
    in1 = src;
1323
189M
    in2 = src + ((len2*2) - 1) * stride;
1324
1325
6.95G
    for (int i = 0; i < len2; i++) {
1326
6.76G
        int k = sub_map[i];
1327
6.76G
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1328
6.76G
        CMUL3(z[i], tmp, exp[i]);
1329
6.76G
    }
1330
1331
189M
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1332
1333
189M
    exp += len2;
1334
3.57G
    for (int i = 0; i < len4; i++) {
1335
3.38G
        const int i0 = len4 + i, i1 = len4 - i - 1;
1336
3.38G
        TXComplex src1 = { z[i1].im, z[i1].re };
1337
3.38G
        TXComplex src0 = { z[i0].im, z[i0].re };
1338
1339
3.38G
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1340
3.38G
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1341
3.38G
    }
1342
189M
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_inv_double_c
tx_float.c:ff_tx_mdct_inv_float_c
Line
Count
Source
1314
140M
{
1315
140M
    TXComplex *z = _dst, *exp = s->exp;
1316
140M
    const TXSample *src = _src, *in1, *in2;
1317
140M
    const int len2 = s->len >> 1;
1318
140M
    const int len4 = s->len >> 2;
1319
140M
    const int *sub_map = s->map;
1320
1321
140M
    stride /= sizeof(*src);
1322
140M
    in1 = src;
1323
140M
    in2 = src + ((len2*2) - 1) * stride;
1324
1325
4.86G
    for (int i = 0; i < len2; i++) {
1326
4.72G
        int k = sub_map[i];
1327
4.72G
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1328
4.72G
        CMUL3(z[i], tmp, exp[i]);
1329
4.72G
    }
1330
1331
140M
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1332
1333
140M
    exp += len2;
1334
2.50G
    for (int i = 0; i < len4; i++) {
1335
2.36G
        const int i0 = len4 + i, i1 = len4 - i - 1;
1336
2.36G
        TXComplex src1 = { z[i1].im, z[i1].re };
1337
2.36G
        TXComplex src0 = { z[i0].im, z[i0].re };
1338
1339
2.36G
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1340
2.36G
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1341
2.36G
    }
1342
140M
}
tx_int32.c:ff_tx_mdct_inv_int32_c
Line
Count
Source
1314
49.1M
{
1315
49.1M
    TXComplex *z = _dst, *exp = s->exp;
1316
49.1M
    const TXSample *src = _src, *in1, *in2;
1317
49.1M
    const int len2 = s->len >> 1;
1318
49.1M
    const int len4 = s->len >> 2;
1319
49.1M
    const int *sub_map = s->map;
1320
1321
49.1M
    stride /= sizeof(*src);
1322
49.1M
    in1 = src;
1323
49.1M
    in2 = src + ((len2*2) - 1) * stride;
1324
1325
2.09G
    for (int i = 0; i < len2; i++) {
1326
2.04G
        int k = sub_map[i];
1327
2.04G
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1328
2.04G
        CMUL3(z[i], tmp, exp[i]);
1329
2.04G
    }
1330
1331
49.1M
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1332
1333
49.1M
    exp += len2;
1334
1.07G
    for (int i = 0; i < len4; i++) {
1335
1.02G
        const int i0 = len4 + i, i1 = len4 - i - 1;
1336
1.02G
        TXComplex src1 = { z[i1].im, z[i1].re };
1337
1.02G
        TXComplex src0 = { z[i0].im, z[i0].re };
1338
1339
1.02G
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1340
1.02G
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1341
1.02G
    }
1342
49.1M
}
1343
1344
static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
1345
    .name       = TX_NAME_STR("mdct_fwd"),
1346
    .function   = TX_NAME(ff_tx_mdct_fwd),
1347
    .type       = TX_TYPE(MDCT),
1348
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1349
    .factors    = { 2, TX_FACTOR_ANY },
1350
    .nb_factors = 2,
1351
    .min_len    = 2,
1352
    .max_len    = TX_LEN_UNLIMITED,
1353
    .init       = TX_NAME(ff_tx_mdct_init),
1354
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1355
    .prio       = FF_TX_PRIO_BASE,
1356
};
1357
1358
static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
1359
    .name       = TX_NAME_STR("mdct_inv"),
1360
    .function   = TX_NAME(ff_tx_mdct_inv),
1361
    .type       = TX_TYPE(MDCT),
1362
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1363
    .factors    = { 2, TX_FACTOR_ANY },
1364
    .nb_factors = 2,
1365
    .min_len    = 2,
1366
    .max_len    = TX_LEN_UNLIMITED,
1367
    .init       = TX_NAME(ff_tx_mdct_init),
1368
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1369
    .prio       = FF_TX_PRIO_BASE,
1370
};
1371
1372
static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s,
1373
                                                     const FFTXCodelet *cd,
1374
                                                     uint64_t flags,
1375
                                                     FFTXCodeletOptions *opts,
1376
                                                     int len, int inv,
1377
                                                     const void *scale)
1378
38.9k
{
1379
38.9k
    int ret;
1380
1381
38.9k
    s->scale_d = *((SCALE_TYPE *)scale);
1382
38.9k
    s->scale_f = s->scale_d;
1383
1384
38.9k
    flags &= ~AV_TX_FULL_IMDCT;
1385
1386
38.9k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1387
0
        return ret;
1388
1389
38.9k
    return 0;
1390
38.9k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_inv_full_init_double_c
tx_float.c:ff_tx_mdct_inv_full_init_float_c
Line
Count
Source
1378
38.9k
{
1379
38.9k
    int ret;
1380
1381
38.9k
    s->scale_d = *((SCALE_TYPE *)scale);
1382
38.9k
    s->scale_f = s->scale_d;
1383
1384
38.9k
    flags &= ~AV_TX_FULL_IMDCT;
1385
1386
38.9k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1387
0
        return ret;
1388
1389
38.9k
    return 0;
1390
38.9k
}
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_inv_full_init_int32_c
1391
1392
static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1393
                                         void *_src, ptrdiff_t stride)
1394
4.60M
{
1395
4.60M
    int len  = s->len << 1;
1396
4.60M
    int len2 = len >> 1;
1397
4.60M
    int len4 = len >> 2;
1398
4.60M
    TXSample *dst = _dst;
1399
1400
4.60M
    s->fn[0](&s->sub[0], dst + len4, _src, stride);
1401
1402
4.60M
    stride /= sizeof(*dst);
1403
1404
471M
    for (int i = 0; i < len4; i++) {
1405
466M
        dst[            i*stride] = -dst[(len2 - i - 1)*stride];
1406
466M
        dst[(len - i - 1)*stride] =  dst[(len2 + i + 0)*stride];
1407
466M
    }
1408
4.60M
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_inv_full_double_c
tx_float.c:ff_tx_mdct_inv_full_float_c
Line
Count
Source
1394
4.60M
{
1395
4.60M
    int len  = s->len << 1;
1396
4.60M
    int len2 = len >> 1;
1397
4.60M
    int len4 = len >> 2;
1398
4.60M
    TXSample *dst = _dst;
1399
1400
4.60M
    s->fn[0](&s->sub[0], dst + len4, _src, stride);
1401
1402
4.60M
    stride /= sizeof(*dst);
1403
1404
471M
    for (int i = 0; i < len4; i++) {
1405
466M
        dst[            i*stride] = -dst[(len2 - i - 1)*stride];
1406
466M
        dst[(len - i - 1)*stride] =  dst[(len2 + i + 0)*stride];
1407
466M
    }
1408
4.60M
}
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_inv_full_int32_c
1409
1410
static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
1411
    .name       = TX_NAME_STR("mdct_inv_full"),
1412
    .function   = TX_NAME(ff_tx_mdct_inv_full),
1413
    .type       = TX_TYPE(MDCT),
1414
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
1415
                  FF_TX_OUT_OF_PLACE | AV_TX_FULL_IMDCT,
1416
    .factors    = { 2, TX_FACTOR_ANY },
1417
    .nb_factors = 2,
1418
    .min_len    = 2,
1419
    .max_len    = TX_LEN_UNLIMITED,
1420
    .init       = TX_NAME(ff_tx_mdct_inv_full_init),
1421
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1422
    .prio       = FF_TX_PRIO_BASE,
1423
};
1424
1425
static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
1426
                                                const FFTXCodelet *cd,
1427
                                                uint64_t flags,
1428
                                                FFTXCodeletOptions *opts,
1429
                                                int len, int inv,
1430
                                                const void *scale)
1431
379k
{
1432
379k
    int ret, sub_len;
1433
379k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1434
1435
379k
    len >>= 1;
1436
379k
    sub_len = len / cd->factors[0];
1437
1438
379k
    s->scale_d = *((SCALE_TYPE *)scale);
1439
379k
    s->scale_f = s->scale_d;
1440
1441
379k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1442
379k
    flags |=  AV_TX_INPLACE;      /* in-place */
1443
379k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
1444
1445
379k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1446
379k
                                sub_len, inv, scale)))
1447
0
        return ret;
1448
1449
379k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1450
0
        return ret;
1451
1452
    /* Our 15-point transform is also a compound one, so embed its input map */
1453
379k
    if (cd->factors[0] == 15)
1454
338k
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1455
1456
379k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1457
0
        return ret;
1458
1459
    /* Saves multiplies in loops. */
1460
87.4M
    for (int i = 0; i < len; i++)
1461
87.0M
        s->map[i] <<= 1;
1462
1463
379k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1464
0
        return AVERROR(ENOMEM);
1465
1466
379k
    TX_TAB(ff_tx_init_tabs)(len / sub_len);
1467
1468
379k
    return 0;
1469
379k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_init_double_c
tx_float.c:ff_tx_mdct_pfa_init_float_c
Line
Count
Source
1431
349k
{
1432
349k
    int ret, sub_len;
1433
349k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1434
1435
349k
    len >>= 1;
1436
349k
    sub_len = len / cd->factors[0];
1437
1438
349k
    s->scale_d = *((SCALE_TYPE *)scale);
1439
349k
    s->scale_f = s->scale_d;
1440
1441
349k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1442
349k
    flags |=  AV_TX_INPLACE;      /* in-place */
1443
349k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
1444
1445
349k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1446
349k
                                sub_len, inv, scale)))
1447
0
        return ret;
1448
1449
349k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1450
0
        return ret;
1451
1452
    /* Our 15-point transform is also a compound one, so embed its input map */
1453
349k
    if (cd->factors[0] == 15)
1454
320k
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1455
1456
349k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1457
0
        return ret;
1458
1459
    /* Saves multiplies in loops. */
1460
80.1M
    for (int i = 0; i < len; i++)
1461
79.8M
        s->map[i] <<= 1;
1462
1463
349k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1464
0
        return AVERROR(ENOMEM);
1465
1466
349k
    TX_TAB(ff_tx_init_tabs)(len / sub_len);
1467
1468
349k
    return 0;
1469
349k
}
tx_int32.c:ff_tx_mdct_pfa_init_int32_c
Line
Count
Source
1431
29.8k
{
1432
29.8k
    int ret, sub_len;
1433
29.8k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1434
1435
29.8k
    len >>= 1;
1436
29.8k
    sub_len = len / cd->factors[0];
1437
1438
29.8k
    s->scale_d = *((SCALE_TYPE *)scale);
1439
29.8k
    s->scale_f = s->scale_d;
1440
1441
29.8k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1442
29.8k
    flags |=  AV_TX_INPLACE;      /* in-place */
1443
29.8k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
1444
1445
29.8k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1446
29.8k
                                sub_len, inv, scale)))
1447
0
        return ret;
1448
1449
29.8k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1450
0
        return ret;
1451
1452
    /* Our 15-point transform is also a compound one, so embed its input map */
1453
29.8k
    if (cd->factors[0] == 15)
1454
17.9k
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1455
1456
29.8k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1457
0
        return ret;
1458
1459
    /* Saves multiplies in loops. */
1460
7.26M
    for (int i = 0; i < len; i++)
1461
7.23M
        s->map[i] <<= 1;
1462
1463
29.8k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1464
0
        return AVERROR(ENOMEM);
1465
1466
29.8k
    TX_TAB(ff_tx_init_tabs)(len / sub_len);
1467
1468
29.8k
    return 0;
1469
29.8k
}
1470
1471
#define DECL_COMP_IMDCT(N)                                                     \
1472
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,    \
1473
2.46M
                                                void *_src, ptrdiff_t stride)  \
1474
2.46M
{                                                                              \
1475
2.46M
    TXComplex fft##N##in[N];                                                   \
1476
2.46M
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
2.46M
    const TXSample *src = _src, *in1, *in2;                                    \
1478
2.46M
    const int len4 = s->len >> 2;                                              \
1479
2.46M
    const int len2 = s->len >> 1;                                              \
1480
2.46M
    const int m = s->sub->len;                                                 \
1481
2.46M
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
2.46M
    const int *sub_map = s->sub->map;                                          \
1483
2.46M
                                                                               \
1484
2.46M
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
2.46M
    in1 = src;                                                                 \
1486
2.46M
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
2.46M
                                                                               \
1488
110M
    for (int i = 0; i < len2; i += N) {                                        \
1489
544M
        for (int j = 0; j < N; j++) {                                          \
1490
436M
            const int k = in_map[j];                                           \
1491
436M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
436M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
436M
        }                                                                      \
1494
108M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
108M
        exp += N;                                                              \
1496
108M
        in_map += N;                                                           \
1497
108M
    }                                                                          \
1498
2.46M
                                                                               \
1499
16.0M
    for (int i = 0; i < N; i++)                                                \
1500
13.6M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
2.46M
                                                                               \
1502
220M
    for (int i = 0; i < len4; i++) {                                           \
1503
218M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
218M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
218M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
218M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
218M
                                                                               \
1508
218M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
218M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
218M
    }                                                                          \
1511
2.46M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_3xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_5xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_7xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_9xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_15xM_inv_double_c
tx_float.c:ff_tx_mdct_pfa_3xM_inv_float_c
Line
Count
Source
1473
1.17M
                                                void *_src, ptrdiff_t stride)  \
1474
1.17M
{                                                                              \
1475
1.17M
    TXComplex fft##N##in[N];                                                   \
1476
1.17M
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
1.17M
    const TXSample *src = _src, *in1, *in2;                                    \
1478
1.17M
    const int len4 = s->len >> 2;                                              \
1479
1.17M
    const int len2 = s->len >> 1;                                              \
1480
1.17M
    const int m = s->sub->len;                                                 \
1481
1.17M
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
1.17M
    const int *sub_map = s->sub->map;                                          \
1483
1.17M
                                                                               \
1484
1.17M
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
1.17M
    in1 = src;                                                                 \
1486
1.17M
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
1.17M
                                                                               \
1488
75.1M
    for (int i = 0; i < len2; i += N) {                                        \
1489
295M
        for (int j = 0; j < N; j++) {                                          \
1490
221M
            const int k = in_map[j];                                           \
1491
221M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
221M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
221M
        }                                                                      \
1494
73.9M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
73.9M
        exp += N;                                                              \
1496
73.9M
        in_map += N;                                                           \
1497
73.9M
    }                                                                          \
1498
1.17M
                                                                               \
1499
4.69M
    for (int i = 0; i < N; i++)                                                \
1500
3.52M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
1.17M
                                                                               \
1502
112M
    for (int i = 0; i < len4; i++) {                                           \
1503
110M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
110M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
110M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
110M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
110M
                                                                               \
1508
110M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
110M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
110M
    }                                                                          \
1511
1.17M
}                                                                              \
tx_float.c:ff_tx_mdct_pfa_5xM_inv_float_c
Line
Count
Source
1473
932k
                                                void *_src, ptrdiff_t stride)  \
1474
932k
{                                                                              \
1475
932k
    TXComplex fft##N##in[N];                                                   \
1476
932k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
932k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
932k
    const int len4 = s->len >> 2;                                              \
1479
932k
    const int len2 = s->len >> 1;                                              \
1480
932k
    const int m = s->sub->len;                                                 \
1481
932k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
932k
    const int *sub_map = s->sub->map;                                          \
1483
932k
                                                                               \
1484
932k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
932k
    in1 = src;                                                                 \
1486
932k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
932k
                                                                               \
1488
30.7M
    for (int i = 0; i < len2; i += N) {                                        \
1489
178M
        for (int j = 0; j < N; j++) {                                          \
1490
149M
            const int k = in_map[j];                                           \
1491
149M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
149M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
149M
        }                                                                      \
1494
29.8M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
29.8M
        exp += N;                                                              \
1496
29.8M
        in_map += N;                                                           \
1497
29.8M
    }                                                                          \
1498
932k
                                                                               \
1499
5.59M
    for (int i = 0; i < N; i++)                                                \
1500
4.66M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
932k
                                                                               \
1502
75.4M
    for (int i = 0; i < len4; i++) {                                           \
1503
74.5M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
74.5M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
74.5M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
74.5M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
74.5M
                                                                               \
1508
74.5M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
74.5M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
74.5M
    }                                                                          \
1511
932k
}                                                                              \
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_7xM_inv_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_9xM_inv_float_c
tx_float.c:ff_tx_mdct_pfa_15xM_inv_float_c
Line
Count
Source
1473
347k
                                                void *_src, ptrdiff_t stride)  \
1474
347k
{                                                                              \
1475
347k
    TXComplex fft##N##in[N];                                                   \
1476
347k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
347k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
347k
    const int len4 = s->len >> 2;                                              \
1479
347k
    const int len2 = s->len >> 1;                                              \
1480
347k
    const int m = s->sub->len;                                                 \
1481
347k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
347k
    const int *sub_map = s->sub->map;                                          \
1483
347k
                                                                               \
1484
347k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
347k
    in1 = src;                                                                 \
1486
347k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
347k
                                                                               \
1488
4.52M
    for (int i = 0; i < len2; i += N) {                                        \
1489
66.7M
        for (int j = 0; j < N; j++) {                                          \
1490
62.6M
            const int k = in_map[j];                                           \
1491
62.6M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
62.6M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
62.6M
        }                                                                      \
1494
4.17M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
4.17M
        exp += N;                                                              \
1496
4.17M
        in_map += N;                                                           \
1497
4.17M
    }                                                                          \
1498
347k
                                                                               \
1499
5.56M
    for (int i = 0; i < N; i++)                                                \
1500
5.21M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
347k
                                                                               \
1502
31.6M
    for (int i = 0; i < len4; i++) {                                           \
1503
31.3M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
31.3M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
31.3M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
31.3M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
31.3M
                                                                               \
1508
31.3M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
31.3M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
31.3M
    }                                                                          \
1511
347k
}                                                                              \
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_3xM_inv_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_5xM_inv_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_7xM_inv_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_9xM_inv_int32_c
tx_int32.c:ff_tx_mdct_pfa_15xM_inv_int32_c
Line
Count
Source
1473
14.5k
                                                void *_src, ptrdiff_t stride)  \
1474
14.5k
{                                                                              \
1475
14.5k
    TXComplex fft##N##in[N];                                                   \
1476
14.5k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
14.5k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
14.5k
    const int len4 = s->len >> 2;                                              \
1479
14.5k
    const int len2 = s->len >> 1;                                              \
1480
14.5k
    const int m = s->sub->len;                                                 \
1481
14.5k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
14.5k
    const int *sub_map = s->sub->map;                                          \
1483
14.5k
                                                                               \
1484
14.5k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
14.5k
    in1 = src;                                                                 \
1486
14.5k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
14.5k
                                                                               \
1488
228k
    for (int i = 0; i < len2; i += N) {                                        \
1489
3.41M
        for (int j = 0; j < N; j++) {                                          \
1490
3.20M
            const int k = in_map[j];                                           \
1491
3.20M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
3.20M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
3.20M
        }                                                                      \
1494
213k
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
213k
        exp += N;                                                              \
1496
213k
        in_map += N;                                                           \
1497
213k
    }                                                                          \
1498
14.5k
                                                                               \
1499
232k
    for (int i = 0; i < N; i++)                                                \
1500
218k
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
14.5k
                                                                               \
1502
1.61M
    for (int i = 0; i < len4; i++) {                                           \
1503
1.60M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
1.60M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
1.60M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
1.60M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
1.60M
                                                                               \
1508
1.60M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
1.60M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
1.60M
    }                                                                          \
1511
14.5k
}                                                                              \
1512
                                                                               \
1513
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = {           \
1514
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_inv"),                        \
1515
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv),                         \
1516
    .type       = TX_TYPE(MDCT),                                               \
1517
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,   \
1518
    .factors    = { N, TX_FACTOR_ANY },                                        \
1519
    .nb_factors = 2,                                                           \
1520
    .min_len    = N*2,                                                         \
1521
    .max_len    = TX_LEN_UNLIMITED,                                            \
1522
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
1523
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1524
    .prio       = FF_TX_PRIO_BASE,                                             \
1525
};
1526
1527
DECL_COMP_IMDCT(3)
1528
DECL_COMP_IMDCT(5)
1529
DECL_COMP_IMDCT(7)
1530
DECL_COMP_IMDCT(9)
1531
DECL_COMP_IMDCT(15)
1532
1533
#define DECL_COMP_MDCT(N)                                                      \
1534
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst,    \
1535
0
                                                void *_src, ptrdiff_t stride)  \
1536
0
{                                                                              \
1537
0
    TXComplex fft##N##in[N];                                                   \
1538
0
    TXSample *src = _src, *dst = _dst;                                         \
1539
0
    TXComplex *exp = s->exp, tmp;                                              \
1540
0
    const int m = s->sub->len;                                                 \
1541
0
    const int len4 = N*m;                                                      \
1542
0
    const int len3 = len4 * 3;                                                 \
1543
0
    const int len8 = s->len >> 2;                                              \
1544
0
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1545
0
    const int *sub_map = s->sub->map;                                          \
1546
0
                                                                               \
1547
0
    stride /= sizeof(*dst);                                                    \
1548
0
                                                                               \
1549
0
    for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */             \
1550
0
        for (int j = 0; j < N; j++) {                                          \
1551
0
            const int k = in_map[i*N + j];                                     \
1552
0
            if (k < len4) {                                                    \
1553
0
                tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);          \
1554
0
                tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);          \
1555
0
            } else {                                                           \
1556
0
                tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);          \
1557
0
                tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);          \
1558
0
            }                                                                  \
1559
0
            CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im,           \
1560
0
                 exp[k >> 1].re, exp[k >> 1].im);                              \
1561
0
        }                                                                      \
1562
0
        fft##N(s->tmp + sub_map[i], fft##N##in, m);                            \
1563
0
    }                                                                          \
1564
0
                                                                               \
1565
0
    for (int i = 0; i < N; i++)                                                \
1566
0
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1567
0
                                                                               \
1568
0
    for (int i = 0; i < len8; i++) {                                           \
1569
0
        const int i0 = len8 + i, i1 = len8 - i - 1;                            \
1570
0
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1571
0
        TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im };                     \
1572
0
        TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im };                     \
1573
0
                                                                               \
1574
0
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,    \
1575
0
             exp[i0].im, exp[i0].re);                                          \
1576
0
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,    \
1577
0
             exp[i1].im, exp[i1].re);                                          \
1578
0
    }                                                                          \
1579
0
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_3xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_5xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_7xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_9xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_15xM_fwd_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_3xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_5xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_7xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_9xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_15xM_fwd_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_3xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_5xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_7xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_9xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_15xM_fwd_int32_c
1580
                                                                               \
1581
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = {           \
1582
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"),                        \
1583
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd),                         \
1584
    .type       = TX_TYPE(MDCT),                                               \
1585
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,   \
1586
    .factors    = { N, TX_FACTOR_ANY },                                        \
1587
    .nb_factors = 2,                                                           \
1588
    .min_len    = N*2,                                                         \
1589
    .max_len    = TX_LEN_UNLIMITED,                                            \
1590
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
1591
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1592
    .prio       = FF_TX_PRIO_BASE,                                             \
1593
};
1594
1595
DECL_COMP_MDCT(3)
1596
DECL_COMP_MDCT(5)
1597
DECL_COMP_MDCT(7)
1598
DECL_COMP_MDCT(9)
1599
DECL_COMP_MDCT(15)
1600
1601
static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
1602
                                            const FFTXCodelet *cd,
1603
                                            uint64_t flags,
1604
                                            FFTXCodeletOptions *opts,
1605
                                            int len, int inv,
1606
                                            const void *scale)
1607
6.07k
{
1608
6.07k
    int ret;
1609
6.07k
    double f, m;
1610
6.07k
    TXSample *tab;
1611
6.07k
    uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
1612
6.07k
    int len4 = FFALIGN(len, 4) / 4;
1613
1614
6.07k
    s->scale_d = *((SCALE_TYPE *)scale);
1615
6.07k
    s->scale_f = s->scale_d;
1616
1617
6.07k
    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
1618
1619
6.07k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1620
0
        return ret;
1621
1622
6.07k
    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
1623
0
        return AVERROR(ENOMEM);
1624
1625
6.07k
    tab = (TXSample *)s->exp;
1626
1627
6.07k
    f = 2*M_PI/len;
1628
1629
6.07k
    m = (inv ? 2*s->scale_d : s->scale_d);
1630
1631
6.07k
    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1632
6.07k
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
1633
6.07k
    *tab++ = RESCALE( m);
1634
6.07k
    *tab++ = RESCALE(-m);
1635
1636
6.07k
    *tab++ = RESCALE( (0.5 - 0.0) * m);
1637
6.07k
    if (r2r)
1638
792
        *tab++ = 1 / s->scale_f;
1639
5.28k
    else
1640
5.28k
        *tab++ = RESCALE( (0.0 - 0.5) * m);
1641
6.07k
    *tab++ = RESCALE( (0.5 - inv) * m);
1642
6.07k
    *tab++ = RESCALE(-(0.5 - inv) * m);
1643
1644
796k
    for (int i = 0; i < len4; i++)
1645
790k
        *tab++ = RESCALE(cos(i*f));
1646
1647
6.07k
    tab = ((TXSample *)s->exp) + len4 + 8;
1648
1649
796k
    for (int i = 0; i < len4; i++)
1650
790k
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
1651
1652
6.07k
    return 0;
1653
6.07k
}
Unexecuted instantiation: tx_double.c:ff_tx_rdft_init_double_c
tx_float.c:ff_tx_rdft_init_float_c
Line
Count
Source
1607
6.07k
{
1608
6.07k
    int ret;
1609
6.07k
    double f, m;
1610
6.07k
    TXSample *tab;
1611
6.07k
    uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
1612
6.07k
    int len4 = FFALIGN(len, 4) / 4;
1613
1614
6.07k
    s->scale_d = *((SCALE_TYPE *)scale);
1615
6.07k
    s->scale_f = s->scale_d;
1616
1617
6.07k
    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
1618
1619
6.07k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1620
0
        return ret;
1621
1622
6.07k
    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
1623
0
        return AVERROR(ENOMEM);
1624
1625
6.07k
    tab = (TXSample *)s->exp;
1626
1627
6.07k
    f = 2*M_PI/len;
1628
1629
6.07k
    m = (inv ? 2*s->scale_d : s->scale_d);
1630
1631
6.07k
    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1632
6.07k
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
1633
6.07k
    *tab++ = RESCALE( m);
1634
6.07k
    *tab++ = RESCALE(-m);
1635
1636
6.07k
    *tab++ = RESCALE( (0.5 - 0.0) * m);
1637
6.07k
    if (r2r)
1638
792
        *tab++ = 1 / s->scale_f;
1639
5.28k
    else
1640
5.28k
        *tab++ = RESCALE( (0.0 - 0.5) * m);
1641
6.07k
    *tab++ = RESCALE( (0.5 - inv) * m);
1642
6.07k
    *tab++ = RESCALE(-(0.5 - inv) * m);
1643
1644
796k
    for (int i = 0; i < len4; i++)
1645
790k
        *tab++ = RESCALE(cos(i*f));
1646
1647
6.07k
    tab = ((TXSample *)s->exp) + len4 + 8;
1648
1649
796k
    for (int i = 0; i < len4; i++)
1650
790k
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
1651
1652
6.07k
    return 0;
1653
6.07k
}
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_init_int32_c
1654
1655
#define DECL_RDFT(n, inv)                                                      \
1656
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
1657
19.4M
                                     void *_src, ptrdiff_t stride)             \
1658
19.4M
{                                                                              \
1659
19.4M
    const int len2 = s->len >> 1;                                              \
1660
19.4M
    const int len4 = s->len >> 2;                                              \
1661
19.4M
    const TXSample *fact = (void *)s->exp;                                     \
1662
19.4M
    const TXSample *tcos = fact + 8;                                           \
1663
19.4M
    const TXSample *tsin = tcos + len4;                                        \
1664
19.4M
    TXComplex *data = inv ? _src : _dst;                                       \
1665
19.4M
    TXComplex t[3];                                                            \
1666
19.4M
                                                                               \
1667
19.4M
    if (!inv)                                                                  \
1668
19.4M
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
1669
19.4M
    else                                                                       \
1670
19.4M
        data[0].im = data[len2].re;                                            \
1671
19.4M
                                                                               \
1672
19.4M
    /* The DC value's both components are real, but we need to change them     \
1673
19.4M
     * into complex values. Also, the middle of the array is special-cased.    \
1674
19.4M
     * These operations can be done before or after the loop. */               \
1675
19.4M
    t[0].re = data[0].re;                                                      \
1676
19.4M
    data[0].re = t[0].re + data[0].im;                                         \
1677
19.4M
    data[0].im = t[0].re - data[0].im;                                         \
1678
19.4M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1679
19.4M
    data[   0].im = MULT(fact[1], data[   0].im);                              \
1680
19.4M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1681
19.4M
    data[len4].im = MULT(fact[3], data[len4].im);                              \
1682
19.4M
                                                                               \
1683
1.07G
    for (int i = 1; i < len4; i++) {                                           \
1684
1.05G
        /* Separate even and odd FFTs */                                       \
1685
1.05G
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
1686
1.05G
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
1687
1.05G
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
1688
1.05G
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
1689
1.05G
                                                                               \
1690
1.05G
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
1691
1.05G
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
1692
1.05G
                                                                               \
1693
1.05G
        data[       i].re = t[0].re + t[2].re;                                 \
1694
1.05G
        data[       i].im = t[2].im - t[0].im;                                 \
1695
1.05G
        data[len2 - i].re = t[0].re - t[2].re;                                 \
1696
1.05G
        data[len2 - i].im = t[2].im + t[0].im;                                 \
1697
1.05G
    }                                                                          \
1698
19.4M
                                                                               \
1699
19.4M
    if (inv) {                                                                 \
1700
17.1M
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
1701
17.1M
    } else {                                                                   \
1702
2.27M
        /* Move [0].im to the last position, as convention requires */         \
1703
2.27M
        data[len2].re = data[0].im;                                            \
1704
2.27M
        data[   0].im = data[len2].im = 0;                                     \
1705
2.27M
    }                                                                          \
1706
19.4M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2c_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_c2r_double_c
tx_float.c:ff_tx_rdft_r2c_float_c
Line
Count
Source
1657
2.27M
                                     void *_src, ptrdiff_t stride)             \
1658
2.27M
{                                                                              \
1659
2.27M
    const int len2 = s->len >> 1;                                              \
1660
2.27M
    const int len4 = s->len >> 2;                                              \
1661
2.27M
    const TXSample *fact = (void *)s->exp;                                     \
1662
2.27M
    const TXSample *tcos = fact + 8;                                           \
1663
2.27M
    const TXSample *tsin = tcos + len4;                                        \
1664
2.27M
    TXComplex *data = inv ? _src : _dst;                                       \
1665
2.27M
    TXComplex t[3];                                                            \
1666
2.27M
                                                                               \
1667
2.27M
    if (!inv)                                                                  \
1668
2.27M
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
1669
2.27M
    else                                                                       \
1670
2.27M
        data[0].im = data[len2].re;                                            \
1671
2.27M
                                                                               \
1672
2.27M
    /* The DC value's both components are real, but we need to change them     \
1673
2.27M
     * into complex values. Also, the middle of the array is special-cased.    \
1674
2.27M
     * These operations can be done before or after the loop. */               \
1675
2.27M
    t[0].re = data[0].re;                                                      \
1676
2.27M
    data[0].re = t[0].re + data[0].im;                                         \
1677
2.27M
    data[0].im = t[0].re - data[0].im;                                         \
1678
2.27M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1679
2.27M
    data[   0].im = MULT(fact[1], data[   0].im);                              \
1680
2.27M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1681
2.27M
    data[len4].im = MULT(fact[3], data[len4].im);                              \
1682
2.27M
                                                                               \
1683
72.7M
    for (int i = 1; i < len4; i++) {                                           \
1684
70.4M
        /* Separate even and odd FFTs */                                       \
1685
70.4M
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
1686
70.4M
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
1687
70.4M
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
1688
70.4M
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
1689
70.4M
                                                                               \
1690
70.4M
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
1691
70.4M
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
1692
70.4M
                                                                               \
1693
70.4M
        data[       i].re = t[0].re + t[2].re;                                 \
1694
70.4M
        data[       i].im = t[2].im - t[0].im;                                 \
1695
70.4M
        data[len2 - i].re = t[0].re - t[2].re;                                 \
1696
70.4M
        data[len2 - i].im = t[2].im + t[0].im;                                 \
1697
70.4M
    }                                                                          \
1698
2.27M
                                                                               \
1699
2.27M
    if (inv) {                                                                 \
1700
0
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
1701
2.27M
    } else {                                                                   \
1702
2.27M
        /* Move [0].im to the last position, as convention requires */         \
1703
2.27M
        data[len2].re = data[0].im;                                            \
1704
2.27M
        data[   0].im = data[len2].im = 0;                                     \
1705
2.27M
    }                                                                          \
1706
2.27M
}                                                                              \
tx_float.c:ff_tx_rdft_c2r_float_c
Line
Count
Source
1657
17.1M
                                     void *_src, ptrdiff_t stride)             \
1658
17.1M
{                                                                              \
1659
17.1M
    const int len2 = s->len >> 1;                                              \
1660
17.1M
    const int len4 = s->len >> 2;                                              \
1661
17.1M
    const TXSample *fact = (void *)s->exp;                                     \
1662
17.1M
    const TXSample *tcos = fact + 8;                                           \
1663
17.1M
    const TXSample *tsin = tcos + len4;                                        \
1664
17.1M
    TXComplex *data = inv ? _src : _dst;                                       \
1665
17.1M
    TXComplex t[3];                                                            \
1666
17.1M
                                                                               \
1667
17.1M
    if (!inv)                                                                  \
1668
17.1M
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
1669
17.1M
    else                                                                       \
1670
17.1M
        data[0].im = data[len2].re;                                            \
1671
17.1M
                                                                               \
1672
17.1M
    /* The DC value's both components are real, but we need to change them     \
1673
17.1M
     * into complex values. Also, the middle of the array is special-cased.    \
1674
17.1M
     * These operations can be done before or after the loop. */               \
1675
17.1M
    t[0].re = data[0].re;                                                      \
1676
17.1M
    data[0].re = t[0].re + data[0].im;                                         \
1677
17.1M
    data[0].im = t[0].re - data[0].im;                                         \
1678
17.1M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1679
17.1M
    data[   0].im = MULT(fact[1], data[   0].im);                              \
1680
17.1M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1681
17.1M
    data[len4].im = MULT(fact[3], data[len4].im);                              \
1682
17.1M
                                                                               \
1683
1.00G
    for (int i = 1; i < len4; i++) {                                           \
1684
984M
        /* Separate even and odd FFTs */                                       \
1685
984M
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
1686
984M
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
1687
984M
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
1688
984M
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
1689
984M
                                                                               \
1690
984M
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
1691
984M
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
1692
984M
                                                                               \
1693
984M
        data[       i].re = t[0].re + t[2].re;                                 \
1694
984M
        data[       i].im = t[2].im - t[0].im;                                 \
1695
984M
        data[len2 - i].re = t[0].re - t[2].re;                                 \
1696
984M
        data[len2 - i].im = t[2].im + t[0].im;                                 \
1697
984M
    }                                                                          \
1698
17.1M
                                                                               \
1699
17.1M
    if (inv) {                                                                 \
1700
17.1M
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
1701
17.1M
    } else {                                                                   \
1702
0
        /* Move [0].im to the last position, as convention requires */         \
1703
0
        data[len2].re = data[0].im;                                            \
1704
0
        data[   0].im = data[len2].im = 0;                                     \
1705
0
    }                                                                          \
1706
17.1M
}                                                                              \
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2c_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_c2r_int32_c
1707
                                                                               \
1708
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
1709
    .name       = TX_NAME_STR("rdft_" #n),                                     \
1710
    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
1711
    .type       = TX_TYPE(RDFT),                                               \
1712
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
1713
                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),             \
1714
    .factors    = { 4, TX_FACTOR_ANY },                                        \
1715
    .nb_factors = 2,                                                           \
1716
    .min_len    = 4,                                                           \
1717
    .max_len    = TX_LEN_UNLIMITED,                                            \
1718
    .init       = TX_NAME(ff_tx_rdft_init),                                    \
1719
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1720
    .prio       = FF_TX_PRIO_BASE,                                             \
1721
};
1722
1723
DECL_RDFT(r2c,  0)
1724
DECL_RDFT(c2r,  1)
1725
1726
#define DECL_RDFT_HALF(n, mode, mod2)                                          \
1727
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
1728
1.51M
                                        void *_src, ptrdiff_t stride)          \
1729
1.51M
{                                                                              \
1730
1.51M
    const int len = s->len;                                                    \
1731
1.51M
    const int len2 = len >> 1;                                                 \
1732
1.51M
    const int len4 = len >> 2;                                                 \
1733
1.51M
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
1734
1.51M
    const TXSample *fact = (void *)s->exp;                                     \
1735
1.51M
    const TXSample *tcos = fact + 8;                                           \
1736
1.51M
    const TXSample *tsin = tcos + aligned_len4;                                \
1737
1.51M
    TXComplex *data = _dst;                                                    \
1738
1.51M
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
1739
1.51M
    TXSample tmp_dc;                                                           \
1740
1.51M
    av_unused TXSample tmp_mid;                                                \
1741
1.51M
    TXSample tmp[4];                                                           \
1742
1.51M
    TXComplex sf, sl;                                                          \
1743
1.51M
                                                                               \
1744
1.51M
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
1745
1.51M
                                                                               \
1746
1.51M
    tmp_dc = data[0].re;                                                       \
1747
1.51M
    data[   0].re = tmp_dc + data[0].im;                                       \
1748
1.51M
    tmp_dc        = tmp_dc - data[0].im;                                       \
1749
1.51M
                                                                               \
1750
1.51M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1751
1.51M
    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
1752
1.51M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1753
1.51M
                                                                               \
1754
1.51M
    if (!mod2) {                                                               \
1755
0
        data[len4].im = MULT(fact[3], data[len4].im);                          \
1756
1.51M
    } else {                                                                   \
1757
1.51M
        sf = data[len4];                                                       \
1758
1.51M
        sl = data[len4 + 1];                                                   \
1759
1.51M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1760
1.51M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1761
1.51M
        else                                                                   \
1762
1.51M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1763
1.51M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1764
1.51M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1765
1.51M
                                                                               \
1766
1.51M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1767
757k
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
1768
757k
            tmp_mid = (tmp[0] - tmp[3]);                                       \
1769
757k
        } else {                                                               \
1770
757k
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
1771
757k
            tmp_mid = (tmp[0] + tmp[3]);                                       \
1772
757k
        }                                                                      \
1773
1.51M
    }                                                                          \
1774
1.51M
                                                                               \
1775
1.51M
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
1776
49.2M
    for (int i = 1; i <= len4; i++) {                                          \
1777
47.7M
        TXSample tmp[4];                                                       \
1778
47.7M
        TXComplex sf = data[i];                                                \
1779
47.7M
        TXComplex sl = data[len2 - i];                                         \
1780
47.7M
                                                                               \
1781
47.7M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1782
47.7M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1783
47.7M
        else                                                                   \
1784
47.7M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1785
47.7M
                                                                               \
1786
47.7M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1787
47.7M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1788
47.7M
                                                                               \
1789
47.7M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1790
23.4M
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
1791
23.4M
            out[i]           = (tmp[0] + tmp[3]);                              \
1792
23.4M
            out[len - i]     = (tmp[0] - tmp[3]);                              \
1793
24.2M
        } else {                                                               \
1794
24.2M
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
1795
24.2M
            out[i - 1]       = (tmp[3] - tmp[0]);                              \
1796
24.2M
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
1797
24.2M
        }                                                                      \
1798
47.7M
    }                                                                          \
1799
1.51M
                                                                               \
1800
48.4M
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
1801
46.9M
        out[len2 - i] = out[len - i];                                          \
1802
1.51M
                                                                               \
1803
1.51M
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
1804
757k
        out[len2] = tmp_dc;                                                    \
1805
757k
        if (mod2)                                                              \
1806
757k
            out[len4 + 1] = tmp_mid * fact[5];                                 \
1807
757k
    } else if (mod2) {                                                         \
1808
757k
        out[len4] = tmp_mid;                                                   \
1809
757k
    }                                                                          \
1810
1.51M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2r_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2r_mod2_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2i_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2i_mod2_double_c
Unexecuted instantiation: tx_float.c:ff_tx_rdft_r2r_float_c
tx_float.c:ff_tx_rdft_r2r_mod2_float_c
Line
Count
Source
1728
757k
                                        void *_src, ptrdiff_t stride)          \
1729
757k
{                                                                              \
1730
757k
    const int len = s->len;                                                    \
1731
757k
    const int len2 = len >> 1;                                                 \
1732
757k
    const int len4 = len >> 2;                                                 \
1733
757k
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
1734
757k
    const TXSample *fact = (void *)s->exp;                                     \
1735
757k
    const TXSample *tcos = fact + 8;                                           \
1736
757k
    const TXSample *tsin = tcos + aligned_len4;                                \
1737
757k
    TXComplex *data = _dst;                                                    \
1738
757k
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
1739
757k
    TXSample tmp_dc;                                                           \
1740
757k
    av_unused TXSample tmp_mid;                                                \
1741
757k
    TXSample tmp[4];                                                           \
1742
757k
    TXComplex sf, sl;                                                          \
1743
757k
                                                                               \
1744
757k
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
1745
757k
                                                                               \
1746
757k
    tmp_dc = data[0].re;                                                       \
1747
757k
    data[   0].re = tmp_dc + data[0].im;                                       \
1748
757k
    tmp_dc        = tmp_dc - data[0].im;                                       \
1749
757k
                                                                               \
1750
757k
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1751
757k
    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
1752
757k
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1753
757k
                                                                               \
1754
757k
    if (!mod2) {                                                               \
1755
0
        data[len4].im = MULT(fact[3], data[len4].im);                          \
1756
757k
    } else {                                                                   \
1757
757k
        sf = data[len4];                                                       \
1758
757k
        sl = data[len4 + 1];                                                   \
1759
757k
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1760
757k
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1761
757k
        else                                                                   \
1762
757k
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1763
757k
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1764
757k
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1765
757k
                                                                               \
1766
757k
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1767
757k
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
1768
757k
            tmp_mid = (tmp[0] - tmp[3]);                                       \
1769
757k
        } else {                                                               \
1770
0
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
1771
0
            tmp_mid = (tmp[0] + tmp[3]);                                       \
1772
0
        }                                                                      \
1773
757k
    }                                                                          \
1774
757k
                                                                               \
1775
757k
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
1776
24.2M
    for (int i = 1; i <= len4; i++) {                                          \
1777
23.4M
        TXSample tmp[4];                                                       \
1778
23.4M
        TXComplex sf = data[i];                                                \
1779
23.4M
        TXComplex sl = data[len2 - i];                                         \
1780
23.4M
                                                                               \
1781
23.4M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1782
23.4M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1783
23.4M
        else                                                                   \
1784
23.4M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1785
23.4M
                                                                               \
1786
23.4M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1787
23.4M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1788
23.4M
                                                                               \
1789
23.4M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1790
23.4M
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
1791
23.4M
            out[i]           = (tmp[0] + tmp[3]);                              \
1792
23.4M
            out[len - i]     = (tmp[0] - tmp[3]);                              \
1793
23.4M
        } else {                                                               \
1794
0
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
1795
0
            out[i - 1]       = (tmp[3] - tmp[0]);                              \
1796
0
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
1797
0
        }                                                                      \
1798
23.4M
    }                                                                          \
1799
757k
                                                                               \
1800
23.4M
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
1801
22.7M
        out[len2 - i] = out[len - i];                                          \
1802
757k
                                                                               \
1803
757k
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
1804
757k
        out[len2] = tmp_dc;                                                    \
1805
757k
        if (mod2)                                                              \
1806
757k
            out[len4 + 1] = tmp_mid * fact[5];                                 \
1807
757k
    } else if (mod2) {                                                         \
1808
0
        out[len4] = tmp_mid;                                                   \
1809
0
    }                                                                          \
1810
757k
}                                                                              \
Unexecuted instantiation: tx_float.c:ff_tx_rdft_r2i_float_c
tx_float.c:ff_tx_rdft_r2i_mod2_float_c
Line
Count
Source
1728
757k
                                        void *_src, ptrdiff_t stride)          \
1729
757k
{                                                                              \
1730
757k
    const int len = s->len;                                                    \
1731
757k
    const int len2 = len >> 1;                                                 \
1732
757k
    const int len4 = len >> 2;                                                 \
1733
757k
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
1734
757k
    const TXSample *fact = (void *)s->exp;                                     \
1735
757k
    const TXSample *tcos = fact + 8;                                           \
1736
757k
    const TXSample *tsin = tcos + aligned_len4;                                \
1737
757k
    TXComplex *data = _dst;                                                    \
1738
757k
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
1739
757k
    TXSample tmp_dc;                                                           \
1740
757k
    av_unused TXSample tmp_mid;                                                \
1741
757k
    TXSample tmp[4];                                                           \
1742
757k
    TXComplex sf, sl;                                                          \
1743
757k
                                                                               \
1744
757k
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
1745
757k
                                                                               \
1746
757k
    tmp_dc = data[0].re;                                                       \
1747
757k
    data[   0].re = tmp_dc + data[0].im;                                       \
1748
757k
    tmp_dc        = tmp_dc - data[0].im;                                       \
1749
757k
                                                                               \
1750
757k
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1751
757k
    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
1752
757k
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1753
757k
                                                                               \
1754
757k
    if (!mod2) {                                                               \
1755
0
        data[len4].im = MULT(fact[3], data[len4].im);                          \
1756
757k
    } else {                                                                   \
1757
757k
        sf = data[len4];                                                       \
1758
757k
        sl = data[len4 + 1];                                                   \
1759
757k
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1760
757k
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1761
757k
        else                                                                   \
1762
757k
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1763
757k
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1764
757k
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1765
757k
                                                                               \
1766
757k
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1767
0
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
1768
0
            tmp_mid = (tmp[0] - tmp[3]);                                       \
1769
757k
        } else {                                                               \
1770
757k
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
1771
757k
            tmp_mid = (tmp[0] + tmp[3]);                                       \
1772
757k
        }                                                                      \
1773
757k
    }                                                                          \
1774
757k
                                                                               \
1775
757k
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
1776
24.9M
    for (int i = 1; i <= len4; i++) {                                          \
1777
24.2M
        TXSample tmp[4];                                                       \
1778
24.2M
        TXComplex sf = data[i];                                                \
1779
24.2M
        TXComplex sl = data[len2 - i];                                         \
1780
24.2M
                                                                               \
1781
24.2M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1782
24.2M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1783
24.2M
        else                                                                   \
1784
24.2M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1785
24.2M
                                                                               \
1786
24.2M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1787
24.2M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1788
24.2M
                                                                               \
1789
24.2M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1790
0
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
1791
0
            out[i]           = (tmp[0] + tmp[3]);                              \
1792
0
            out[len - i]     = (tmp[0] - tmp[3]);                              \
1793
24.2M
        } else {                                                               \
1794
24.2M
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
1795
24.2M
            out[i - 1]       = (tmp[3] - tmp[0]);                              \
1796
24.2M
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
1797
24.2M
        }                                                                      \
1798
24.2M
    }                                                                          \
1799
757k
                                                                               \
1800
24.9M
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
1801
24.2M
        out[len2 - i] = out[len - i];                                          \
1802
757k
                                                                               \
1803
757k
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
1804
0
        out[len2] = tmp_dc;                                                    \
1805
0
        if (mod2)                                                              \
1806
0
            out[len4 + 1] = tmp_mid * fact[5];                                 \
1807
757k
    } else if (mod2) {                                                         \
1808
757k
        out[len4] = tmp_mid;                                                   \
1809
757k
    }                                                                          \
1810
757k
}                                                                              \
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2r_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2r_mod2_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2i_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2i_mod2_int32_c
1811
                                                                               \
1812
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
1813
    .name       = TX_NAME_STR("rdft_" #n),                                     \
1814
    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
1815
    .type       = TX_TYPE(RDFT),                                               \
1816
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |                     \
1817
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,                     \
1818
    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY },                            \
1819
    .nb_factors = 2,                                                           \
1820
    .min_len    = 2 + 2*(!mod2),                                               \
1821
    .max_len    = TX_LEN_UNLIMITED,                                            \
1822
    .init       = TX_NAME(ff_tx_rdft_init),                                    \
1823
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1824
    .prio       = FF_TX_PRIO_BASE,                                             \
1825
};
1826
1827
DECL_RDFT_HALF(r2r,      AV_TX_REAL_TO_REAL,      0)
1828
DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL,      1)
1829
DECL_RDFT_HALF(r2i,      AV_TX_REAL_TO_IMAGINARY, 0)
1830
DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1)
1831
1832
static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s,
1833
                                           const FFTXCodelet *cd,
1834
                                           uint64_t flags,
1835
                                           FFTXCodeletOptions *opts,
1836
                                           int len, int inv,
1837
                                           const void *scale)
1838
771
{
1839
771
    int ret;
1840
771
    double freq;
1841
771
    TXSample *tab;
1842
771
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1843
1844
771
    if (inv) {
1845
771
        len *= 2;
1846
771
        s->len *= 2;
1847
771
        rsc *= 0.5;
1848
771
    }
1849
1850
771
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1851
0
        return ret;
1852
1853
771
    s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1854
771
    if (!s->exp)
1855
0
        return AVERROR(ENOMEM);
1856
1857
771
    tab = (TXSample *)s->exp;
1858
1859
771
    freq = M_PI/(len*2);
1860
1861
1.02M
    for (int i = 0; i < len; i++)
1862
1.02M
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
1863
1864
771
    if (inv) {
1865
514k
        for (int i = 0; i < len/2; i++)
1866
513k
            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1867
771
    } else {
1868
0
        for (int i = 0; i < len/2; i++)
1869
0
            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1870
0
    }
1871
1872
771
    return 0;
1873
771
}
Unexecuted instantiation: tx_double.c:ff_tx_dct_init_double_c
tx_float.c:ff_tx_dct_init_float_c
Line
Count
Source
1838
771
{
1839
771
    int ret;
1840
771
    double freq;
1841
771
    TXSample *tab;
1842
771
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1843
1844
771
    if (inv) {
1845
771
        len *= 2;
1846
771
        s->len *= 2;
1847
771
        rsc *= 0.5;
1848
771
    }
1849
1850
771
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1851
0
        return ret;
1852
1853
771
    s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1854
771
    if (!s->exp)
1855
0
        return AVERROR(ENOMEM);
1856
1857
771
    tab = (TXSample *)s->exp;
1858
1859
771
    freq = M_PI/(len*2);
1860
1861
1.02M
    for (int i = 0; i < len; i++)
1862
1.02M
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
1863
1864
771
    if (inv) {
1865
514k
        for (int i = 0; i < len/2; i++)
1866
513k
            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1867
771
    } else {
1868
0
        for (int i = 0; i < len/2; i++)
1869
0
            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1870
0
    }
1871
1872
771
    return 0;
1873
771
}
Unexecuted instantiation: tx_int32.c:ff_tx_dct_init_int32_c
1874
1875
static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
1876
                                 void *_src, ptrdiff_t stride)
1877
0
{
1878
0
    TXSample *dst = _dst;
1879
0
    TXSample *src = _src;
1880
0
    const int len = s->len;
1881
0
    const int len2 = len >> 1;
1882
0
    const TXSample *exp = (void *)s->exp;
1883
0
    TXSample next;
1884
#ifdef TX_INT32
1885
    int64_t tmp1, tmp2;
1886
#else
1887
    TXSample tmp1, tmp2;
1888
#endif
1889
1890
0
    for (int i = 0; i < len2; i++) {
1891
0
        TXSample in1 = src[i];
1892
0
        TXSample in2 = src[len - i - 1];
1893
0
        TXSample s    = exp[len + i];
1894
1895
#ifdef TX_INT32
1896
        tmp1 = in1 + in2;
1897
        tmp2 = in1 - in2;
1898
1899
        tmp1 >>= 1;
1900
        tmp2 *= s;
1901
1902
        tmp2 = (tmp2 + 0x40000000) >> 31;
1903
#else
1904
        tmp1 = (in1 + in2)*0.5;
1905
        tmp2 = (in1 - in2)*s;
1906
#endif
1907
1908
0
        src[i]           = tmp1 + tmp2;
1909
0
        src[len - i - 1] = tmp1 - tmp2;
1910
0
    }
1911
1912
0
    s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));
1913
1914
0
    next = dst[len];
1915
1916
0
    for (int i = len - 2; i > 0; i -= 2) {
1917
0
        TXSample tmp;
1918
1919
0
        CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);
1920
1921
0
        dst[i + 1] = next;
1922
1923
0
        next += tmp;
1924
0
    }
1925
1926
#ifdef TX_INT32
1927
    tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
1928
    dst[0] = (tmp1 + 0x40000000) >> 31;
1929
#else
1930
    dst[0] = exp[0] * dst[0];
1931
#endif
1932
0
    dst[1] = next;
1933
0
}
Unexecuted instantiation: tx_double.c:ff_tx_dctII_double_c
Unexecuted instantiation: tx_float.c:ff_tx_dctII_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_dctII_int32_c
1934
1935
static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
1936
                                  void *_src, ptrdiff_t stride)
1937
318k
{
1938
318k
    TXSample *dst = _dst;
1939
318k
    TXSample *src = _src;
1940
318k
    const int len = s->len;
1941
318k
    const int len2 = len >> 1;
1942
318k
    const TXSample *exp = (void *)s->exp;
1943
#ifdef TX_INT32
1944
    int64_t  tmp1, tmp2 = src[len - 1];
1945
    tmp2 = (2*tmp2 + 0x40000000) >> 31;
1946
#else
1947
    TXSample tmp1, tmp2 = 2*src[len - 1];
1948
#endif
1949
1950
318k
    src[len] = tmp2;
1951
1952
122M
    for (int i = len - 2; i >= 2; i -= 2) {
1953
121M
        TXSample val1 = src[i - 0];
1954
121M
        TXSample val2 = src[i - 1] - src[i + 1];
1955
1956
121M
        CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1957
121M
    }
1958
1959
318k
    s->fn[0](&s->sub[0], dst, src, sizeof(float));
1960
1961
122M
    for (int i = 0; i < len2; i++) {
1962
122M
        TXSample in1 = dst[i];
1963
122M
        TXSample in2 = dst[len - i - 1];
1964
122M
        TXSample c   = exp[len + i];
1965
1966
122M
        tmp1 = in1 + in2;
1967
122M
        tmp2 = in1 - in2;
1968
122M
        tmp2 *= c;
1969
#ifdef TX_INT32
1970
        tmp2 = (tmp2 + 0x40000000) >> 31;
1971
#endif
1972
1973
122M
        dst[i]            = tmp1 + tmp2;
1974
122M
        dst[len - i - 1]  = tmp1 - tmp2;
1975
122M
    }
1976
318k
}
Unexecuted instantiation: tx_double.c:ff_tx_dctIII_double_c
tx_float.c:ff_tx_dctIII_float_c
Line
Count
Source
1937
318k
{
1938
318k
    TXSample *dst = _dst;
1939
318k
    TXSample *src = _src;
1940
318k
    const int len = s->len;
1941
318k
    const int len2 = len >> 1;
1942
318k
    const TXSample *exp = (void *)s->exp;
1943
#ifdef TX_INT32
1944
    int64_t  tmp1, tmp2 = src[len - 1];
1945
    tmp2 = (2*tmp2 + 0x40000000) >> 31;
1946
#else
1947
318k
    TXSample tmp1, tmp2 = 2*src[len - 1];
1948
318k
#endif
1949
1950
318k
    src[len] = tmp2;
1951
1952
122M
    for (int i = len - 2; i >= 2; i -= 2) {
1953
121M
        TXSample val1 = src[i - 0];
1954
121M
        TXSample val2 = src[i - 1] - src[i + 1];
1955
1956
121M
        CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1957
121M
    }
1958
1959
318k
    s->fn[0](&s->sub[0], dst, src, sizeof(float));
1960
1961
122M
    for (int i = 0; i < len2; i++) {
1962
122M
        TXSample in1 = dst[i];
1963
122M
        TXSample in2 = dst[len - i - 1];
1964
122M
        TXSample c   = exp[len + i];
1965
1966
122M
        tmp1 = in1 + in2;
1967
122M
        tmp2 = in1 - in2;
1968
122M
        tmp2 *= c;
1969
#ifdef TX_INT32
1970
        tmp2 = (tmp2 + 0x40000000) >> 31;
1971
#endif
1972
1973
122M
        dst[i]            = tmp1 + tmp2;
1974
122M
        dst[len - i - 1]  = tmp1 - tmp2;
1975
122M
    }
1976
318k
}
Unexecuted instantiation: tx_int32.c:ff_tx_dctIII_int32_c
1977
1978
static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
1979
    .name       = TX_NAME_STR("dctII"),
1980
    .function   = TX_NAME(ff_tx_dctII),
1981
    .type       = TX_TYPE(DCT),
1982
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
1983
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1984
    .factors    = { 2, TX_FACTOR_ANY },
1985
    .min_len    = 2,
1986
    .max_len    = TX_LEN_UNLIMITED,
1987
    .init       = TX_NAME(ff_tx_dct_init),
1988
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1989
    .prio       = FF_TX_PRIO_BASE,
1990
};
1991
1992
static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
1993
    .name       = TX_NAME_STR("dctIII"),
1994
    .function   = TX_NAME(ff_tx_dctIII),
1995
    .type       = TX_TYPE(DCT),
1996
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
1997
                  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1998
    .factors    = { 2, TX_FACTOR_ANY },
1999
    .min_len    = 2,
2000
    .max_len    = TX_LEN_UNLIMITED,
2001
    .init       = TX_NAME(ff_tx_dct_init),
2002
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
2003
    .prio       = FF_TX_PRIO_BASE,
2004
};
2005
2006
static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s,
2007
                                             const FFTXCodelet *cd,
2008
                                             uint64_t flags,
2009
                                             FFTXCodeletOptions *opts,
2010
                                             int len, int inv,
2011
                                             const void *scale)
2012
1.58k
{
2013
1.58k
    int ret;
2014
1.58k
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
2015
2016
1.58k
    if (inv) {
2017
0
        len *= 2;
2018
0
        s->len *= 2;
2019
0
        rsc *= 0.5;
2020
0
    }
2021
2022
    /* We want a half-complex RDFT */
2023
1.58k
    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
2024
1.58k
                                          AV_TX_REAL_TO_IMAGINARY;
2025
2026
1.58k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
2027
1.58k
                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
2028
1.58k
                                0, &rsc)))
2029
0
        return ret;
2030
2031
1.58k
    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
2032
1.58k
    if (!s->tmp)
2033
0
        return AVERROR(ENOMEM);
2034
2035
1.58k
    return 0;
2036
1.58k
}
Unexecuted instantiation: tx_double.c:ff_tx_dcstI_init_double_c
tx_float.c:ff_tx_dcstI_init_float_c
Line
Count
Source
2012
1.58k
{
2013
1.58k
    int ret;
2014
1.58k
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
2015
2016
1.58k
    if (inv) {
2017
0
        len *= 2;
2018
0
        s->len *= 2;
2019
0
        rsc *= 0.5;
2020
0
    }
2021
2022
    /* We want a half-complex RDFT */
2023
1.58k
    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
2024
1.58k
                                          AV_TX_REAL_TO_IMAGINARY;
2025
2026
1.58k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
2027
1.58k
                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
2028
1.58k
                                0, &rsc)))
2029
0
        return ret;
2030
2031
1.58k
    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
2032
1.58k
    if (!s->tmp)
2033
0
        return AVERROR(ENOMEM);
2034
2035
1.58k
    return 0;
2036
1.58k
}
Unexecuted instantiation: tx_int32.c:ff_tx_dcstI_init_int32_c
2037
2038
static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
2039
                                void *_src, ptrdiff_t stride)
2040
757k
{
2041
757k
    TXSample *dst = _dst;
2042
757k
    TXSample *src = _src;
2043
757k
    const int len = s->len - 1;
2044
757k
    TXSample *tmp = (TXSample *)s->tmp;
2045
2046
757k
    stride /= sizeof(TXSample);
2047
2048
48.4M
    for (int i = 0; i < len; i++)
2049
47.7M
        tmp[i] = tmp[2*len - i] = src[i * stride];
2050
2051
757k
    tmp[len] = src[len * stride]; /* Middle */
2052
2053
757k
    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2054
757k
}
Unexecuted instantiation: tx_double.c:ff_tx_dctI_double_c
tx_float.c:ff_tx_dctI_float_c
Line
Count
Source
2040
757k
{
2041
757k
    TXSample *dst = _dst;
2042
757k
    TXSample *src = _src;
2043
757k
    const int len = s->len - 1;
2044
757k
    TXSample *tmp = (TXSample *)s->tmp;
2045
2046
757k
    stride /= sizeof(TXSample);
2047
2048
48.4M
    for (int i = 0; i < len; i++)
2049
47.7M
        tmp[i] = tmp[2*len - i] = src[i * stride];
2050
2051
757k
    tmp[len] = src[len * stride]; /* Middle */
2052
2053
757k
    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2054
757k
}
Unexecuted instantiation: tx_int32.c:ff_tx_dctI_int32_c
2055
2056
static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
2057
                                void *_src, ptrdiff_t stride)
2058
757k
{
2059
757k
    TXSample *dst = _dst;
2060
757k
    TXSample *src = _src;
2061
757k
    const int len = s->len + 1;
2062
757k
    TXSample *tmp = (void *)s->tmp;
2063
2064
757k
    stride /= sizeof(TXSample);
2065
2066
757k
    tmp[0] = 0;
2067
2068
49.2M
    for (int i = 1; i < len; i++) {
2069
48.4M
        TXSample a = src[(i - 1) * stride];
2070
48.4M
        tmp[i] = -a;
2071
48.4M
        tmp[2*len - i] = a;
2072
48.4M
    }
2073
2074
757k
    tmp[len] = 0; /* i == n, Nyquist */
2075
2076
757k
    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2077
757k
}
Unexecuted instantiation: tx_double.c:ff_tx_dstI_double_c
tx_float.c:ff_tx_dstI_float_c
Line
Count
Source
2058
757k
{
2059
757k
    TXSample *dst = _dst;
2060
757k
    TXSample *src = _src;
2061
757k
    const int len = s->len + 1;
2062
757k
    TXSample *tmp = (void *)s->tmp;
2063
2064
757k
    stride /= sizeof(TXSample);
2065
2066
757k
    tmp[0] = 0;
2067
2068
49.2M
    for (int i = 1; i < len; i++) {
2069
48.4M
        TXSample a = src[(i - 1) * stride];
2070
48.4M
        tmp[i] = -a;
2071
48.4M
        tmp[2*len - i] = a;
2072
48.4M
    }
2073
2074
757k
    tmp[len] = 0; /* i == n, Nyquist */
2075
2076
757k
    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2077
757k
}
Unexecuted instantiation: tx_int32.c:ff_tx_dstI_int32_c
2078
2079
static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
2080
    .name       = TX_NAME_STR("dctI"),
2081
    .function   = TX_NAME(ff_tx_dctI),
2082
    .type       = TX_TYPE(DCT_I),
2083
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
2084
    .factors    = { 2, TX_FACTOR_ANY },
2085
    .nb_factors = 2,
2086
    .min_len    = 2,
2087
    .max_len    = TX_LEN_UNLIMITED,
2088
    .init       = TX_NAME(ff_tx_dcstI_init),
2089
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
2090
    .prio       = FF_TX_PRIO_BASE,
2091
};
2092
2093
static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
2094
    .name       = TX_NAME_STR("dstI"),
2095
    .function   = TX_NAME(ff_tx_dstI),
2096
    .type       = TX_TYPE(DST_I),
2097
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
2098
    .factors    = { 2, TX_FACTOR_ANY },
2099
    .nb_factors = 2,
2100
    .min_len    = 2,
2101
    .max_len    = TX_LEN_UNLIMITED,
2102
    .init       = TX_NAME(ff_tx_dcstI_init),
2103
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
2104
    .prio       = FF_TX_PRIO_BASE,
2105
};
2106
2107
int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
2108
981k
{
2109
981k
    int off = 0;
2110
981k
    int len4 = s->len >> 1;
2111
981k
    double scale = s->scale_d;
2112
981k
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2113
981k
    size_t alloc = pre_tab ? 2*len4 : len4;
2114
2115
981k
    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2116
0
        return AVERROR(ENOMEM);
2117
2118
981k
    scale = sqrt(fabs(scale));
2119
2120
981k
    if (pre_tab)
2121
961k
        off = len4;
2122
2123
212M
    for (int i = 0; i < len4; i++) {
2124
211M
        const double alpha = M_PI_2 * (i + theta) / len4;
2125
211M
        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2126
211M
                                       RESCALE(sin(alpha) * scale) };
2127
211M
    }
2128
2129
981k
    if (pre_tab)
2130
202M
        for (int i = 0; i < len4; i++)
2131
201M
            s->exp[i] = s->exp[len4 + pre_tab[i]];
2132
2133
981k
    return 0;
2134
981k
}
Unexecuted instantiation: ff_tx_mdct_gen_exp_double
ff_tx_mdct_gen_exp_float
Line
Count
Source
2108
835k
{
2109
835k
    int off = 0;
2110
835k
    int len4 = s->len >> 1;
2111
835k
    double scale = s->scale_d;
2112
835k
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2113
835k
    size_t alloc = pre_tab ? 2*len4 : len4;
2114
2115
835k
    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2116
0
        return AVERROR(ENOMEM);
2117
2118
835k
    scale = sqrt(fabs(scale));
2119
2120
835k
    if (pre_tab)
2121
820k
        off = len4;
2122
2123
193M
    for (int i = 0; i < len4; i++) {
2124
192M
        const double alpha = M_PI_2 * (i + theta) / len4;
2125
192M
        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2126
192M
                                       RESCALE(sin(alpha) * scale) };
2127
192M
    }
2128
2129
835k
    if (pre_tab)
2130
185M
        for (int i = 0; i < len4; i++)
2131
185M
            s->exp[i] = s->exp[len4 + pre_tab[i]];
2132
2133
835k
    return 0;
2134
835k
}
ff_tx_mdct_gen_exp_int32
Line
Count
Source
2108
146k
{
2109
146k
    int off = 0;
2110
146k
    int len4 = s->len >> 1;
2111
146k
    double scale = s->scale_d;
2112
146k
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2113
146k
    size_t alloc = pre_tab ? 2*len4 : len4;
2114
2115
146k
    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2116
0
        return AVERROR(ENOMEM);
2117
2118
146k
    scale = sqrt(fabs(scale));
2119
2120
146k
    if (pre_tab)
2121
140k
        off = len4;
2122
2123
19.2M
    for (int i = 0; i < len4; i++) {
2124
19.1M
        const double alpha = M_PI_2 * (i + theta) / len4;
2125
19.1M
        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2126
19.1M
                                       RESCALE(sin(alpha) * scale) };
2127
19.1M
    }
2128
2129
146k
    if (pre_tab)
2130
16.1M
        for (int i = 0; i < len4; i++)
2131
16.0M
            s->exp[i] = s->exp[len4 + pre_tab[i]];
2132
2133
146k
    return 0;
2134
146k
}
2135
2136
const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
2137
    /* Split-Radix codelets */
2138
    &TX_NAME(ff_tx_fft2_ns_def),
2139
    &TX_NAME(ff_tx_fft4_ns_def),
2140
    &TX_NAME(ff_tx_fft8_ns_def),
2141
    &TX_NAME(ff_tx_fft16_ns_def),
2142
    &TX_NAME(ff_tx_fft32_ns_def),
2143
    &TX_NAME(ff_tx_fft64_ns_def),
2144
    &TX_NAME(ff_tx_fft128_ns_def),
2145
    &TX_NAME(ff_tx_fft256_ns_def),
2146
    &TX_NAME(ff_tx_fft512_ns_def),
2147
    &TX_NAME(ff_tx_fft1024_ns_def),
2148
    &TX_NAME(ff_tx_fft2048_ns_def),
2149
    &TX_NAME(ff_tx_fft4096_ns_def),
2150
    &TX_NAME(ff_tx_fft8192_ns_def),
2151
    &TX_NAME(ff_tx_fft16384_ns_def),
2152
    &TX_NAME(ff_tx_fft32768_ns_def),
2153
    &TX_NAME(ff_tx_fft65536_ns_def),
2154
    &TX_NAME(ff_tx_fft131072_ns_def),
2155
2156
    /* Prime factor codelets */
2157
    &TX_NAME(ff_tx_fft3_ns_def),
2158
    &TX_NAME(ff_tx_fft5_ns_def),
2159
    &TX_NAME(ff_tx_fft7_ns_def),
2160
    &TX_NAME(ff_tx_fft9_ns_def),
2161
    &TX_NAME(ff_tx_fft15_ns_def),
2162
2163
    /* We get these for free */
2164
    &TX_NAME(ff_tx_fft3_fwd_def),
2165
    &TX_NAME(ff_tx_fft5_fwd_def),
2166
    &TX_NAME(ff_tx_fft7_fwd_def),
2167
    &TX_NAME(ff_tx_fft9_fwd_def),
2168
2169
    /* Standalone transforms */
2170
    &TX_NAME(ff_tx_fft_def),
2171
    &TX_NAME(ff_tx_fft_inplace_def),
2172
    &TX_NAME(ff_tx_fft_inplace_small_def),
2173
    &TX_NAME(ff_tx_fft_pfa_def),
2174
    &TX_NAME(ff_tx_fft_pfa_ns_def),
2175
    &TX_NAME(ff_tx_fft_naive_def),
2176
    &TX_NAME(ff_tx_fft_naive_small_def),
2177
    &TX_NAME(ff_tx_mdct_fwd_def),
2178
    &TX_NAME(ff_tx_mdct_inv_def),
2179
    &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
2180
    &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
2181
    &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
2182
    &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
2183
    &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
2184
    &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
2185
    &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
2186
    &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
2187
    &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
2188
    &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
2189
    &TX_NAME(ff_tx_mdct_naive_fwd_def),
2190
    &TX_NAME(ff_tx_mdct_naive_inv_def),
2191
    &TX_NAME(ff_tx_mdct_inv_full_def),
2192
    &TX_NAME(ff_tx_rdft_r2c_def),
2193
    &TX_NAME(ff_tx_rdft_r2r_def),
2194
    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
2195
    &TX_NAME(ff_tx_rdft_r2i_def),
2196
    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
2197
    &TX_NAME(ff_tx_rdft_c2r_def),
2198
    &TX_NAME(ff_tx_dctII_def),
2199
    &TX_NAME(ff_tx_dctIII_def),
2200
    &TX_NAME(ff_tx_dctI_def),
2201
    &TX_NAME(ff_tx_dstI_def),
2202
2203
    NULL,
2204
};