Coverage Report

Created: 2026-05-23 07:06

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ffmpeg/libavutil/tx_template.c
Line
Count
Source
1
/*
2
 * Copyright (c) Lynne
3
 *
4
 * Power of two FFT:
5
 * Copyright (c) Lynne
6
 * Copyright (c) 2008 Loren Merritt
7
 * Copyright (c) 2002 Fabrice Bellard
8
 * Partly based on libdjbfft by D. J. Bernstein
9
 *
10
 * This file is part of FFmpeg.
11
 *
12
 * FFmpeg is free software; you can redistribute it and/or
13
 * modify it under the terms of the GNU Lesser General Public
14
 * License as published by the Free Software Foundation; either
15
 * version 2.1 of the License, or (at your option) any later version.
16
 *
17
 * FFmpeg is distributed in the hope that it will be useful,
18
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20
 * Lesser General Public License for more details.
21
 *
22
 * You should have received a copy of the GNU Lesser General Public
23
 * License along with FFmpeg; if not, write to the Free Software
24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25
 */
26
27
#include "mem.h"
28
29
#define TABLE_DEF(name, size) \
30
    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
31
32
#define SR_POW2_TABLES \
33
    SR_TABLE(8)        \
34
    SR_TABLE(16)       \
35
    SR_TABLE(32)       \
36
    SR_TABLE(64)       \
37
    SR_TABLE(128)      \
38
    SR_TABLE(256)      \
39
    SR_TABLE(512)      \
40
    SR_TABLE(1024)     \
41
    SR_TABLE(2048)     \
42
    SR_TABLE(4096)     \
43
    SR_TABLE(8192)     \
44
    SR_TABLE(16384)    \
45
    SR_TABLE(32768)    \
46
    SR_TABLE(65536)    \
47
    SR_TABLE(131072)   \
48
49
#define SR_TABLE(len) \
50
    TABLE_DEF(len, len/4 + 1);
51
/* Power of two tables */
52
SR_POW2_TABLES
53
#undef SR_TABLE
54
55
/* Other factors' tables */
56
TABLE_DEF(53, 12);
57
TABLE_DEF( 7,  6);
58
TABLE_DEF( 9,  8);
59
60
typedef struct FFTabInitData {
61
    void (*func)(void);
62
    int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
63
} FFTabInitData;
64
65
#define SR_TABLE(len)                                              \
66
221
static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void)            \
67
221
{                                                                  \
68
221
    double freq = 2*M_PI/len;                                      \
69
221
    TXSample *tab = TX_TAB(ff_tx_tab_ ##len);                      \
70
221
                                                                   \
71
13.6k
    for (int i = 0; i < len/4; i++)                                \
72
13.4k
        *tab++ = RESCALE(cos(i*freq));                             \
73
221
                                                                   \
74
221
    *tab = 0;                                                      \
75
221
}
76
853
SR_POW2_TABLES
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_8_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_16_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_32_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_64_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_128_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_256_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_512_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_1024_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_2048_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_4096_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_8192_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_16384_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_32768_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_65536_double
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_131072_double
tx_float.c:ff_tx_init_tab_8_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_16_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_32_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_64_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_128_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_256_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_512_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_1024_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_2048_float
Line
Count
Source
76
SR_POW2_TABLES
tx_float.c:ff_tx_init_tab_4096_float
Line
Count
Source
76
SR_POW2_TABLES
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_8192_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_16384_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_32768_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_65536_float
Unexecuted instantiation: tx_float.c:ff_tx_init_tab_131072_float
tx_int32.c:ff_tx_init_tab_8_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_16_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_32_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_64_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_128_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_256_int32
Line
Count
Source
76
SR_POW2_TABLES
tx_int32.c:ff_tx_init_tab_512_int32
Line
Count
Source
76
SR_POW2_TABLES
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_1024_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_2048_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_4096_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_8192_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_16384_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_32768_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_65536_int32
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_131072_int32
77
853
#undef SR_TABLE
78
853
79
853
static void (*const sr_tabs_init_funcs[])(void) = {
80
853
#define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
81
853
    SR_POW2_TABLES
82
853
#undef SR_TABLE
83
853
};
84
853
85
853
static AVOnce sr_tabs_init_once[] = {
86
853
#define SR_TABLE(len) AV_ONCE_INIT,
87
853
    SR_POW2_TABLES
88
853
#undef SR_TABLE
89
853
};
90
853
91
853
static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
92
853
{
93
    /* 5pt, doubled to eliminate AVX lane shuffles */
94
7
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
95
7
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
96
7
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
97
7
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
98
7
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
99
7
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
100
7
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
101
7
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
102
103
    /* 3pt */
104
7
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
105
7
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
106
7
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
107
7
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
108
7
}
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_53_double
tx_float.c:ff_tx_init_tab_53_float
Line
Count
Source
92
6
{
93
    /* 5pt, doubled to eliminate AVX lane shuffles */
94
6
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
95
6
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
96
6
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
97
6
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
98
6
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
99
6
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
100
6
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
101
6
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
102
103
    /* 3pt */
104
6
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
105
6
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
106
6
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
107
6
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
108
6
}
tx_int32.c:ff_tx_init_tab_53_int32
Line
Count
Source
92
1
{
93
    /* 5pt, doubled to eliminate AVX lane shuffles */
94
1
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
95
1
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
96
1
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
97
1
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
98
1
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
99
1
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
100
1
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
101
1
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
102
103
    /* 3pt */
104
1
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
105
1
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
106
1
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
107
1
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
108
1
}
109
110
static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
111
1
{
112
1
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
113
1
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
114
1
    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
115
1
    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
116
1
    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
117
1
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
118
1
}
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_7_double
tx_float.c:ff_tx_init_tab_7_float
Line
Count
Source
111
1
{
112
1
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
113
1
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
114
1
    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
115
1
    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
116
1
    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
117
1
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
118
1
}
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_7_int32
119
120
static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
121
1
{
122
1
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
123
1
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
124
1
    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
125
1
    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
126
1
    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
127
1
    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
128
1
    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
129
1
    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
130
1
}
Unexecuted instantiation: tx_double.c:ff_tx_init_tab_9_double
tx_float.c:ff_tx_init_tab_9_float
Line
Count
Source
121
1
{
122
1
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
123
1
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
124
1
    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
125
1
    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
126
1
    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
127
1
    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
128
1
    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
129
1
    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
130
1
}
Unexecuted instantiation: tx_int32.c:ff_tx_init_tab_9_int32
131
132
static const FFTabInitData nptwo_tabs_init_data[] = {
133
    { TX_TAB(ff_tx_init_tab_53),      { 15, 5, 3 } },
134
    { TX_TAB(ff_tx_init_tab_9),       {  9 }       },
135
    { TX_TAB(ff_tx_init_tab_7),       {  7 }       },
136
};
137
138
static AVOnce nptwo_tabs_init_once[] = {
139
    AV_ONCE_INIT,
140
    AV_ONCE_INIT,
141
    AV_ONCE_INIT,
142
};
143
144
av_cold void TX_TAB(ff_tx_init_tabs)(int len)
145
1.51M
{
146
1.51M
    int factor_2 = ff_ctz(len);
147
1.51M
    if (factor_2) {
148
1.08M
        int idx = factor_2 - 3;
149
4.63M
        for (int i = 0; i <= idx; i++)
150
3.54M
            ff_thread_once(&sr_tabs_init_once[i],
151
1.08M
                            sr_tabs_init_funcs[i]);
152
1.08M
        len >>= factor_2;
153
1.08M
    }
154
155
1.93M
    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
156
1.93M
        int f, f_idx = 0;
157
158
1.93M
        if (len <= 1)
159
1.51M
            return;
160
161
430k
        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
162
428k
            if (f % len)
163
5.46k
                continue;
164
165
422k
            ff_thread_once(&nptwo_tabs_init_once[i],
166
422k
                            nptwo_tabs_init_data[i].func);
167
422k
            len /= f;
168
422k
            break;
169
428k
        }
170
425k
    }
171
1.51M
}
Unexecuted instantiation: ff_tx_init_tabs_double
ff_tx_init_tabs_float
Line
Count
Source
145
1.35M
{
146
1.35M
    int factor_2 = ff_ctz(len);
147
1.35M
    if (factor_2) {
148
958k
        int idx = factor_2 - 3;
149
4.04M
        for (int i = 0; i <= idx; i++)
150
3.08M
            ff_thread_once(&sr_tabs_init_once[i],
151
958k
                            sr_tabs_init_funcs[i]);
152
958k
        len >>= factor_2;
153
958k
    }
154
155
1.75M
    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
156
1.75M
        int f, f_idx = 0;
157
158
1.75M
        if (len <= 1)
159
1.35M
            return;
160
161
403k
        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
162
401k
            if (f % len)
163
5.46k
                continue;
164
165
395k
            ff_thread_once(&nptwo_tabs_init_once[i],
166
395k
                            nptwo_tabs_init_data[i].func);
167
395k
            len /= f;
168
395k
            break;
169
401k
        }
170
398k
    }
171
1.35M
}
ff_tx_init_tabs_int32
Line
Count
Source
145
156k
{
146
156k
    int factor_2 = ff_ctz(len);
147
156k
    if (factor_2) {
148
129k
        int idx = factor_2 - 3;
149
587k
        for (int i = 0; i <= idx; i++)
150
457k
            ff_thread_once(&sr_tabs_init_once[i],
151
129k
                            sr_tabs_init_funcs[i]);
152
129k
        len >>= factor_2;
153
129k
    }
154
155
183k
    for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
156
183k
        int f, f_idx = 0;
157
158
183k
        if (len <= 1)
159
156k
            return;
160
161
27.0k
        while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
162
27.0k
            if (f % len)
163
0
                continue;
164
165
27.0k
            ff_thread_once(&nptwo_tabs_init_once[i],
166
27.0k
                            nptwo_tabs_init_data[i].func);
167
27.0k
            len /= f;
168
27.0k
            break;
169
27.0k
        }
170
27.0k
    }
171
156k
}
172
173
static av_always_inline void fft3(TXComplex *out, TXComplex *in,
174
                                  ptrdiff_t stride)
175
51.6M
{
176
51.6M
    TXComplex tmp[3];
177
51.6M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
178
#ifdef TX_INT32
179
    int64_t mtmp[4];
180
#endif
181
182
51.6M
    tmp[0] = in[0];
183
51.6M
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
184
51.6M
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
185
186
#ifdef TX_INT32
187
    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
188
    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
189
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
190
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
191
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
192
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
193
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
194
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
195
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
196
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
197
#else
198
    out[0*stride].re = tmp[0].re + tmp[2].re;
199
    out[0*stride].im = tmp[0].im + tmp[2].im;
200
    tmp[1].re = tab[ 8] * tmp[1].re;
201
    tmp[1].im = tab[ 9] * tmp[1].im;
202
    tmp[2].re = tab[10] * tmp[2].re;
203
    tmp[2].im = tab[10] * tmp[2].im;
204
    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
205
    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
206
    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
207
    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
208
#endif
209
51.6M
}
Unexecuted instantiation: tx_double.c:fft3
tx_float.c:fft3
Line
Count
Source
175
50.5M
{
176
50.5M
    TXComplex tmp[3];
177
50.5M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
178
#ifdef TX_INT32
179
    int64_t mtmp[4];
180
#endif
181
182
50.5M
    tmp[0] = in[0];
183
50.5M
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
184
50.5M
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
185
186
#ifdef TX_INT32
187
    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
188
    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
189
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
190
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
191
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
192
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
193
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
194
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
195
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
196
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
197
#else
198
50.5M
    out[0*stride].re = tmp[0].re + tmp[2].re;
199
50.5M
    out[0*stride].im = tmp[0].im + tmp[2].im;
200
50.5M
    tmp[1].re = tab[ 8] * tmp[1].re;
201
50.5M
    tmp[1].im = tab[ 9] * tmp[1].im;
202
50.5M
    tmp[2].re = tab[10] * tmp[2].re;
203
50.5M
    tmp[2].im = tab[10] * tmp[2].im;
204
50.5M
    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
205
50.5M
    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
206
50.5M
    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
207
50.5M
    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
208
50.5M
#endif
209
50.5M
}
tx_int32.c:fft3
Line
Count
Source
175
1.08M
{
176
1.08M
    TXComplex tmp[3];
177
1.08M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
178
1.08M
#ifdef TX_INT32
179
1.08M
    int64_t mtmp[4];
180
1.08M
#endif
181
182
1.08M
    tmp[0] = in[0];
183
1.08M
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
184
1.08M
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
185
186
1.08M
#ifdef TX_INT32
187
1.08M
    out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
188
1.08M
    out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
189
1.08M
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
190
1.08M
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
191
1.08M
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
192
1.08M
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
193
1.08M
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
194
1.08M
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
195
1.08M
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
196
1.08M
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
197
#else
198
    out[0*stride].re = tmp[0].re + tmp[2].re;
199
    out[0*stride].im = tmp[0].im + tmp[2].im;
200
    tmp[1].re = tab[ 8] * tmp[1].re;
201
    tmp[1].im = tab[ 9] * tmp[1].im;
202
    tmp[2].re = tab[10] * tmp[2].re;
203
    tmp[2].im = tab[10] * tmp[2].im;
204
    out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
205
    out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
206
    out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
207
    out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
208
#endif
209
1.08M
}
210
211
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                         \
212
static av_always_inline void NAME(TXComplex *out, TXComplex *in,    \
213
60.4M
                                  ptrdiff_t stride)                 \
214
60.4M
{                                                                   \
215
60.4M
    TXComplex dc, z0[4], t[6];                                      \
216
60.4M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
60.4M
                                                                    \
218
60.4M
    dc = in[0];                                                     \
219
60.4M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
60.4M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
60.4M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
60.4M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
60.4M
                                                                    \
224
60.4M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
60.4M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
60.4M
                                                                    \
227
60.4M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
60.4M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
60.4M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
60.4M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
60.4M
                                                                    \
232
60.4M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
60.4M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
60.4M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
60.4M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
60.4M
                                                                    \
237
60.4M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
60.4M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
60.4M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
60.4M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
60.4M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
60.4M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
60.4M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
60.4M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
60.4M
}
Unexecuted instantiation: tx_double.c:fft5
Unexecuted instantiation: tx_double.c:fft5_m1
Unexecuted instantiation: tx_double.c:fft5_m2
Unexecuted instantiation: tx_double.c:fft5_m3
tx_float.c:fft5
Line
Count
Source
213
46.9M
                                  ptrdiff_t stride)                 \
214
46.9M
{                                                                   \
215
46.9M
    TXComplex dc, z0[4], t[6];                                      \
216
46.9M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
46.9M
                                                                    \
218
46.9M
    dc = in[0];                                                     \
219
46.9M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
46.9M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
46.9M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
46.9M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
46.9M
                                                                    \
224
46.9M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
46.9M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
46.9M
                                                                    \
227
46.9M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
46.9M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
46.9M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
46.9M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
46.9M
                                                                    \
232
46.9M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
46.9M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
46.9M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
46.9M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
46.9M
                                                                    \
237
46.9M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
46.9M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
46.9M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
46.9M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
46.9M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
46.9M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
46.9M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
46.9M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
46.9M
}
tx_float.c:fft5_m1
Line
Count
Source
213
4.30M
                                  ptrdiff_t stride)                 \
214
4.30M
{                                                                   \
215
4.30M
    TXComplex dc, z0[4], t[6];                                      \
216
4.30M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
4.30M
                                                                    \
218
4.30M
    dc = in[0];                                                     \
219
4.30M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
4.30M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
4.30M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
4.30M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
4.30M
                                                                    \
224
4.30M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
4.30M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
4.30M
                                                                    \
227
4.30M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
4.30M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
4.30M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
4.30M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
4.30M
                                                                    \
232
4.30M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
4.30M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
4.30M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
4.30M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
4.30M
                                                                    \
237
4.30M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
4.30M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
4.30M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
4.30M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
4.30M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
4.30M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
4.30M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
4.30M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
4.30M
}
tx_float.c:fft5_m2
Line
Count
Source
213
4.30M
                                  ptrdiff_t stride)                 \
214
4.30M
{                                                                   \
215
4.30M
    TXComplex dc, z0[4], t[6];                                      \
216
4.30M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
4.30M
                                                                    \
218
4.30M
    dc = in[0];                                                     \
219
4.30M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
4.30M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
4.30M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
4.30M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
4.30M
                                                                    \
224
4.30M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
4.30M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
4.30M
                                                                    \
227
4.30M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
4.30M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
4.30M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
4.30M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
4.30M
                                                                    \
232
4.30M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
4.30M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
4.30M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
4.30M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
4.30M
                                                                    \
237
4.30M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
4.30M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
4.30M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
4.30M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
4.30M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
4.30M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
4.30M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
4.30M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
4.30M
}
tx_float.c:fft5_m3
Line
Count
Source
213
4.30M
                                  ptrdiff_t stride)                 \
214
4.30M
{                                                                   \
215
4.30M
    TXComplex dc, z0[4], t[6];                                      \
216
4.30M
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
4.30M
                                                                    \
218
4.30M
    dc = in[0];                                                     \
219
4.30M
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
4.30M
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
4.30M
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
4.30M
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
4.30M
                                                                    \
224
4.30M
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
4.30M
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
4.30M
                                                                    \
227
4.30M
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
4.30M
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
4.30M
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
4.30M
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
4.30M
                                                                    \
232
4.30M
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
4.30M
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
4.30M
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
4.30M
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
4.30M
                                                                    \
237
4.30M
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
4.30M
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
4.30M
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
4.30M
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
4.30M
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
4.30M
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
4.30M
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
4.30M
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
4.30M
}
Unexecuted instantiation: tx_int32.c:fft5
tx_int32.c:fft5_m1
Line
Count
Source
213
216k
                                  ptrdiff_t stride)                 \
214
216k
{                                                                   \
215
216k
    TXComplex dc, z0[4], t[6];                                      \
216
216k
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
216k
                                                                    \
218
216k
    dc = in[0];                                                     \
219
216k
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
216k
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
216k
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
216k
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
216k
                                                                    \
224
216k
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
216k
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
216k
                                                                    \
227
216k
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
216k
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
216k
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
216k
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
216k
                                                                    \
232
216k
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
216k
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
216k
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
216k
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
216k
                                                                    \
237
216k
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
216k
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
216k
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
216k
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
216k
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
216k
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
216k
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
216k
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
216k
}
tx_int32.c:fft5_m2
Line
Count
Source
213
216k
                                  ptrdiff_t stride)                 \
214
216k
{                                                                   \
215
216k
    TXComplex dc, z0[4], t[6];                                      \
216
216k
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
216k
                                                                    \
218
216k
    dc = in[0];                                                     \
219
216k
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
216k
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
216k
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
216k
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
216k
                                                                    \
224
216k
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
216k
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
216k
                                                                    \
227
216k
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
216k
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
216k
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
216k
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
216k
                                                                    \
232
216k
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
216k
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
216k
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
216k
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
216k
                                                                    \
237
216k
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
216k
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
216k
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
216k
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
216k
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
216k
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
216k
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
216k
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
216k
}
tx_int32.c:fft5_m3
Line
Count
Source
213
216k
                                  ptrdiff_t stride)                 \
214
216k
{                                                                   \
215
216k
    TXComplex dc, z0[4], t[6];                                      \
216
216k
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
217
216k
                                                                    \
218
216k
    dc = in[0];                                                     \
219
216k
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
220
216k
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
221
216k
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
222
216k
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
223
216k
                                                                    \
224
216k
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re;        \
225
216k
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im;        \
226
216k
                                                                    \
227
216k
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
228
216k
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
229
216k
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
230
216k
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
231
216k
                                                                    \
232
216k
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
233
216k
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
234
216k
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
235
216k
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
236
216k
                                                                    \
237
216k
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re;                 \
238
216k
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im;                 \
239
216k
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re;                 \
240
216k
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im;                 \
241
216k
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re;                 \
242
216k
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im;                 \
243
216k
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re;                 \
244
216k
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im;                 \
245
216k
}
246
247
/* Instantiate the 5-point kernel four times. The five numeric arguments are
 * the D0..D4 multipliers the macro body applies to `stride` when storing its
 * five results (out[Dk*stride]). The plain fft5 stores to slots 0..4 in
 * order; fft5_m1/m2/m3 are the variants fft15 calls on one shared output
 * buffer, and together their index sets cover each of the slots 0..14
 * exactly once. */
DECL_FFT5(fft5,     0,  1,  2,  3,  4)
248
DECL_FFT5(fft5_m1,  0,  6, 12,  3,  9)
249
DECL_FFT5(fft5_m2, 10,  1,  7, 13,  4)
250
DECL_FFT5(fft5_m3,  5, 11,  2,  8, 14)
251
252
/**
 * 7-point transform kernel.
 *
 * Reads the seven consecutive elements in[0..6] and stores seven results to
 * out[k*stride], k = 0..6.
 *
 * @param out    destination; element k is written at out[k*stride]
 * @param in     source; elements 0..6 are read
 * @param stride distance, in TXComplex elements, between consecutive outputs
 *
 * The plain accumulations (the DC sum and the final "dc + ..." stores) go
 * through a TXUSample cast, the same way the DECL_FFT5 kernel does, so that
 * integer builds do not evaluate them as signed additions that may overflow.
 * NOTE(review): TXUSample is declared outside this chunk; it is presumably
 * the unsigned counterpart of TXSample in the TX_INT32 build and the sample
 * type itself in floating-point builds - confirm in the private header.
 */
static av_always_inline void fft7(TXComplex *out, TXComplex *in,
                                  ptrdiff_t stride)
{
    TXComplex dc, t[6], z[3];
    /* The sample table is viewed as complex pairs; entries 0..2 are used. */
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
#ifdef TX_INT32
    int64_t mtmp[12];
#endif

    /* Pair up the mirrored inputs (1,6), (2,5), (3,4). BF is defined outside
     * this chunk; presumably it yields the difference in its first output
     * and the sum in its second - TODO confirm. */
    dc = in[0];
    BF(t[1].re, t[0].re, in[1].re, in[6].re);
    BF(t[1].im, t[0].im, in[1].im, in[6].im);
    BF(t[3].re, t[2].re, in[2].re, in[5].re);
    BF(t[3].im, t[2].im, in[2].im, in[5].im);
    BF(t[5].re, t[4].re, in[3].re, in[4].re);
    BF(t[5].im, t[4].im, in[3].im, in[4].im);

    /* Output 0 is dc plus the three even-indexed temporaries. */
    out[0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re + t[4].re;
    out[0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im + t[4].im;

#ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
    /* Products are accumulated in 64 bits; "+ 0x40000000 >> 31" then adds
     * half of 2^31 and shifts right by 31, i.e. rounds while scaling the
     * accumulator back down. */
    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;

    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;

    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);

    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
#else
    /* z[] combines the even-indexed temporaries with the .re table values. */
    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;

    /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
     * multiplying the sum of all with the average of the twiddles */

    /* The even-indexed t[] slots are reused here to hold the odd-indexed
     * temporaries combined with the .im table values; all reads on the
     * right-hand sides are of odd-indexed slots, so nothing is clobbered
     * before it is consumed. */
    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
#endif

    /* Final butterflies between the z[] and t[] halves; each call also
     * overwrites the z[] operand it reads. */
    BF(t[1].re, z[0].re, z[0].re, t[4].re);
    BF(t[3].re, z[1].re, z[1].re, t[2].re);
    BF(t[5].re, z[2].re, z[2].re, t[0].re);
    BF(t[1].im, z[0].im, z[0].im, t[0].im);
    BF(t[3].im, z[1].im, z[1].im, t[2].im);
    BF(t[5].im, z[2].im, z[2].im, t[4].im);

    /* Outputs 1..6: dc plus one butterfly result each. */
    out[1*stride].re = dc.re + (TXUSample)z[0].re;
    out[1*stride].im = dc.im + (TXUSample)t[1].im;
    out[2*stride].re = dc.re + (TXUSample)t[3].re;
    out[2*stride].im = dc.im + (TXUSample)z[1].im;
    out[3*stride].re = dc.re + (TXUSample)z[2].re;
    out[3*stride].im = dc.im + (TXUSample)t[5].im;
    out[4*stride].re = dc.re + (TXUSample)t[5].re;
    out[4*stride].im = dc.im + (TXUSample)z[2].im;
    out[5*stride].re = dc.re + (TXUSample)z[1].re;
    out[5*stride].im = dc.im + (TXUSample)t[3].im;
    out[6*stride].re = dc.re + (TXUSample)t[1].re;
    out[6*stride].im = dc.im + (TXUSample)z[0].im;
}
Unexecuted instantiation: tx_double.c:fft7
tx_float.c:fft7
Line
Count
Source
254
14.5M
{
255
14.5M
    TXComplex dc, t[6], z[3];
256
14.5M
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
257
#ifdef TX_INT32
258
    int64_t mtmp[12];
259
#endif
260
261
14.5M
    dc = in[0];
262
14.5M
    BF(t[1].re, t[0].re, in[1].re, in[6].re);
263
14.5M
    BF(t[1].im, t[0].im, in[1].im, in[6].im);
264
14.5M
    BF(t[3].re, t[2].re, in[2].re, in[5].re);
265
14.5M
    BF(t[3].im, t[2].im, in[2].im, in[5].im);
266
14.5M
    BF(t[5].re, t[4].re, in[3].re, in[4].re);
267
14.5M
    BF(t[5].im, t[4].im, in[3].im, in[4].im);
268
269
14.5M
    out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
270
14.5M
    out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
271
272
#ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
273
    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
274
    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
275
    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
276
    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
277
    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
278
    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
279
280
    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
281
    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
282
    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
283
    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
284
    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
285
    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
286
287
    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
288
    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
289
    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
290
    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
291
    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
292
    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
293
294
    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
295
    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
296
    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
297
    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
298
    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
299
    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
300
#else
301
14.5M
    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
302
14.5M
    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
303
14.5M
    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
304
14.5M
    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
305
14.5M
    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
306
14.5M
    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
307
308
    /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
309
     * multiplying the sum of all with the average of the twiddles */
310
311
14.5M
    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
312
14.5M
    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
313
14.5M
    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
314
14.5M
    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
315
14.5M
    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
316
14.5M
    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
317
14.5M
#endif
318
319
14.5M
    BF(t[1].re, z[0].re, z[0].re, t[4].re);
320
14.5M
    BF(t[3].re, z[1].re, z[1].re, t[2].re);
321
14.5M
    BF(t[5].re, z[2].re, z[2].re, t[0].re);
322
14.5M
    BF(t[1].im, z[0].im, z[0].im, t[0].im);
323
14.5M
    BF(t[3].im, z[1].im, z[1].im, t[2].im);
324
14.5M
    BF(t[5].im, z[2].im, z[2].im, t[4].im);
325
326
14.5M
    out[1*stride].re = dc.re + z[0].re;
327
14.5M
    out[1*stride].im = dc.im + t[1].im;
328
14.5M
    out[2*stride].re = dc.re + t[3].re;
329
14.5M
    out[2*stride].im = dc.im + z[1].im;
330
14.5M
    out[3*stride].re = dc.re + z[2].re;
331
14.5M
    out[3*stride].im = dc.im + t[5].im;
332
14.5M
    out[4*stride].re = dc.re + t[5].re;
333
14.5M
    out[4*stride].im = dc.im + z[2].im;
334
14.5M
    out[5*stride].re = dc.re + z[1].re;
335
14.5M
    out[5*stride].im = dc.im + t[3].im;
336
14.5M
    out[6*stride].re = dc.re + t[1].re;
337
14.5M
    out[6*stride].im = dc.im + z[0].im;
338
14.5M
}
Unexecuted instantiation: tx_int32.c:fft7
339
340
/**
 * 9-point transform kernel.
 *
 * Reads the nine consecutive elements in[0..8] and stores nine results to
 * out[k*stride], k = 0..8.
 *
 * @param out    destination; element k is written at out[k*stride]
 * @param in     source; elements 0..8 are read
 * @param stride distance, in TXComplex elements, between consecutive outputs
 *
 * Plain additions/subtractions go through a TXUSample cast, the same way the
 * DECL_FFT5 kernel does, so that integer builds do not evaluate them as
 * signed operations that may overflow.
 * NOTE(review): TXUSample is declared outside this chunk; it is presumably
 * the unsigned counterpart of TXSample in the TX_INT32 build and the sample
 * type itself in floating-point builds - confirm in the private header.
 */
static av_always_inline void fft9(TXComplex *out, TXComplex *in,
                                  ptrdiff_t stride)
{
    /* The sample table is viewed as complex pairs; entries 0..3 are used. */
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
    TXComplex dc, t[8], w[4], x[5], y[5], z[2];
#ifdef TX_INT32
    int64_t mtmp[8];
#endif

    /* Pair up the mirrored inputs (1,8), (2,7), (3,6), (4,5). BF is defined
     * outside this chunk; presumably it yields the difference in its first
     * output and the sum in its second - TODO confirm. */
    dc = in[0];
    BF(t[1].re, t[0].re, in[1].re, in[8].re);
    BF(t[1].im, t[0].im, in[1].im, in[8].im);
    BF(t[3].re, t[2].re, in[2].re, in[7].re);
    BF(t[3].im, t[2].im, in[2].im, in[7].im);
    BF(t[5].re, t[4].re, in[3].re, in[6].re);
    BF(t[5].im, t[4].im, in[3].im, in[6].im);
    BF(t[7].re, t[6].re, in[4].re, in[5].re);
    BF(t[7].im, t[6].im, in[4].im, in[5].im);

    w[0].re = t[0].re - (TXUSample)t[6].re;
    w[0].im = t[0].im - (TXUSample)t[6].im;
    w[1].re = t[2].re - (TXUSample)t[6].re;
    w[1].im = t[2].im - (TXUSample)t[6].im;
    w[2].re = t[1].re - (TXUSample)t[7].re;
    w[2].im = t[1].im - (TXUSample)t[7].im;
    w[3].re = t[3].re + (TXUSample)t[7].re;
    w[3].im = t[3].im + (TXUSample)t[7].im;

    z[0].re = dc.re + (TXUSample)t[4].re;
    z[0].im = dc.im + (TXUSample)t[4].im;

    z[1].re = t[0].re + (TXUSample)t[2].re + t[6].re;
    z[1].im = t[0].im + (TXUSample)t[2].im + t[6].im;

    /* Output 0 is the sum of dc and all four pair sums. */
    out[0*stride].re = z[0].re + (TXUSample)z[1].re;
    out[0*stride].im = z[0].im + (TXUSample)z[1].im;

#ifdef TX_INT32
    /* Products are accumulated in 64 bits; "+ 0x40000000 >> 31" then adds
     * half of 2^31 and shifts right by 31, i.e. rounds while scaling the
     * accumulator back down. */

    /* The three-term sum is formed in the unsigned type and converted back,
     * so it keeps 32-bit range without a signed overflow. */
    mtmp[0] = (int32_t)((TXUSample)t[1].re - t[3].re + t[7].re);
    mtmp[1] = (int32_t)((TXUSample)t[1].im - t[3].im + t[7].im);

    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);

    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);

    /* Use the cached dc (as the non-integer branch does) rather than
     * re-reading in[0] after out[0] has already been stored. */
    x[3].re = z[0].re + (TXUSample)mtmp[0];
    x[3].im = z[0].im + (TXUSample)mtmp[1];
    z[0].re = dc.re   + (TXUSample)mtmp[2];
    z[0].im = dc.im   + (TXUSample)mtmp[3];

    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;

    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);

    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);

#else
    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);

    /* x[3] must be formed before z[0] is replaced just below. */
    x[3].re = z[0].re  + tab[0].re*z[1].re;
    x[3].im = z[0].im  + tab[0].re*z[1].im;
    z[0].re = dc.re + tab[0].re*t[4].re;
    z[0].im = dc.im + tab[0].re*t[4].im;

    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;

    y[0].re = tab[0].im*t[5].re;
    y[0].im = tab[0].im*t[5].im;
#endif

    /* x[4]/y[4] are first built from the raw x[1], x[2] / y[1], y[2], which
     * are only afterwards offset by z[0] / y[0]; the statement order below
     * therefore matters. */
    x[4].re = x[1].re + (TXUSample)x[2].re;
    x[4].im = x[1].im + (TXUSample)x[2].im;

    y[4].re = y[1].re - (TXUSample)y[2].re;
    y[4].im = y[1].im - (TXUSample)y[2].im;
    x[1].re = z[0].re + (TXUSample)x[1].re;
    x[1].im = z[0].im + (TXUSample)x[1].im;
    y[1].re = y[0].re + (TXUSample)y[1].re;
    y[1].im = y[0].im + (TXUSample)y[1].im;
    x[2].re = z[0].re + (TXUSample)x[2].re;
    x[2].im = z[0].im + (TXUSample)x[2].im;
    y[2].re = y[2].re - (TXUSample)y[0].re;
    y[2].im = y[2].im - (TXUSample)y[0].im;
    x[4].re = z[0].re - (TXUSample)x[4].re;
    x[4].im = z[0].im - (TXUSample)x[4].im;
    y[4].re = y[0].re - (TXUSample)y[4].re;
    y[4].im = y[0].im - (TXUSample)y[4].im;

    /* Outputs k and 9-k share the same x[]/y[] pair with the y[] terms
     * entering with opposite signs. */
    out[1*stride] = (TXComplex){ x[1].re + (TXUSample)y[1].im, x[1].im - (TXUSample)y[1].re };
    out[2*stride] = (TXComplex){ x[2].re + (TXUSample)y[2].im, x[2].im - (TXUSample)y[2].re };
    out[3*stride] = (TXComplex){ x[3].re + (TXUSample)y[3].im, x[3].im - (TXUSample)y[3].re };
    out[4*stride] = (TXComplex){ x[4].re + (TXUSample)y[4].im, x[4].im - (TXUSample)y[4].re };
    out[5*stride] = (TXComplex){ x[4].re - (TXUSample)y[4].im, x[4].im + (TXUSample)y[4].re };
    out[6*stride] = (TXComplex){ x[3].re - (TXUSample)y[3].im, x[3].im + (TXUSample)y[3].re };
    out[7*stride] = (TXComplex){ x[2].re - (TXUSample)y[2].im, x[2].im + (TXUSample)y[2].re };
    out[8*stride] = (TXComplex){ x[1].re - (TXUSample)y[1].im, x[1].im + (TXUSample)y[1].re };
}
Unexecuted instantiation: tx_double.c:fft9
tx_float.c:fft9
Line
Count
Source
342
11.3M
{
343
11.3M
    const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
344
11.3M
    TXComplex dc, t[16], w[4], x[5], y[5], z[2];
345
#ifdef TX_INT32
346
    int64_t mtmp[12];
347
#endif
348
349
11.3M
    dc = in[0];
350
11.3M
    BF(t[1].re, t[0].re, in[1].re, in[8].re);
351
11.3M
    BF(t[1].im, t[0].im, in[1].im, in[8].im);
352
11.3M
    BF(t[3].re, t[2].re, in[2].re, in[7].re);
353
11.3M
    BF(t[3].im, t[2].im, in[2].im, in[7].im);
354
11.3M
    BF(t[5].re, t[4].re, in[3].re, in[6].re);
355
11.3M
    BF(t[5].im, t[4].im, in[3].im, in[6].im);
356
11.3M
    BF(t[7].re, t[6].re, in[4].re, in[5].re);
357
11.3M
    BF(t[7].im, t[6].im, in[4].im, in[5].im);
358
359
11.3M
    w[0].re = t[0].re - t[6].re;
360
11.3M
    w[0].im = t[0].im - t[6].im;
361
11.3M
    w[1].re = t[2].re - t[6].re;
362
11.3M
    w[1].im = t[2].im - t[6].im;
363
11.3M
    w[2].re = t[1].re - t[7].re;
364
11.3M
    w[2].im = t[1].im - t[7].im;
365
11.3M
    w[3].re = t[3].re + t[7].re;
366
11.3M
    w[3].im = t[3].im + t[7].im;
367
368
11.3M
    z[0].re = dc.re + t[4].re;
369
11.3M
    z[0].im = dc.im + t[4].im;
370
371
11.3M
    z[1].re = t[0].re + t[2].re + t[6].re;
372
11.3M
    z[1].im = t[0].im + t[2].im + t[6].im;
373
374
11.3M
    out[0*stride].re = z[0].re + z[1].re;
375
11.3M
    out[0*stride].im = z[0].im + z[1].im;
376
377
#ifdef TX_INT32
378
    mtmp[0] = t[1].re - t[3].re + t[7].re;
379
    mtmp[1] = t[1].im - t[3].im + t[7].im;
380
381
    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
382
    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
383
384
    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
385
    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
386
    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
387
    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
388
389
    x[3].re = z[0].re  + (int32_t)mtmp[0];
390
    x[3].im = z[0].im  + (int32_t)mtmp[1];
391
    z[0].re = in[0].re + (int32_t)mtmp[2];
392
    z[0].im = in[0].im + (int32_t)mtmp[3];
393
394
    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
395
    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
396
    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
397
    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
398
    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
399
    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
400
    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
401
    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
402
403
    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
404
    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
405
    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
406
    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
407
    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
408
    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
409
    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
410
    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
411
412
    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
413
    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
414
415
#else
416
11.3M
    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
417
11.3M
    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
418
419
11.3M
    x[3].re = z[0].re  + tab[0].re*z[1].re;
420
11.3M
    x[3].im = z[0].im  + tab[0].re*z[1].im;
421
11.3M
    z[0].re = dc.re + tab[0].re*t[4].re;
422
11.3M
    z[0].im = dc.im + tab[0].re*t[4].im;
423
424
11.3M
    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
425
11.3M
    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
426
11.3M
    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
427
11.3M
    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
428
11.3M
    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
429
11.3M
    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
430
11.3M
    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
431
11.3M
    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
432
433
11.3M
    y[0].re = tab[0].im*t[5].re;
434
11.3M
    y[0].im = tab[0].im*t[5].im;
435
11.3M
#endif
436
437
11.3M
    x[4].re = x[1].re + x[2].re;
438
11.3M
    x[4].im = x[1].im + x[2].im;
439
440
11.3M
    y[4].re = y[1].re - y[2].re;
441
11.3M
    y[4].im = y[1].im - y[2].im;
442
11.3M
    x[1].re = z[0].re + x[1].re;
443
11.3M
    x[1].im = z[0].im + x[1].im;
444
11.3M
    y[1].re = y[0].re + y[1].re;
445
11.3M
    y[1].im = y[0].im + y[1].im;
446
11.3M
    x[2].re = z[0].re + x[2].re;
447
11.3M
    x[2].im = z[0].im + x[2].im;
448
11.3M
    y[2].re = y[2].re - y[0].re;
449
11.3M
    y[2].im = y[2].im - y[0].im;
450
11.3M
    x[4].re = z[0].re - x[4].re;
451
11.3M
    x[4].im = z[0].im - x[4].im;
452
11.3M
    y[4].re = y[0].re - y[4].re;
453
11.3M
    y[4].im = y[0].im - y[4].im;
454
455
11.3M
    out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
456
11.3M
    out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
457
11.3M
    out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
458
11.3M
    out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
459
11.3M
    out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
460
11.3M
    out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
461
11.3M
    out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
462
11.3M
    out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
463
11.3M
}
Unexecuted instantiation: tx_int32.c:fft9
464
465
/**
 * 15-point transform kernel composed of five fft3 calls followed by the
 * three index-permuted 5-point kernels.
 *
 * @param out    destination, shared by all three 5-point calls
 * @param in     source; elements 0..14 are consumed, three per fft3 call
 * @param stride output stride, forwarded unchanged to the 5-point kernels
 */
static av_always_inline void fft15(TXComplex *out, TXComplex *in,
                                   ptrdiff_t stride)
{
    TXComplex scratch[15];
    int col = 0;

    /* First pass: one fft3 per group of three consecutive inputs, handed a
     * stride of 5 and a destination offset by the group number. */
    while (col < 5) {
        fft3(scratch + col, in + col*3, 5);
        col++;
    }

    /* Second pass: each 5-point variant takes one run of five scratch
     * entries and stores to its own set of output slots. */
    fft5_m1(out, scratch +  0, stride);
    fft5_m2(out, scratch +  5, stride);
    fft5_m3(out, scratch + 10, stride);
}
Unexecuted instantiation: tx_double.c:fft15
tx_float.c:fft15
Line
Count
Source
467
4.30M
{
468
4.30M
    TXComplex tmp[15];
469
470
25.8M
    for (int i = 0; i < 5; i++)
471
21.5M
        fft3(tmp + i, in + i*3, 5);
472
473
4.30M
    fft5_m1(out, tmp +  0, stride);
474
4.30M
    fft5_m2(out, tmp +  5, stride);
475
4.30M
    fft5_m3(out, tmp + 10, stride);
476
4.30M
}
tx_int32.c:fft15
Line
Count
Source
467
216k
{
468
216k
    TXComplex tmp[15];
469
470
1.29M
    for (int i = 0; i < 5; i++)
471
1.08M
        fft3(tmp + i, in + i*3, 5);
472
473
216k
    fft5_m1(out, tmp +  0, stride);
474
216k
    fft5_m2(out, tmp +  5, stride);
475
216k
    fft5_m3(out, tmp + 10, stride);
476
216k
}
477
478
static av_cold int TX_NAME(ff_tx_fft_factor_init)(AVTXContext *s,
479
                                                  const FFTXCodelet *cd,
480
                                                  uint64_t flags,
481
                                                  FFTXCodeletOptions *opts,
482
                                                  int len, int inv,
483
                                                  const void *scale)
484
2.34k
{
485
2.34k
    int ret = 0;
486
2.34k
    TX_TAB(ff_tx_init_tabs)(len);
487
488
2.34k
    if (len == 15)
489
0
        ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
490
2.34k
    else if (flags & FF_TX_PRESHUFFLE)
491
2.34k
        ret = ff_tx_gen_default_map(s, opts);
492
493
2.34k
    return ret;
494
2.34k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_factor_init_double_c
tx_float.c:ff_tx_fft_factor_init_float_c
Line
Count
Source
484
2.34k
{
485
2.34k
    int ret = 0;
486
2.34k
    TX_TAB(ff_tx_init_tabs)(len);
487
488
2.34k
    if (len == 15)
489
0
        ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
490
2.34k
    else if (flags & FF_TX_PRESHUFFLE)
491
2.34k
        ret = ff_tx_gen_default_map(s, opts);
492
493
2.34k
    return ret;
494
2.34k
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_factor_init_int32_c
495
496
#define DECL_FACTOR_S(n)                                                       \
497
static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst,                   \
498
46.9M
                                  void *src, ptrdiff_t stride)                 \
499
46.9M
{                                                                              \
500
46.9M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
46.9M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_fft3_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft5_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft7_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft9_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft15_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft3_float_c
tx_float.c:ff_tx_fft5_float_c
Line
Count
Source
498
21.0M
                                  void *src, ptrdiff_t stride)                 \
499
21.0M
{                                                                              \
500
21.0M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
21.0M
}                                                                              \
tx_float.c:ff_tx_fft7_float_c
Line
Count
Source
498
14.5M
                                  void *src, ptrdiff_t stride)                 \
499
14.5M
{                                                                              \
500
14.5M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
14.5M
}                                                                              \
tx_float.c:ff_tx_fft9_float_c
Line
Count
Source
498
11.3M
                                  void *src, ptrdiff_t stride)                 \
499
11.3M
{                                                                              \
500
11.3M
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
501
11.3M
}                                                                              \
Unexecuted instantiation: tx_float.c:ff_tx_fft15_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft3_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft5_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft7_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft9_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft15_int32_c
502
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {                    \
503
    .name       = TX_NAME_STR("fft" #n "_ns"),                                 \
504
    .function   = TX_NAME(ff_tx_fft##n),                                       \
505
    .type       = TX_TYPE(FFT),                                                \
506
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
507
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,                          \
508
    .factors[0] = n,                                                           \
509
    .nb_factors = 1,                                                           \
510
    .min_len    = n,                                                           \
511
    .max_len    = n,                                                           \
512
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
513
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
514
    .prio       = FF_TX_PRIO_BASE,                                             \
515
};
516
517
#define DECL_FACTOR_F(n)                                                       \
518
DECL_FACTOR_S(n)                                                               \
519
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = {                   \
520
    .name       = TX_NAME_STR("fft" #n "_fwd"),                                \
521
    .function   = TX_NAME(ff_tx_fft##n),                                       \
522
    .type       = TX_TYPE(FFT),                                                \
523
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
524
                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY,                        \
525
    .factors[0] = n,                                                           \
526
    .nb_factors = 1,                                                           \
527
    .min_len    = n,                                                           \
528
    .max_len    = n,                                                           \
529
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
530
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
531
    .prio       = FF_TX_PRIO_BASE,                                             \
532
};
533
534
DECL_FACTOR_F(3)
535
DECL_FACTOR_F(5)
536
DECL_FACTOR_F(7)
537
DECL_FACTOR_F(9)
538
DECL_FACTOR_S(15)
539
540
#define BUTTERFLIES(a0, a1, a2, a3)            \
541
6.83G
    do {                                       \
542
6.83G
        r0=a0.re;                              \
543
6.83G
        i0=a0.im;                              \
544
6.83G
        r1=a1.re;                              \
545
6.83G
        i1=a1.im;                              \
546
6.83G
        BF(t3, t5, t5, t1);                    \
547
6.83G
        BF(a2.re, a0.re, r0, t5);              \
548
6.83G
        BF(a3.im, a1.im, i1, t3);              \
549
6.83G
        BF(t4, t6, t2, t6);                    \
550
6.83G
        BF(a3.re, a1.re, r1, t4);              \
551
6.83G
        BF(a2.im, a0.im, i0, t6);              \
552
6.83G
    } while (0)
553
554
#define TRANSFORM(a0, a1, a2, a3, wre, wim)    \
555
5.69G
    do {                                       \
556
5.69G
        CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
557
5.69G
        CMUL(t5, t6, a3.re, a3.im, wre,  wim); \
558
5.69G
        BUTTERFLIES(a0, a1, a2, a3);           \
559
5.69G
    } while (0)
560
561
/* z[0...8n-1], w[1...2n-1] */
562
static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
563
                                                 const TXSample *cos, int len)
564
293M
{
565
293M
    int o1 = 2*len;
566
293M
    int o2 = 4*len;
567
293M
    int o3 = 6*len;
568
293M
    const TXSample *wim = cos + o1 - 7;
569
293M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
570
571
781M
    for (int i = 0; i < len; i += 4) {
572
487M
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
573
487M
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
574
487M
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
575
487M
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
576
577
487M
        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
578
487M
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
579
487M
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
580
487M
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
581
582
487M
        z   += 2*4;
583
487M
        cos += 2*4;
584
487M
        wim -= 2*4;
585
487M
    }
586
293M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_sr_combine_double_c
tx_float.c:ff_tx_fft_sr_combine_float_c
Line
Count
Source
564
238M
{
565
238M
    int o1 = 2*len;
566
238M
    int o2 = 4*len;
567
238M
    int o3 = 6*len;
568
238M
    const TXSample *wim = cos + o1 - 7;
569
238M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
570
571
644M
    for (int i = 0; i < len; i += 4) {
572
405M
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
573
405M
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
574
405M
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
575
405M
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
576
577
405M
        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
578
405M
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
579
405M
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
580
405M
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
581
582
405M
        z   += 2*4;
583
405M
        cos += 2*4;
584
405M
        wim -= 2*4;
585
405M
    }
586
238M
}
tx_int32.c:ff_tx_fft_sr_combine_int32_c
Line
Count
Source
564
54.9M
{
565
54.9M
    int o1 = 2*len;
566
54.9M
    int o2 = 4*len;
567
54.9M
    int o3 = 6*len;
568
54.9M
    const TXSample *wim = cos + o1 - 7;
569
54.9M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
570
571
136M
    for (int i = 0; i < len; i += 4) {
572
81.9M
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
573
81.9M
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
574
81.9M
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
575
81.9M
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
576
577
81.9M
        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
578
81.9M
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
579
81.9M
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
580
81.9M
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
581
582
81.9M
        z   += 2*4;
583
81.9M
        cos += 2*4;
584
81.9M
        wim -= 2*4;
585
81.9M
    }
586
54.9M
}
587
588
static av_cold int TX_NAME(ff_tx_fft_sr_codelet_init)(AVTXContext *s,
589
                                                      const FFTXCodelet *cd,
590
                                                      uint64_t flags,
591
                                                      FFTXCodeletOptions *opts,
592
                                                      int len, int inv,
593
                                                      const void *scale)
594
1.08M
{
595
1.08M
    TX_TAB(ff_tx_init_tabs)(len);
596
1.08M
    return ff_tx_gen_ptwo_revtab(s, opts);
597
1.08M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_sr_codelet_init_double_c
tx_float.c:ff_tx_fft_sr_codelet_init_float_c
Line
Count
Source
594
958k
{
595
958k
    TX_TAB(ff_tx_init_tabs)(len);
596
958k
    return ff_tx_gen_ptwo_revtab(s, opts);
597
958k
}
tx_int32.c:ff_tx_fft_sr_codelet_init_int32_c
Line
Count
Source
594
129k
{
595
129k
    TX_TAB(ff_tx_init_tabs)(len);
596
129k
    return ff_tx_gen_ptwo_revtab(s, opts);
597
129k
}
598
599
#define DECL_SR_CODELET_DEF(n)                              \
600
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
601
    .name       = TX_NAME_STR("fft" #n "_ns"),              \
602
    .function   = TX_NAME(ff_tx_fft##n##_ns),               \
603
    .type       = TX_TYPE(FFT),                             \
604
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |      \
605
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,       \
606
    .factors[0] = 2,                                        \
607
    .nb_factors = 1,                                        \
608
    .min_len    = n,                                        \
609
    .max_len    = n,                                        \
610
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),       \
611
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                      \
612
    .prio       = FF_TX_PRIO_BASE,                          \
613
};
614
615
#define DECL_SR_CODELET(n, n2, n4)                                    \
616
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,    \
617
293M
                                        void *_src, ptrdiff_t stride) \
618
293M
{                                                                     \
619
293M
    TXComplex *src = _src;                                            \
620
293M
    TXComplex *dst = _dst;                                            \
621
293M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
293M
                                                                      \
623
293M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
293M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
293M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
293M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
293M
}                                                                     \
Unexecuted instantiation: tx_double.c:ff_tx_fft32_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft64_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft128_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft256_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft512_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft1024_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft2048_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft4096_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft8192_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft16384_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft32768_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft65536_ns_double_c
Unexecuted instantiation: tx_double.c:ff_tx_fft131072_ns_double_c
tx_float.c:ff_tx_fft32_ns_float_c
Line
Count
Source
617
176M
                                        void *_src, ptrdiff_t stride) \
618
176M
{                                                                     \
619
176M
    TXComplex *src = _src;                                            \
620
176M
    TXComplex *dst = _dst;                                            \
621
176M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
176M
                                                                      \
623
176M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
176M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
176M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
176M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
176M
}                                                                     \
tx_float.c:ff_tx_fft64_ns_float_c
Line
Count
Source
617
39.7M
                                        void *_src, ptrdiff_t stride) \
618
39.7M
{                                                                     \
619
39.7M
    TXComplex *src = _src;                                            \
620
39.7M
    TXComplex *dst = _dst;                                            \
621
39.7M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
39.7M
                                                                      \
623
39.7M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
39.7M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
39.7M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
39.7M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
39.7M
}                                                                     \
tx_float.c:ff_tx_fft128_ns_float_c
Line
Count
Source
617
15.9M
                                        void *_src, ptrdiff_t stride) \
618
15.9M
{                                                                     \
619
15.9M
    TXComplex *src = _src;                                            \
620
15.9M
    TXComplex *dst = _dst;                                            \
621
15.9M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
15.9M
                                                                      \
623
15.9M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
15.9M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
15.9M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
15.9M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
15.9M
}                                                                     \
tx_float.c:ff_tx_fft256_ns_float_c
Line
Count
Source
617
4.36M
                                        void *_src, ptrdiff_t stride) \
618
4.36M
{                                                                     \
619
4.36M
    TXComplex *src = _src;                                            \
620
4.36M
    TXComplex *dst = _dst;                                            \
621
4.36M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
4.36M
                                                                      \
623
4.36M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
4.36M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
4.36M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
4.36M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
4.36M
}                                                                     \
tx_float.c:ff_tx_fft512_ns_float_c
Line
Count
Source
617
1.91M
                                        void *_src, ptrdiff_t stride) \
618
1.91M
{                                                                     \
619
1.91M
    TXComplex *src = _src;                                            \
620
1.91M
    TXComplex *dst = _dst;                                            \
621
1.91M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
1.91M
                                                                      \
623
1.91M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
1.91M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
1.91M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
1.91M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
1.91M
}                                                                     \
tx_float.c:ff_tx_fft1024_ns_float_c
Line
Count
Source
617
393k
                                        void *_src, ptrdiff_t stride) \
618
393k
{                                                                     \
619
393k
    TXComplex *src = _src;                                            \
620
393k
    TXComplex *dst = _dst;                                            \
621
393k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
393k
                                                                      \
623
393k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
393k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
393k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
393k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
393k
}                                                                     \
tx_float.c:ff_tx_fft2048_ns_float_c
Line
Count
Source
617
67.6k
                                        void *_src, ptrdiff_t stride) \
618
67.6k
{                                                                     \
619
67.6k
    TXComplex *src = _src;                                            \
620
67.6k
    TXComplex *dst = _dst;                                            \
621
67.6k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
67.6k
                                                                      \
623
67.6k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
67.6k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
67.6k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
67.6k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
67.6k
}                                                                     \
tx_float.c:ff_tx_fft4096_ns_float_c
Line
Count
Source
617
23.8k
                                        void *_src, ptrdiff_t stride) \
618
23.8k
{                                                                     \
619
23.8k
    TXComplex *src = _src;                                            \
620
23.8k
    TXComplex *dst = _dst;                                            \
621
23.8k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
23.8k
                                                                      \
623
23.8k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
23.8k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
23.8k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
23.8k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
23.8k
}                                                                     \
Unexecuted instantiation: tx_float.c:ff_tx_fft8192_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft16384_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft32768_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft65536_ns_float_c
Unexecuted instantiation: tx_float.c:ff_tx_fft131072_ns_float_c
tx_int32.c:ff_tx_fft32_ns_int32_c
Line
Count
Source
617
44.2M
                                        void *_src, ptrdiff_t stride) \
618
44.2M
{                                                                     \
619
44.2M
    TXComplex *src = _src;                                            \
620
44.2M
    TXComplex *dst = _dst;                                            \
621
44.2M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
44.2M
                                                                      \
623
44.2M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
44.2M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
44.2M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
44.2M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
44.2M
}                                                                     \
tx_int32.c:ff_tx_fft64_ns_int32_c
Line
Count
Source
617
5.92M
                                        void *_src, ptrdiff_t stride) \
618
5.92M
{                                                                     \
619
5.92M
    TXComplex *src = _src;                                            \
620
5.92M
    TXComplex *dst = _dst;                                            \
621
5.92M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
5.92M
                                                                      \
623
5.92M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
5.92M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
5.92M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
5.92M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
5.92M
}                                                                     \
tx_int32.c:ff_tx_fft128_ns_int32_c
Line
Count
Source
617
3.95M
                                        void *_src, ptrdiff_t stride) \
618
3.95M
{                                                                     \
619
3.95M
    TXComplex *src = _src;                                            \
620
3.95M
    TXComplex *dst = _dst;                                            \
621
3.95M
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
3.95M
                                                                      \
623
3.95M
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
3.95M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
3.95M
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
3.95M
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
3.95M
}                                                                     \
tx_int32.c:ff_tx_fft256_ns_int32_c
Line
Count
Source
617
439k
                                        void *_src, ptrdiff_t stride) \
618
439k
{                                                                     \
619
439k
    TXComplex *src = _src;                                            \
620
439k
    TXComplex *dst = _dst;                                            \
621
439k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
439k
                                                                      \
623
439k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
439k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
439k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
439k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
439k
}                                                                     \
tx_int32.c:ff_tx_fft512_ns_int32_c
Line
Count
Source
617
409k
                                        void *_src, ptrdiff_t stride) \
618
409k
{                                                                     \
619
409k
    TXComplex *src = _src;                                            \
620
409k
    TXComplex *dst = _dst;                                            \
621
409k
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
622
409k
                                                                      \
623
409k
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
624
409k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
625
409k
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
626
409k
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
627
409k
}                                                                     \
Unexecuted instantiation: tx_int32.c:ff_tx_fft1024_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft2048_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft4096_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft8192_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft16384_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft32768_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft65536_ns_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft131072_ns_int32_c
628
                                                                      \
629
DECL_SR_CODELET_DEF(n)
630
631
static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
632
                                   void *_src, ptrdiff_t stride)
633
0
{
634
0
    TXComplex *src = _src;
635
0
    TXComplex *dst = _dst;
636
0
    TXComplex tmp;
637
638
0
    BF(tmp.re, dst[0].re, src[0].re, src[1].re);
639
0
    BF(tmp.im, dst[0].im, src[0].im, src[1].im);
640
0
    dst[1] = tmp;
641
0
}
Unexecuted instantiation: tx_double.c:ff_tx_fft2_ns_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft2_ns_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft2_ns_int32_c
642
643
static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
644
                                   void *_src, ptrdiff_t stride)
645
1.47G
{
646
1.47G
    TXComplex *src = _src;
647
1.47G
    TXComplex *dst = _dst;
648
1.47G
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
649
650
1.47G
    BF(t3, t1, src[0].re, src[1].re);
651
1.47G
    BF(t8, t6, src[3].re, src[2].re);
652
1.47G
    BF(dst[2].re, dst[0].re, t1, t6);
653
1.47G
    BF(t4, t2, src[0].im, src[1].im);
654
1.47G
    BF(t7, t5, src[2].im, src[3].im);
655
1.47G
    BF(dst[3].im, dst[1].im, t4, t8);
656
1.47G
    BF(dst[3].re, dst[1].re, t3, t7);
657
1.47G
    BF(dst[2].im, dst[0].im, t2, t5);
658
1.47G
}
Unexecuted instantiation: tx_double.c:ff_tx_fft4_ns_double_c
tx_float.c:ff_tx_fft4_ns_float_c
Line
Count
Source
645
1.21G
{
646
1.21G
    TXComplex *src = _src;
647
1.21G
    TXComplex *dst = _dst;
648
1.21G
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
649
650
1.21G
    BF(t3, t1, src[0].re, src[1].re);
651
1.21G
    BF(t8, t6, src[3].re, src[2].re);
652
1.21G
    BF(dst[2].re, dst[0].re, t1, t6);
653
1.21G
    BF(t4, t2, src[0].im, src[1].im);
654
1.21G
    BF(t7, t5, src[2].im, src[3].im);
655
1.21G
    BF(dst[3].im, dst[1].im, t4, t8);
656
1.21G
    BF(dst[3].re, dst[1].re, t3, t7);
657
1.21G
    BF(dst[2].im, dst[0].im, t2, t5);
658
1.21G
}
tx_int32.c:ff_tx_fft4_ns_int32_c
Line
Count
Source
645
256M
{
646
256M
    TXComplex *src = _src;
647
256M
    TXComplex *dst = _dst;
648
256M
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;
649
650
256M
    BF(t3, t1, src[0].re, src[1].re);
651
256M
    BF(t8, t6, src[3].re, src[2].re);
652
256M
    BF(dst[2].re, dst[0].re, t1, t6);
653
256M
    BF(t4, t2, src[0].im, src[1].im);
654
256M
    BF(t7, t5, src[2].im, src[3].im);
655
256M
    BF(dst[3].im, dst[1].im, t4, t8);
656
256M
    BF(dst[3].re, dst[1].re, t3, t7);
657
256M
    BF(dst[2].im, dst[0].im, t2, t5);
658
256M
}
659
660
static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
661
                                   void *_src, ptrdiff_t stride)
662
817M
{
663
817M
    TXComplex *src = _src;
664
817M
    TXComplex *dst = _dst;
665
817M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
666
817M
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
667
668
817M
    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
669
670
817M
    BF(t1, dst[5].re, src[4].re, -src[5].re);
671
817M
    BF(t2, dst[5].im, src[4].im, -src[5].im);
672
817M
    BF(t5, dst[7].re, src[6].re, -src[7].re);
673
817M
    BF(t6, dst[7].im, src[6].im, -src[7].im);
674
675
817M
    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
676
817M
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
677
817M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft8_ns_double_c
tx_float.c:ff_tx_fft8_ns_float_c
Line
Count
Source
662
672M
{
663
672M
    TXComplex *src = _src;
664
672M
    TXComplex *dst = _dst;
665
672M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
666
672M
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
667
668
672M
    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
669
670
672M
    BF(t1, dst[5].re, src[4].re, -src[5].re);
671
672M
    BF(t2, dst[5].im, src[4].im, -src[5].im);
672
672M
    BF(t5, dst[7].re, src[6].re, -src[7].re);
673
672M
    BF(t6, dst[7].im, src[6].im, -src[7].im);
674
675
672M
    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
676
672M
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
677
672M
}
tx_int32.c:ff_tx_fft8_ns_int32_c
Line
Count
Source
662
144M
{
663
144M
    TXComplex *src = _src;
664
144M
    TXComplex *dst = _dst;
665
144M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
666
144M
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
667
668
144M
    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
669
670
144M
    BF(t1, dst[5].re, src[4].re, -src[5].re);
671
144M
    BF(t2, dst[5].im, src[4].im, -src[5].im);
672
144M
    BF(t5, dst[7].re, src[6].re, -src[7].re);
673
144M
    BF(t6, dst[7].im, src[6].im, -src[7].im);
674
675
144M
    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
676
144M
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
677
144M
}
678
679
static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
680
                                    void *_src, ptrdiff_t stride)
681
325M
{
682
325M
    TXComplex *src = _src;
683
325M
    TXComplex *dst = _dst;
684
325M
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
685
686
325M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
687
325M
    TXSample cos_16_1 = cos[1];
688
325M
    TXSample cos_16_2 = cos[2];
689
325M
    TXSample cos_16_3 = cos[3];
690
691
325M
    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
692
325M
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
693
325M
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
694
695
325M
    t1 = dst[ 8].re;
696
325M
    t2 = dst[ 8].im;
697
325M
    t5 = dst[12].re;
698
325M
    t6 = dst[12].im;
699
325M
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
700
701
325M
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
702
325M
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
703
325M
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
704
325M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft16_ns_double_c
tx_float.c:ff_tx_fft16_ns_float_c
Line
Count
Source
681
269M
{
682
269M
    TXComplex *src = _src;
683
269M
    TXComplex *dst = _dst;
684
269M
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
685
686
269M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
687
269M
    TXSample cos_16_1 = cos[1];
688
269M
    TXSample cos_16_2 = cos[2];
689
269M
    TXSample cos_16_3 = cos[3];
690
691
269M
    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
692
269M
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
693
269M
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
694
695
269M
    t1 = dst[ 8].re;
696
269M
    t2 = dst[ 8].im;
697
269M
    t5 = dst[12].re;
698
269M
    t6 = dst[12].im;
699
269M
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
700
701
269M
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
702
269M
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
703
269M
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
704
269M
}
tx_int32.c:ff_tx_fft16_ns_int32_c
Line
Count
Source
681
56.0M
{
682
56.0M
    TXComplex *src = _src;
683
56.0M
    TXComplex *dst = _dst;
684
56.0M
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
685
686
56.0M
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
687
56.0M
    TXSample cos_16_1 = cos[1];
688
56.0M
    TXSample cos_16_2 = cos[2];
689
56.0M
    TXSample cos_16_3 = cos[3];
690
691
56.0M
    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
692
56.0M
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
693
56.0M
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
694
695
56.0M
    t1 = dst[ 8].re;
696
56.0M
    t2 = dst[ 8].im;
697
56.0M
    t5 = dst[12].re;
698
56.0M
    t6 = dst[12].im;
699
56.0M
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
700
701
56.0M
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
702
56.0M
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
703
56.0M
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
704
56.0M
}
705
706
DECL_SR_CODELET_DEF(2)
707
DECL_SR_CODELET_DEF(4)
708
DECL_SR_CODELET_DEF(8)
709
DECL_SR_CODELET_DEF(16)
710
DECL_SR_CODELET(32,16,8)
711
DECL_SR_CODELET(64,32,16)
712
DECL_SR_CODELET(128,64,32)
713
DECL_SR_CODELET(256,128,64)
714
DECL_SR_CODELET(512,256,128)
715
DECL_SR_CODELET(1024,512,256)
716
DECL_SR_CODELET(2048,1024,512)
717
DECL_SR_CODELET(4096,2048,1024)
718
DECL_SR_CODELET(8192,4096,2048)
719
DECL_SR_CODELET(16384,8192,4096)
720
DECL_SR_CODELET(32768,16384,8192)
721
DECL_SR_CODELET(65536,32768,16384)
722
DECL_SR_CODELET(131072,65536,32768)
723
724
static av_cold int TX_NAME(ff_tx_fft_init)(AVTXContext *s,
725
                                           const FFTXCodelet *cd,
726
                                           uint64_t flags,
727
                                           FFTXCodeletOptions *opts,
728
                                           int len, int inv,
729
                                           const void *scale)
730
11.0k
{
731
11.0k
    int ret;
732
11.0k
    int is_inplace = !!(flags & AV_TX_INPLACE);
733
11.0k
    FFTXCodeletOptions sub_opts = {
734
11.0k
        .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
735
11.0k
    };
736
737
11.0k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
738
11.0k
    flags |=  AV_TX_INPLACE;      /* in-place */
739
11.0k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
740
741
11.0k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
742
2.34k
        return ret;
743
744
8.67k
    if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
745
0
        return ret;
746
747
8.67k
    return 0;
748
8.67k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_init_double_c
tx_float.c:ff_tx_fft_init_float_c
Line
Count
Source
730
11.0k
{
731
11.0k
    int ret;
732
11.0k
    int is_inplace = !!(flags & AV_TX_INPLACE);
733
11.0k
    FFTXCodeletOptions sub_opts = {
734
11.0k
        .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
735
11.0k
    };
736
737
11.0k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
738
11.0k
    flags |=  AV_TX_INPLACE;      /* in-place */
739
11.0k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
740
741
11.0k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
742
2.34k
        return ret;
743
744
8.67k
    if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
745
0
        return ret;
746
747
8.67k
    return 0;
748
8.67k
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_init_int32_c
749
750
static av_cold int TX_NAME(ff_tx_fft_inplace_small_init)(AVTXContext *s,
751
                                                         const FFTXCodelet *cd,
752
                                                         uint64_t flags,
753
                                                         FFTXCodeletOptions *opts,
754
                                                         int len, int inv,
755
                                                         const void *scale)
756
780
{
757
780
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
758
0
        return AVERROR(ENOMEM);
759
780
    flags &= ~AV_TX_INPLACE;
760
780
    return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
761
780
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_inplace_small_init_double_c
tx_float.c:ff_tx_fft_inplace_small_init_float_c
Line
Count
Source
756
780
{
757
780
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
758
0
        return AVERROR(ENOMEM);
759
780
    flags &= ~AV_TX_INPLACE;
760
780
    return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
761
780
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_inplace_small_init_int32_c
762
763
static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
764
                               void *_src, ptrdiff_t stride)
765
10.8M
{
766
10.8M
    TXComplex *src = _src;
767
10.8M
    TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
768
10.8M
    TXComplex *dst2 = _dst;
769
10.8M
    int *map = s->sub[0].map;
770
10.8M
    int len = s->len;
771
772
    /* Compilers can't vectorize this anyway without assuming AVX2, which they
773
     * generally don't, at least without -march=native -mtune=native */
774
1.21G
    for (int i = 0; i < len; i++)
775
1.19G
        dst1[i] = src[map[i]];
776
777
10.8M
    s->fn[0](&s->sub[0], dst2, dst1, stride);
778
10.8M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_double_c
tx_float.c:ff_tx_fft_float_c
Line
Count
Source
765
10.8M
{
766
10.8M
    TXComplex *src = _src;
767
10.8M
    TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
768
10.8M
    TXComplex *dst2 = _dst;
769
10.8M
    int *map = s->sub[0].map;
770
10.8M
    int len = s->len;
771
772
    /* Compilers can't vectorize this anyway without assuming AVX2, which they
773
     * generally don't, at least without -march=native -mtune=native */
774
1.21G
    for (int i = 0; i < len; i++)
775
1.19G
        dst1[i] = src[map[i]];
776
777
10.8M
    s->fn[0](&s->sub[0], dst2, dst1, stride);
778
10.8M
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_int32_c
779
780
static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
781
                                       void *_src, ptrdiff_t stride)
782
0
{
783
0
    TXComplex *src = _src;
784
0
    TXComplex *dst = _dst;
785
0
    TXComplex tmp;
786
0
    const int *map = s->sub->map;
787
0
    const int *inplace_idx = s->map;
788
0
    int src_idx, dst_idx;
789
790
0
    src_idx = *inplace_idx++;
791
0
    do {
792
0
        tmp = src[src_idx];
793
0
        dst_idx = map[src_idx];
794
0
        do {
795
0
            FFSWAP(TXComplex, tmp, src[dst_idx]);
796
0
            dst_idx = map[dst_idx];
797
0
        } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
798
0
        src[dst_idx] = tmp;
799
0
    } while ((src_idx = *inplace_idx++));
800
801
0
    s->fn[0](&s->sub[0], dst, src, stride);
802
0
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_inplace_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft_inplace_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft_inplace_int32_c
803
804
static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
805
    .name       = TX_NAME_STR("fft"),
806
    .function   = TX_NAME(ff_tx_fft),
807
    .type       = TX_TYPE(FFT),
808
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
809
    .factors[0] = TX_FACTOR_ANY,
810
    .nb_factors = 1,
811
    .min_len    = 2,
812
    .max_len    = TX_LEN_UNLIMITED,
813
    .init       = TX_NAME(ff_tx_fft_init),
814
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
815
    .prio       = FF_TX_PRIO_BASE,
816
};
817
818
static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
819
    .name       = TX_NAME_STR("fft_inplace_small"),
820
    .function   = TX_NAME(ff_tx_fft),
821
    .type       = TX_TYPE(FFT),
822
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
823
    .factors[0] = TX_FACTOR_ANY,
824
    .nb_factors = 1,
825
    .min_len    = 2,
826
    .max_len    = 65536,
827
    .init       = TX_NAME(ff_tx_fft_inplace_small_init),
828
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
829
    .prio       = FF_TX_PRIO_BASE - 256,
830
};
831
832
static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
833
    .name       = TX_NAME_STR("fft_inplace"),
834
    .function   = TX_NAME(ff_tx_fft_inplace),
835
    .type       = TX_TYPE(FFT),
836
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | AV_TX_INPLACE,
837
    .factors[0] = TX_FACTOR_ANY,
838
    .nb_factors = 1,
839
    .min_len    = 2,
840
    .max_len    = TX_LEN_UNLIMITED,
841
    .init       = TX_NAME(ff_tx_fft_init),
842
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
843
    .prio       = FF_TX_PRIO_BASE - 512,
844
};
845
846
static av_cold int TX_NAME(ff_tx_fft_init_naive_small)(AVTXContext *s,
847
                                                       const FFTXCodelet *cd,
848
                                                       uint64_t flags,
849
                                                       FFTXCodeletOptions *opts,
850
                                                       int len, int inv,
851
                                                       const void *scale)
852
780
{
853
780
    const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
854
855
780
    if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
856
0
        return AVERROR(ENOMEM);
857
858
10.9k
    for (int i = 0; i < len; i++) {
859
141k
        for (int j = 0; j < len; j++) {
860
131k
            const double factor = phase*i*j;
861
131k
            s->exp[i*j] = (TXComplex){
862
131k
                RESCALE(cos(factor)),
863
131k
                RESCALE(sin(factor)),
864
131k
            };
865
131k
        }
866
10.1k
    }
867
868
780
    return 0;
869
780
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_init_naive_small_double_c
tx_float.c:ff_tx_fft_init_naive_small_float_c
Line
Count
Source
852
780
{
853
780
    const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
854
855
780
    if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
856
0
        return AVERROR(ENOMEM);
857
858
10.9k
    for (int i = 0; i < len; i++) {
859
141k
        for (int j = 0; j < len; j++) {
860
131k
            const double factor = phase*i*j;
861
131k
            s->exp[i*j] = (TXComplex){
862
131k
                RESCALE(cos(factor)),
863
131k
                RESCALE(sin(factor)),
864
131k
            };
865
131k
        }
866
10.1k
    }
867
868
780
    return 0;
869
780
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_init_naive_small_int32_c
870
871
static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
872
                                     ptrdiff_t stride)
873
0
{
874
0
    TXComplex *src = _src;
875
0
    TXComplex *dst = _dst;
876
0
    const int n = s->len;
877
0
    double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
878
879
0
    stride /= sizeof(*dst);
880
881
0
    for (int i = 0; i < n; i++) {
882
0
        TXComplex tmp = { 0 };
883
0
        for (int j = 0; j < n; j++) {
884
0
            const double factor = phase*i*j;
885
0
            const TXComplex mult = {
886
0
                RESCALE(cos(factor)),
887
0
                RESCALE(sin(factor)),
888
0
            };
889
0
            TXComplex res;
890
0
            CMUL3(res, src[j], mult);
891
0
            tmp.re += res.re;
892
0
            tmp.im += res.im;
893
0
        }
894
0
        dst[i*stride] = tmp;
895
0
    }
896
0
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_naive_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft_naive_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft_naive_int32_c
897
898
static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
899
                                           ptrdiff_t stride)
900
8.09M
{
901
8.09M
    TXComplex *src = _src;
902
8.09M
    TXComplex *dst = _dst;
903
8.09M
    const int n = s->len;
904
905
8.09M
    stride /= sizeof(*dst);
906
907
113M
    for (int i = 0; i < n; i++) {
908
105M
        TXComplex tmp = { 0 };
909
1.47G
        for (int j = 0; j < n; j++) {
910
1.36G
            TXComplex res;
911
1.36G
            const TXComplex mult = s->exp[i*j];
912
1.36G
            CMUL3(res, src[j], mult);
913
1.36G
            tmp.re += res.re;
914
1.36G
            tmp.im += res.im;
915
1.36G
        }
916
105M
        dst[i*stride] = tmp;
917
105M
    }
918
8.09M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_naive_small_double_c
tx_float.c:ff_tx_fft_naive_small_float_c
Line
Count
Source
900
8.09M
{
901
8.09M
    TXComplex *src = _src;
902
8.09M
    TXComplex *dst = _dst;
903
8.09M
    const int n = s->len;
904
905
8.09M
    stride /= sizeof(*dst);
906
907
113M
    for (int i = 0; i < n; i++) {
908
105M
        TXComplex tmp = { 0 };
909
1.47G
        for (int j = 0; j < n; j++) {
910
1.36G
            TXComplex res;
911
1.36G
            const TXComplex mult = s->exp[i*j];
912
1.36G
            CMUL3(res, src[j], mult);
913
1.36G
            tmp.re += res.re;
914
1.36G
            tmp.im += res.im;
915
1.36G
        }
916
105M
        dst[i*stride] = tmp;
917
105M
    }
918
8.09M
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_naive_small_int32_c
919
920
static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
921
    .name       = TX_NAME_STR("fft_naive_small"),
922
    .function   = TX_NAME(ff_tx_fft_naive_small),
923
    .type       = TX_TYPE(FFT),
924
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
925
    .factors[0] = TX_FACTOR_ANY,
926
    .nb_factors = 1,
927
    .min_len    = 2,
928
    .max_len    = 1024,
929
    .init       = TX_NAME(ff_tx_fft_init_naive_small),
930
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
931
    .prio       = FF_TX_PRIO_MIN/2,
932
};
933
934
static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
935
    .name       = TX_NAME_STR("fft_naive"),
936
    .function   = TX_NAME(ff_tx_fft_naive),
937
    .type       = TX_TYPE(FFT),
938
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE,
939
    .factors[0] = TX_FACTOR_ANY,
940
    .nb_factors = 1,
941
    .min_len    = 2,
942
    .max_len    = TX_LEN_UNLIMITED,
943
    .init       = NULL,
944
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
945
    .prio       = FF_TX_PRIO_MIN,
946
};
947
948
static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
949
                                               const FFTXCodelet *cd,
950
                                               uint64_t flags,
951
                                               FFTXCodeletOptions *opts,
952
                                               int len, int inv,
953
                                               const void *scale)
954
1.00M
{
955
1.00M
    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
956
1.00M
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
957
1.00M
    size_t extra_tmp_len = 0;
958
1.00M
    int len_list[TX_MAX_DECOMPOSITIONS];
959
960
1.00M
    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
961
999k
        return ret;
962
963
    /* Two iterations to test both orderings. */
964
1.56k
    for (int i = 0; i < ret; i++) {
965
1.56k
        int len1 = len_list[i];
966
1.56k
        int len2 = len / len1;
967
968
        /* Our ptwo transforms don't support striding the output. */
969
1.56k
        if (len2 & (len2 - 1))
970
1.56k
            FFSWAP(int, len1, len2);
971
972
1.56k
        ff_tx_clear_ctx(s);
973
974
        /* First transform */
975
1.56k
        sub_opts.map_dir = FF_TX_MAP_GATHER;
976
1.56k
        flags &= ~AV_TX_INPLACE;
977
1.56k
        flags |=  FF_TX_OUT_OF_PLACE;
978
1.56k
        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
979
1.56k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
980
1.56k
                               len1, inv, scale);
981
982
1.56k
        if (ret == AVERROR(ENOMEM)) {
983
0
            return ret;
984
1.56k
        } else if (ret < 0) { /* Try again without a preshuffle flag */
985
780
            flags &= ~FF_TX_PRESHUFFLE;
986
780
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
987
780
                                   len1, inv, scale);
988
780
            if (ret == AVERROR(ENOMEM))
989
0
                return ret;
990
780
            else if (ret < 0)
991
0
                continue;
992
780
        }
993
994
        /* Second transform. */
995
1.56k
        sub_opts.map_dir = FF_TX_MAP_SCATTER;
996
1.56k
        flags |=  FF_TX_PRESHUFFLE;
997
1.56k
retry:
998
1.56k
        flags &= ~FF_TX_OUT_OF_PLACE;
999
1.56k
        flags |=  AV_TX_INPLACE;
1000
1.56k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1001
1.56k
                               len2, inv, scale);
1002
1003
1.56k
        if (ret == AVERROR(ENOMEM)) {
1004
0
            return ret;
1005
1.56k
        } else if (ret < 0) { /* Try again with an out-of-place transform */
1006
0
            flags |= FF_TX_OUT_OF_PLACE;
1007
0
            flags &= ~AV_TX_INPLACE;
1008
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009
0
                                   len2, inv, scale);
1010
0
            if (ret == AVERROR(ENOMEM)) {
1011
0
                return ret;
1012
0
            } else if (ret < 0) {
1013
0
                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1014
0
                    flags &= ~FF_TX_PRESHUFFLE;
1015
0
                    goto retry;
1016
0
                } else {
1017
0
                    continue;
1018
0
                }
1019
0
            }
1020
0
        }
1021
1022
        /* Success */
1023
1.56k
        break;
1024
1.56k
    }
1025
1026
    /* If nothing was successful, error out */
1027
1.56k
    if (ret < 0)
1028
0
        return ret;
1029
1030
    /* Generate PFA map */
1031
1.56k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1032
1.56k
                                          s->sub[0].len, s->sub[1].len)))
1033
0
        return ret;
1034
1035
1.56k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1036
0
        return AVERROR(ENOMEM);
1037
1038
    /* Flatten input map */
1039
1.56k
    tmp = (int *)s->tmp;
1040
12.4k
    for (int k = 0; k < len; k += s->sub[0].len) {
1041
10.9k
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1042
110k
        for (int i = 0; i < s->sub[0].len; i++)
1043
99.8k
            s->map[k + i] = tmp[s->sub[0].map[i]];
1044
10.9k
    }
1045
1046
    /* Only allocate extra temporary memory if we need it */
1047
1.56k
    if (!(s->sub[1].flags & AV_TX_INPLACE))
1048
0
        extra_tmp_len = len;
1049
1.56k
    else if (!ps)
1050
1.56k
        extra_tmp_len = s->sub[0].len;
1051
1052
1.56k
    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1053
0
        return AVERROR(ENOMEM);
1054
1055
1.56k
    return 0;
1056
1.56k
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_pfa_init_double_c
tx_float.c:ff_tx_fft_pfa_init_float_c
Line
Count
Source
954
876k
{
955
876k
    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
956
876k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
957
876k
    size_t extra_tmp_len = 0;
958
876k
    int len_list[TX_MAX_DECOMPOSITIONS];
959
960
876k
    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
961
874k
        return ret;
962
963
    /* Two iterations to test both orderings. */
964
1.56k
    for (int i = 0; i < ret; i++) {
965
1.56k
        int len1 = len_list[i];
966
1.56k
        int len2 = len / len1;
967
968
        /* Our ptwo transforms don't support striding the output. */
969
1.56k
        if (len2 & (len2 - 1))
970
1.56k
            FFSWAP(int, len1, len2);
971
972
1.56k
        ff_tx_clear_ctx(s);
973
974
        /* First transform */
975
1.56k
        sub_opts.map_dir = FF_TX_MAP_GATHER;
976
1.56k
        flags &= ~AV_TX_INPLACE;
977
1.56k
        flags |=  FF_TX_OUT_OF_PLACE;
978
1.56k
        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
979
1.56k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
980
1.56k
                               len1, inv, scale);
981
982
1.56k
        if (ret == AVERROR(ENOMEM)) {
983
0
            return ret;
984
1.56k
        } else if (ret < 0) { /* Try again without a preshuffle flag */
985
780
            flags &= ~FF_TX_PRESHUFFLE;
986
780
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
987
780
                                   len1, inv, scale);
988
780
            if (ret == AVERROR(ENOMEM))
989
0
                return ret;
990
780
            else if (ret < 0)
991
0
                continue;
992
780
        }
993
994
        /* Second transform. */
995
1.56k
        sub_opts.map_dir = FF_TX_MAP_SCATTER;
996
1.56k
        flags |=  FF_TX_PRESHUFFLE;
997
1.56k
retry:
998
1.56k
        flags &= ~FF_TX_OUT_OF_PLACE;
999
1.56k
        flags |=  AV_TX_INPLACE;
1000
1.56k
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1001
1.56k
                               len2, inv, scale);
1002
1003
1.56k
        if (ret == AVERROR(ENOMEM)) {
1004
0
            return ret;
1005
1.56k
        } else if (ret < 0) { /* Try again with an out-of-place transform */
1006
0
            flags |= FF_TX_OUT_OF_PLACE;
1007
0
            flags &= ~AV_TX_INPLACE;
1008
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009
0
                                   len2, inv, scale);
1010
0
            if (ret == AVERROR(ENOMEM)) {
1011
0
                return ret;
1012
0
            } else if (ret < 0) {
1013
0
                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1014
0
                    flags &= ~FF_TX_PRESHUFFLE;
1015
0
                    goto retry;
1016
0
                } else {
1017
0
                    continue;
1018
0
                }
1019
0
            }
1020
0
        }
1021
1022
        /* Success */
1023
1.56k
        break;
1024
1.56k
    }
1025
1026
    /* If nothing was successful, error out */
1027
1.56k
    if (ret < 0)
1028
0
        return ret;
1029
1030
    /* Generate PFA map */
1031
1.56k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1032
1.56k
                                          s->sub[0].len, s->sub[1].len)))
1033
0
        return ret;
1034
1035
1.56k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1036
0
        return AVERROR(ENOMEM);
1037
1038
    /* Flatten input map */
1039
1.56k
    tmp = (int *)s->tmp;
1040
12.4k
    for (int k = 0; k < len; k += s->sub[0].len) {
1041
10.9k
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1042
110k
        for (int i = 0; i < s->sub[0].len; i++)
1043
99.8k
            s->map[k + i] = tmp[s->sub[0].map[i]];
1044
10.9k
    }
1045
1046
    /* Only allocate extra temporary memory if we need it */
1047
1.56k
    if (!(s->sub[1].flags & AV_TX_INPLACE))
1048
0
        extra_tmp_len = len;
1049
1.56k
    else if (!ps)
1050
1.56k
        extra_tmp_len = s->sub[0].len;
1051
1052
1.56k
    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1053
0
        return AVERROR(ENOMEM);
1054
1055
1.56k
    return 0;
1056
1.56k
}
tx_int32.c:ff_tx_fft_pfa_init_int32_c
Line
Count
Source
954
124k
{
955
124k
    int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
956
124k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
957
124k
    size_t extra_tmp_len = 0;
958
124k
    int len_list[TX_MAX_DECOMPOSITIONS];
959
960
124k
    if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
961
124k
        return ret;
962
963
    /* Two iterations to test both orderings. */
964
0
    for (int i = 0; i < ret; i++) {
965
0
        int len1 = len_list[i];
966
0
        int len2 = len / len1;
967
968
        /* Our ptwo transforms don't support striding the output. */
969
0
        if (len2 & (len2 - 1))
970
0
            FFSWAP(int, len1, len2);
971
972
0
        ff_tx_clear_ctx(s);
973
974
        /* First transform */
975
0
        sub_opts.map_dir = FF_TX_MAP_GATHER;
976
0
        flags &= ~AV_TX_INPLACE;
977
0
        flags |=  FF_TX_OUT_OF_PLACE;
978
0
        flags |=  FF_TX_PRESHUFFLE; /* This function handles the permute step */
979
0
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
980
0
                               len1, inv, scale);
981
982
0
        if (ret == AVERROR(ENOMEM)) {
983
0
            return ret;
984
0
        } else if (ret < 0) { /* Try again without a preshuffle flag */
985
0
            flags &= ~FF_TX_PRESHUFFLE;
986
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
987
0
                                   len1, inv, scale);
988
0
            if (ret == AVERROR(ENOMEM))
989
0
                return ret;
990
0
            else if (ret < 0)
991
0
                continue;
992
0
        }
993
994
        /* Second transform. */
995
0
        sub_opts.map_dir = FF_TX_MAP_SCATTER;
996
0
        flags |=  FF_TX_PRESHUFFLE;
997
0
retry:
998
0
        flags &= ~FF_TX_OUT_OF_PLACE;
999
0
        flags |=  AV_TX_INPLACE;
1000
0
        ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1001
0
                               len2, inv, scale);
1002
1003
0
        if (ret == AVERROR(ENOMEM)) {
1004
0
            return ret;
1005
0
        } else if (ret < 0) { /* Try again with an out-of-place transform */
1006
0
            flags |= FF_TX_OUT_OF_PLACE;
1007
0
            flags &= ~AV_TX_INPLACE;
1008
0
            ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009
0
                                   len2, inv, scale);
1010
0
            if (ret == AVERROR(ENOMEM)) {
1011
0
                return ret;
1012
0
            } else if (ret < 0) {
1013
0
                if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1014
0
                    flags &= ~FF_TX_PRESHUFFLE;
1015
0
                    goto retry;
1016
0
                } else {
1017
0
                    continue;
1018
0
                }
1019
0
            }
1020
0
        }
1021
1022
        /* Success */
1023
0
        break;
1024
0
    }
1025
1026
    /* If nothing was successful, error out */
1027
0
    if (ret < 0)
1028
0
        return ret;
1029
1030
    /* Generate PFA map */
1031
0
    if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1032
0
                                          s->sub[0].len, s->sub[1].len)))
1033
0
        return ret;
1034
1035
0
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1036
0
        return AVERROR(ENOMEM);
1037
1038
    /* Flatten input map */
1039
0
    tmp = (int *)s->tmp;
1040
0
    for (int k = 0; k < len; k += s->sub[0].len) {
1041
0
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1042
0
        for (int i = 0; i < s->sub[0].len; i++)
1043
0
            s->map[k + i] = tmp[s->sub[0].map[i]];
1044
0
    }
1045
1046
    /* Only allocate extra temporary memory if we need it */
1047
0
    if (!(s->sub[1].flags & AV_TX_INPLACE))
1048
0
        extra_tmp_len = len;
1049
0
    else if (!ps)
1050
0
        extra_tmp_len = s->sub[0].len;
1051
1052
0
    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1053
0
        return AVERROR(ENOMEM);
1054
1055
0
    return 0;
1056
0
}
1057
1058
static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
1059
                                   void *_in, ptrdiff_t stride)
1060
3.23M
{
1061
3.23M
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1062
3.23M
    const int *in_map = s->map, *out_map = in_map + l;
1063
3.23M
    const int *sub_map = s->sub[1].map;
1064
3.23M
    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1065
3.23M
    TXComplex *in = _in, *out = _out;
1066
1067
3.23M
    stride /= sizeof(*out);
1068
1069
25.8M
    for (int i = 0; i < m; i++) {
1070
229M
        for (int j = 0; j < n; j++)
1071
207M
            s->exp[j] = in[in_map[i*n + j]];
1072
22.6M
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1073
22.6M
    }
1074
1075
35.6M
    for (int i = 0; i < n; i++)
1076
32.3M
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1077
1078
210M
    for (int i = 0; i < l; i++)
1079
207M
        out[i*stride] = tmp1[out_map[i]];
1080
3.23M
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_pfa_double_c
tx_float.c:ff_tx_fft_pfa_float_c
Line
Count
Source
1060
3.23M
{
1061
3.23M
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1062
3.23M
    const int *in_map = s->map, *out_map = in_map + l;
1063
3.23M
    const int *sub_map = s->sub[1].map;
1064
3.23M
    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1065
3.23M
    TXComplex *in = _in, *out = _out;
1066
1067
3.23M
    stride /= sizeof(*out);
1068
1069
25.8M
    for (int i = 0; i < m; i++) {
1070
229M
        for (int j = 0; j < n; j++)
1071
207M
            s->exp[j] = in[in_map[i*n + j]];
1072
22.6M
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1073
22.6M
    }
1074
1075
35.6M
    for (int i = 0; i < n; i++)
1076
32.3M
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1077
1078
210M
    for (int i = 0; i < l; i++)
1079
207M
        out[i*stride] = tmp1[out_map[i]];
1080
3.23M
}
Unexecuted instantiation: tx_int32.c:ff_tx_fft_pfa_int32_c
1081
1082
static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
1083
                                      void *_in, ptrdiff_t stride)
1084
0
{
1085
0
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1086
0
    const int *in_map = s->map, *out_map = in_map + l;
1087
0
    const int *sub_map = s->sub[1].map;
1088
0
    TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1089
0
    TXComplex *in = _in, *out = _out;
1090
1091
0
    stride /= sizeof(*out);
1092
1093
0
    for (int i = 0; i < m; i++)
1094
0
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
1095
1096
0
    for (int i = 0; i < n; i++)
1097
0
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1098
1099
0
    for (int i = 0; i < l; i++)
1100
0
        out[i*stride] = tmp1[out_map[i]];
1101
0
}
Unexecuted instantiation: tx_double.c:ff_tx_fft_pfa_ns_double_c
Unexecuted instantiation: tx_float.c:ff_tx_fft_pfa_ns_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_fft_pfa_ns_int32_c
1102
1103
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
1104
    .name       = TX_NAME_STR("fft_pfa"),
1105
    .function   = TX_NAME(ff_tx_fft_pfa),
1106
    .type       = TX_TYPE(FFT),
1107
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
1108
    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
1109
    .nb_factors = 2,
1110
    .min_len    = 2*3,
1111
    .max_len    = TX_LEN_UNLIMITED,
1112
    .init       = TX_NAME(ff_tx_fft_pfa_init),
1113
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1114
    .prio       = FF_TX_PRIO_BASE,
1115
};
1116
1117
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
1118
    .name       = TX_NAME_STR("fft_pfa_ns"),
1119
    .function   = TX_NAME(ff_tx_fft_pfa_ns),
1120
    .type       = TX_TYPE(FFT),
1121
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |
1122
                  FF_TX_PRESHUFFLE,
1123
    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
1124
    .nb_factors = 2,
1125
    .min_len    = 2*3,
1126
    .max_len    = TX_LEN_UNLIMITED,
1127
    .init       = TX_NAME(ff_tx_fft_pfa_init),
1128
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1129
    .prio       = FF_TX_PRIO_BASE,
1130
};
1131
1132
static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s,
1133
                                                  const FFTXCodelet *cd,
1134
                                                  uint64_t flags,
1135
                                                  FFTXCodeletOptions *opts,
1136
                                                  int len, int inv,
1137
                                                  const void *scale)
1138
0
{
1139
0
    s->scale_d = *((SCALE_TYPE *)scale);
1140
0
    s->scale_f = s->scale_d;
1141
0
    return 0;
1142
0
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_naive_init_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_naive_init_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_naive_init_int32_c
1143
1144
static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
1145
                                          void *_src, ptrdiff_t stride)
1146
0
{
1147
0
    TXSample *src = _src;
1148
0
    TXSample *dst = _dst;
1149
0
    double scale = s->scale_d;
1150
0
    int len = s->len;
1151
0
    const double phase = M_PI/(4.0*len);
1152
1153
0
    stride /= sizeof(*dst);
1154
1155
0
    for (int i = 0; i < len; i++) {
1156
0
        double sum = 0.0;
1157
0
        for (int j = 0; j < len*2; j++) {
1158
0
            int a = (2*j + 1 + len) * (2*i + 1);
1159
0
            sum += UNSCALE(src[j]) * cos(a * phase);
1160
0
        }
1161
0
        dst[i*stride] = RESCALE(sum*scale);
1162
0
    }
1163
0
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_naive_fwd_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_naive_fwd_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_naive_fwd_int32_c
1164
1165
static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
1166
                                          void *_src, ptrdiff_t stride)
1167
0
{
1168
0
    TXSample *src = _src;
1169
0
    TXSample *dst = _dst;
1170
0
    double scale = s->scale_d;
1171
0
    int len = s->len >> 1;
1172
0
    int len2 = len*2;
1173
0
    const double phase = M_PI/(4.0*len2);
1174
1175
0
    stride /= sizeof(*src);
1176
1177
0
    for (int i = 0; i < len; i++) {
1178
0
        double sum_d = 0.0;
1179
0
        double sum_u = 0.0;
1180
0
        double i_d = phase * (4*len  - 2*i - 1);
1181
0
        double i_u = phase * (3*len2 + 2*i + 1);
1182
0
        for (int j = 0; j < len2; j++) {
1183
0
            double a = (2 * j + 1);
1184
0
            double a_d = cos(a * i_d);
1185
0
            double a_u = cos(a * i_u);
1186
0
            double val = UNSCALE(src[j*stride]);
1187
0
            sum_d += a_d * val;
1188
0
            sum_u += a_u * val;
1189
0
        }
1190
0
        dst[i +   0] = RESCALE( sum_d*scale);
1191
0
        dst[i + len] = RESCALE(-sum_u*scale);
1192
0
    }
1193
0
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_naive_inv_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_naive_inv_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_naive_inv_int32_c
1194
1195
static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
1196
    .name       = TX_NAME_STR("mdct_naive_fwd"),
1197
    .function   = TX_NAME(ff_tx_mdct_naive_fwd),
1198
    .type       = TX_TYPE(MDCT),
1199
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1200
    .factors    = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
1201
    .nb_factors = 2,
1202
    .min_len    = 2,
1203
    .max_len    = TX_LEN_UNLIMITED,
1204
    .init       = TX_NAME(ff_tx_mdct_naive_init),
1205
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1206
    .prio       = FF_TX_PRIO_MIN,
1207
};
1208
1209
static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
1210
    .name       = TX_NAME_STR("mdct_naive_inv"),
1211
    .function   = TX_NAME(ff_tx_mdct_naive_inv),
1212
    .type       = TX_TYPE(MDCT),
1213
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1214
    .factors    = { 2, TX_FACTOR_ANY },
1215
    .nb_factors = 2,
1216
    .min_len    = 2,
1217
    .max_len    = TX_LEN_UNLIMITED,
1218
    .init       = TX_NAME(ff_tx_mdct_naive_init),
1219
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1220
    .prio       = FF_TX_PRIO_MIN,
1221
};
1222
1223
static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
1224
                                            const FFTXCodelet *cd,
1225
                                            uint64_t flags,
1226
                                            FFTXCodeletOptions *opts,
1227
                                            int len, int inv,
1228
                                            const void *scale)
1229
659k
{
1230
659k
    int ret;
1231
659k
    FFTXCodeletOptions sub_opts = {
1232
659k
        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
1233
659k
    };
1234
1235
659k
    s->scale_d = *((SCALE_TYPE *)scale);
1236
659k
    s->scale_f = s->scale_d;
1237
1238
659k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1239
659k
    flags |=  AV_TX_INPLACE;      /* in-place */
1240
659k
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
1241
1242
659k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243
659k
                                inv, scale))) {
1244
0
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1245
0
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1246
0
                                    inv, scale)))
1247
0
            return ret;
1248
0
    }
1249
1250
659k
    s->map = av_malloc((len >> 1)*sizeof(*s->map));
1251
659k
    if (!s->map)
1252
0
        return AVERROR(ENOMEM);
1253
1254
    /* If we need to preshuffle copy the map from the subcontext */
1255
659k
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1256
659k
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1257
659k
    } else {
1258
0
        for (int i = 0; i < len >> 1; i++)
1259
0
            s->map[i] = i;
1260
0
    }
1261
1262
659k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1263
0
        return ret;
1264
1265
    /* Saves a multiply in a hot path. */
1266
659k
    if (inv)
1267
121M
        for (int i = 0; i < (s->len >> 1); i++)
1268
120M
            s->map[i] <<= 1;
1269
1270
659k
    return 0;
1271
659k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_init_double_c
tx_float.c:ff_tx_mdct_init_float_c
Line
Count
Source
1229
556k
{
1230
556k
    int ret;
1231
556k
    FFTXCodeletOptions sub_opts = {
1232
556k
        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
1233
556k
    };
1234
1235
556k
    s->scale_d = *((SCALE_TYPE *)scale);
1236
556k
    s->scale_f = s->scale_d;
1237
1238
556k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1239
556k
    flags |=  AV_TX_INPLACE;      /* in-place */
1240
556k
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
1241
1242
556k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243
556k
                                inv, scale))) {
1244
0
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1245
0
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1246
0
                                    inv, scale)))
1247
0
            return ret;
1248
0
    }
1249
1250
556k
    s->map = av_malloc((len >> 1)*sizeof(*s->map));
1251
556k
    if (!s->map)
1252
0
        return AVERROR(ENOMEM);
1253
1254
    /* If we need to preshuffle copy the map from the subcontext */
1255
556k
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1256
556k
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1257
556k
    } else {
1258
0
        for (int i = 0; i < len >> 1; i++)
1259
0
            s->map[i] = i;
1260
0
    }
1261
1262
556k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1263
0
        return ret;
1264
1265
    /* Saves a multiply in a hot path. */
1266
556k
    if (inv)
1267
113M
        for (int i = 0; i < (s->len >> 1); i++)
1268
112M
            s->map[i] <<= 1;
1269
1270
556k
    return 0;
1271
556k
}
tx_int32.c:ff_tx_mdct_init_int32_c
Line
Count
Source
1229
102k
{
1230
102k
    int ret;
1231
102k
    FFTXCodeletOptions sub_opts = {
1232
102k
        .map_dir = !inv ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
1233
102k
    };
1234
1235
102k
    s->scale_d = *((SCALE_TYPE *)scale);
1236
102k
    s->scale_f = s->scale_d;
1237
1238
102k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1239
102k
    flags |=  AV_TX_INPLACE;      /* in-place */
1240
102k
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */
1241
1242
102k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243
102k
                                inv, scale))) {
1244
0
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1245
0
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1246
0
                                    inv, scale)))
1247
0
            return ret;
1248
0
    }
1249
1250
102k
    s->map = av_malloc((len >> 1)*sizeof(*s->map));
1251
102k
    if (!s->map)
1252
0
        return AVERROR(ENOMEM);
1253
1254
    /* If we need to preshuffle copy the map from the subcontext */
1255
102k
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1256
102k
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1257
102k
    } else {
1258
0
        for (int i = 0; i < len >> 1; i++)
1259
0
            s->map[i] = i;
1260
0
    }
1261
1262
102k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1263
0
        return ret;
1264
1265
    /* Saves a multiply in a hot path. */
1266
102k
    if (inv)
1267
8.02M
        for (int i = 0; i < (s->len >> 1); i++)
1268
7.92M
            s->map[i] <<= 1;
1269
1270
102k
    return 0;
1271
102k
}
1272
1273
static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
1274
                                    ptrdiff_t stride)
1275
49.8k
{
1276
49.8k
    TXSample *src = _src, *dst = _dst;
1277
49.8k
    TXComplex *exp = s->exp, tmp, *z = _dst;
1278
49.8k
    const int len2 = s->len >> 1;
1279
49.8k
    const int len4 = s->len >> 2;
1280
49.8k
    const int len3 = len2 * 3;
1281
49.8k
    const int *sub_map = s->map;
1282
1283
49.8k
    stride /= sizeof(*dst);
1284
1285
25.5M
    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1286
25.5M
        const int k = 2*i;
1287
25.5M
        const int idx = sub_map[i];
1288
25.5M
        if (k < len2) {
1289
12.7M
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
1290
12.7M
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1291
12.7M
        } else {
1292
12.7M
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1293
12.7M
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1294
12.7M
        }
1295
25.5M
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1296
25.5M
    }
1297
1298
49.8k
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1299
1300
12.8M
    for (int i = 0; i < len4; i++) {
1301
12.7M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1302
12.7M
        TXComplex src1 = { z[i1].re, z[i1].im };
1303
12.7M
        TXComplex src0 = { z[i0].re, z[i0].im };
1304
1305
12.7M
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1306
12.7M
             exp[i0].im, exp[i0].re);
1307
12.7M
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1308
12.7M
             exp[i1].im, exp[i1].re);
1309
12.7M
    }
1310
49.8k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_fwd_double_c
tx_float.c:ff_tx_mdct_fwd_float_c
Line
Count
Source
1275
19.5k
{
1276
19.5k
    TXSample *src = _src, *dst = _dst;
1277
19.5k
    TXComplex *exp = s->exp, tmp, *z = _dst;
1278
19.5k
    const int len2 = s->len >> 1;
1279
19.5k
    const int len4 = s->len >> 2;
1280
19.5k
    const int len3 = len2 * 3;
1281
19.5k
    const int *sub_map = s->map;
1282
1283
19.5k
    stride /= sizeof(*dst);
1284
1285
10.0M
    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1286
10.0M
        const int k = 2*i;
1287
10.0M
        const int idx = sub_map[i];
1288
10.0M
        if (k < len2) {
1289
5.01M
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
1290
5.01M
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1291
5.01M
        } else {
1292
5.01M
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1293
5.01M
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1294
5.01M
        }
1295
10.0M
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1296
10.0M
    }
1297
1298
19.5k
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1299
1300
5.03M
    for (int i = 0; i < len4; i++) {
1301
5.01M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1302
5.01M
        TXComplex src1 = { z[i1].re, z[i1].im };
1303
5.01M
        TXComplex src0 = { z[i0].re, z[i0].im };
1304
1305
5.01M
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1306
5.01M
             exp[i0].im, exp[i0].re);
1307
5.01M
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1308
5.01M
             exp[i1].im, exp[i1].re);
1309
5.01M
    }
1310
19.5k
}
tx_int32.c:ff_tx_mdct_fwd_int32_c
Line
Count
Source
1275
30.2k
{
1276
30.2k
    TXSample *src = _src, *dst = _dst;
1277
30.2k
    TXComplex *exp = s->exp, tmp, *z = _dst;
1278
30.2k
    const int len2 = s->len >> 1;
1279
30.2k
    const int len4 = s->len >> 2;
1280
30.2k
    const int len3 = len2 * 3;
1281
30.2k
    const int *sub_map = s->map;
1282
1283
30.2k
    stride /= sizeof(*dst);
1284
1285
15.5M
    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1286
15.4M
        const int k = 2*i;
1287
15.4M
        const int idx = sub_map[i];
1288
15.4M
        if (k < len2) {
1289
7.73M
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
1290
7.73M
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1291
7.73M
        } else {
1292
7.73M
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1293
7.73M
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1294
7.73M
        }
1295
15.4M
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1296
15.4M
    }
1297
1298
30.2k
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1299
1300
7.76M
    for (int i = 0; i < len4; i++) {
1301
7.73M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1302
7.73M
        TXComplex src1 = { z[i1].re, z[i1].im };
1303
7.73M
        TXComplex src0 = { z[i0].re, z[i0].im };
1304
1305
7.73M
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1306
7.73M
             exp[i0].im, exp[i0].re);
1307
7.73M
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1308
7.73M
             exp[i1].im, exp[i1].re);
1309
7.73M
    }
1310
30.2k
}
1311
1312
static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
1313
                                    ptrdiff_t stride)
1314
210M
{
1315
210M
    TXComplex *z = _dst, *exp = s->exp;
1316
210M
    const TXSample *src = _src, *in1, *in2;
1317
210M
    const int len2 = s->len >> 1;
1318
210M
    const int len4 = s->len >> 2;
1319
210M
    const int *sub_map = s->map;
1320
1321
210M
    stride /= sizeof(*src);
1322
210M
    in1 = src;
1323
210M
    in2 = src + ((len2*2) - 1) * stride;
1324
1325
7.85G
    for (int i = 0; i < len2; i++) {
1326
7.64G
        int k = sub_map[i];
1327
7.64G
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1328
7.64G
        CMUL3(z[i], tmp, exp[i]);
1329
7.64G
    }
1330
1331
210M
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1332
1333
210M
    exp += len2;
1334
4.03G
    for (int i = 0; i < len4; i++) {
1335
3.82G
        const int i0 = len4 + i, i1 = len4 - i - 1;
1336
3.82G
        TXComplex src1 = { z[i1].im, z[i1].re };
1337
3.82G
        TXComplex src0 = { z[i0].im, z[i0].re };
1338
1339
3.82G
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1340
3.82G
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1341
3.82G
    }
1342
210M
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_inv_double_c
tx_float.c:ff_tx_mdct_inv_float_c
Line
Count
Source
1314
176M
{
1315
176M
    TXComplex *z = _dst, *exp = s->exp;
1316
176M
    const TXSample *src = _src, *in1, *in2;
1317
176M
    const int len2 = s->len >> 1;
1318
176M
    const int len4 = s->len >> 2;
1319
176M
    const int *sub_map = s->map;
1320
1321
176M
    stride /= sizeof(*src);
1322
176M
    in1 = src;
1323
176M
    in2 = src + ((len2*2) - 1) * stride;
1324
1325
6.23G
    for (int i = 0; i < len2; i++) {
1326
6.05G
        int k = sub_map[i];
1327
6.05G
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1328
6.05G
        CMUL3(z[i], tmp, exp[i]);
1329
6.05G
    }
1330
1331
176M
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1332
1333
176M
    exp += len2;
1334
3.20G
    for (int i = 0; i < len4; i++) {
1335
3.02G
        const int i0 = len4 + i, i1 = len4 - i - 1;
1336
3.02G
        TXComplex src1 = { z[i1].im, z[i1].re };
1337
3.02G
        TXComplex src0 = { z[i0].im, z[i0].re };
1338
1339
3.02G
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1340
3.02G
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1341
3.02G
    }
1342
176M
}
tx_int32.c:ff_tx_mdct_inv_int32_c
Line
Count
Source
1314
34.4M
{
1315
34.4M
    TXComplex *z = _dst, *exp = s->exp;
1316
34.4M
    const TXSample *src = _src, *in1, *in2;
1317
34.4M
    const int len2 = s->len >> 1;
1318
34.4M
    const int len4 = s->len >> 2;
1319
34.4M
    const int *sub_map = s->map;
1320
1321
34.4M
    stride /= sizeof(*src);
1322
34.4M
    in1 = src;
1323
34.4M
    in2 = src + ((len2*2) - 1) * stride;
1324
1325
1.62G
    for (int i = 0; i < len2; i++) {
1326
1.58G
        int k = sub_map[i];
1327
1.58G
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1328
1.58G
        CMUL3(z[i], tmp, exp[i]);
1329
1.58G
    }
1330
1331
34.4M
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1332
1333
34.4M
    exp += len2;
1334
827M
    for (int i = 0; i < len4; i++) {
1335
793M
        const int i0 = len4 + i, i1 = len4 - i - 1;
1336
793M
        TXComplex src1 = { z[i1].im, z[i1].re };
1337
793M
        TXComplex src0 = { z[i0].im, z[i0].re };
1338
1339
793M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1340
793M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1341
793M
    }
1342
34.4M
}
1343
1344
static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
1345
    .name       = TX_NAME_STR("mdct_fwd"),
1346
    .function   = TX_NAME(ff_tx_mdct_fwd),
1347
    .type       = TX_TYPE(MDCT),
1348
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1349
    .factors    = { 2, TX_FACTOR_ANY },
1350
    .nb_factors = 2,
1351
    .min_len    = 2,
1352
    .max_len    = TX_LEN_UNLIMITED,
1353
    .init       = TX_NAME(ff_tx_mdct_init),
1354
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1355
    .prio       = FF_TX_PRIO_BASE,
1356
};
1357
1358
static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
1359
    .name       = TX_NAME_STR("mdct_inv"),
1360
    .function   = TX_NAME(ff_tx_mdct_inv),
1361
    .type       = TX_TYPE(MDCT),
1362
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1363
    .factors    = { 2, TX_FACTOR_ANY },
1364
    .nb_factors = 2,
1365
    .min_len    = 2,
1366
    .max_len    = TX_LEN_UNLIMITED,
1367
    .init       = TX_NAME(ff_tx_mdct_init),
1368
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1369
    .prio       = FF_TX_PRIO_BASE,
1370
};
1371
1372
static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s,
1373
                                                     const FFTXCodelet *cd,
1374
                                                     uint64_t flags,
1375
                                                     FFTXCodeletOptions *opts,
1376
                                                     int len, int inv,
1377
                                                     const void *scale)
1378
38.9k
{
1379
38.9k
    int ret;
1380
1381
38.9k
    s->scale_d = *((SCALE_TYPE *)scale);
1382
38.9k
    s->scale_f = s->scale_d;
1383
1384
38.9k
    flags &= ~AV_TX_FULL_IMDCT;
1385
1386
38.9k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1387
0
        return ret;
1388
1389
38.9k
    return 0;
1390
38.9k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_inv_full_init_double_c
tx_float.c:ff_tx_mdct_inv_full_init_float_c
Line
Count
Source
1378
38.9k
{
1379
38.9k
    int ret;
1380
1381
38.9k
    s->scale_d = *((SCALE_TYPE *)scale);
1382
38.9k
    s->scale_f = s->scale_d;
1383
1384
38.9k
    flags &= ~AV_TX_FULL_IMDCT;
1385
1386
38.9k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1387
0
        return ret;
1388
1389
38.9k
    return 0;
1390
38.9k
}
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_inv_full_init_int32_c
1391
1392
static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1393
                                         void *_src, ptrdiff_t stride)
1394
5.35M
{
1395
5.35M
    int len  = s->len << 1;
1396
5.35M
    int len2 = len >> 1;
1397
5.35M
    int len4 = len >> 2;
1398
5.35M
    TXSample *dst = _dst;
1399
1400
5.35M
    s->fn[0](&s->sub[0], dst + len4, _src, stride);
1401
1402
5.35M
    stride /= sizeof(*dst);
1403
1404
597M
    for (int i = 0; i < len4; i++) {
1405
592M
        dst[            i*stride] = -dst[(len2 - i - 1)*stride];
1406
592M
        dst[(len - i - 1)*stride] =  dst[(len2 + i + 0)*stride];
1407
592M
    }
1408
5.35M
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_inv_full_double_c
tx_float.c:ff_tx_mdct_inv_full_float_c
Line
Count
Source
1394
5.35M
{
1395
5.35M
    int len  = s->len << 1;
1396
5.35M
    int len2 = len >> 1;
1397
5.35M
    int len4 = len >> 2;
1398
5.35M
    TXSample *dst = _dst;
1399
1400
5.35M
    s->fn[0](&s->sub[0], dst + len4, _src, stride);
1401
1402
5.35M
    stride /= sizeof(*dst);
1403
1404
597M
    for (int i = 0; i < len4; i++) {
1405
592M
        dst[            i*stride] = -dst[(len2 - i - 1)*stride];
1406
592M
        dst[(len - i - 1)*stride] =  dst[(len2 + i + 0)*stride];
1407
592M
    }
1408
5.35M
}
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_inv_full_int32_c
1409
1410
static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
1411
    .name       = TX_NAME_STR("mdct_inv_full"),
1412
    .function   = TX_NAME(ff_tx_mdct_inv_full),
1413
    .type       = TX_TYPE(MDCT),
1414
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
1415
                  FF_TX_OUT_OF_PLACE | AV_TX_FULL_IMDCT,
1416
    .factors    = { 2, TX_FACTOR_ANY },
1417
    .nb_factors = 2,
1418
    .min_len    = 2,
1419
    .max_len    = TX_LEN_UNLIMITED,
1420
    .init       = TX_NAME(ff_tx_mdct_inv_full_init),
1421
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1422
    .prio       = FF_TX_PRIO_BASE,
1423
};
1424
1425
static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
1426
                                                const FFTXCodelet *cd,
1427
                                                uint64_t flags,
1428
                                                FFTXCodeletOptions *opts,
1429
                                                int len, int inv,
1430
                                                const void *scale)
1431
420k
{
1432
420k
    int ret, sub_len;
1433
420k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1434
1435
420k
    len >>= 1;
1436
420k
    sub_len = len / cd->factors[0];
1437
1438
420k
    s->scale_d = *((SCALE_TYPE *)scale);
1439
420k
    s->scale_f = s->scale_d;
1440
1441
420k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1442
420k
    flags |=  AV_TX_INPLACE;      /* in-place */
1443
420k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
1444
1445
420k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1446
420k
                                sub_len, inv, scale)))
1447
0
        return ret;
1448
1449
420k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1450
0
        return ret;
1451
1452
    /* Our 15-point transform is also a compound one, so embed its input map */
1453
420k
    if (cd->factors[0] == 15)
1454
365k
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1455
1456
420k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1457
0
        return ret;
1458
1459
    /* Saves multiplies in loops. */
1460
97.2M
    for (int i = 0; i < len; i++)
1461
96.8M
        s->map[i] <<= 1;
1462
1463
420k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1464
0
        return AVERROR(ENOMEM);
1465
1466
420k
    TX_TAB(ff_tx_init_tabs)(len / sub_len);
1467
1468
420k
    return 0;
1469
420k
}
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_init_double_c
tx_float.c:ff_tx_mdct_pfa_init_float_c
Line
Count
Source
1431
393k
{
1432
393k
    int ret, sub_len;
1433
393k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1434
1435
393k
    len >>= 1;
1436
393k
    sub_len = len / cd->factors[0];
1437
1438
393k
    s->scale_d = *((SCALE_TYPE *)scale);
1439
393k
    s->scale_f = s->scale_d;
1440
1441
393k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1442
393k
    flags |=  AV_TX_INPLACE;      /* in-place */
1443
393k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
1444
1445
393k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1446
393k
                                sub_len, inv, scale)))
1447
0
        return ret;
1448
1449
393k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1450
0
        return ret;
1451
1452
    /* Our 15-point transform is also a compound one, so embed its input map */
1453
393k
    if (cd->factors[0] == 15)
1454
349k
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1455
1456
393k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1457
0
        return ret;
1458
1459
    /* Saves multiplies in loops. */
1460
90.6M
    for (int i = 0; i < len; i++)
1461
90.3M
        s->map[i] <<= 1;
1462
1463
393k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1464
0
        return AVERROR(ENOMEM);
1465
1466
393k
    TX_TAB(ff_tx_init_tabs)(len / sub_len);
1467
1468
393k
    return 0;
1469
393k
}
tx_int32.c:ff_tx_mdct_pfa_init_int32_c
Line
Count
Source
1431
27.0k
{
1432
27.0k
    int ret, sub_len;
1433
27.0k
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1434
1435
27.0k
    len >>= 1;
1436
27.0k
    sub_len = len / cd->factors[0];
1437
1438
27.0k
    s->scale_d = *((SCALE_TYPE *)scale);
1439
27.0k
    s->scale_f = s->scale_d;
1440
1441
27.0k
    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1442
27.0k
    flags |=  AV_TX_INPLACE;      /* in-place */
1443
27.0k
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */
1444
1445
27.0k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1446
27.0k
                                sub_len, inv, scale)))
1447
0
        return ret;
1448
1449
27.0k
    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1450
0
        return ret;
1451
1452
    /* Our 15-point transform is also a compound one, so embed its input map */
1453
27.0k
    if (cd->factors[0] == 15)
1454
16.2k
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1455
1456
27.0k
    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1457
0
        return ret;
1458
1459
    /* Saves multiplies in loops. */
1460
6.57M
    for (int i = 0; i < len; i++)
1461
6.54M
        s->map[i] <<= 1;
1462
1463
27.0k
    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1464
0
        return AVERROR(ENOMEM);
1465
1466
27.0k
    TX_TAB(ff_tx_init_tabs)(len / sub_len);
1467
1468
27.0k
    return 0;
1469
27.0k
}
1470
1471
/*
 * DECL_COMP_IMDCT(N): emits the C inverse-MDCT routine
 * ff_tx_mdct_pfa_<N>xM_inv together with its FFTXCodelet descriptor
 * (inverse-only, factors { N, TX_FACTOR_ANY }, minimum length N*2,
 * initialised by ff_tx_mdct_pfa_init).
 *
 * With m = s->sub->len, len2 = s->len >> 1 and len4 = s->len >> 2, the
 * generated function runs three passes:
 *   1. For every group of N entries of s->map, each offset k reads one
 *      sample forwards through in1 and one backwards through in2 (which
 *      starts at element (N*m*2 - 1) * stride). The pair is multiplied by
 *      the matching exp entry with CMUL3 and the N-point fft stores the
 *      group into s->tmp at the offset taken from the subtransform's map;
 *      m is passed as its last argument (presumably the output stride).
 *      exp and in_map both advance by N per group.
 *   2. The subtransform s->fn[0] is run in place on each of the N m-sized
 *      slices of s->tmp.
 *   3. For i in [0, len4) the pair i0 = len4 + i, i1 = len4 - i - 1 is
 *      looked up through out_map (= s->map + N*m), the re/im parts of the
 *      selected s->tmp entries are swapped, and CMUL with exp[i0] / exp[i1]
 *      writes the results into the destination as TXComplex values. exp
 *      still carries the advance accumulated during pass 1.
 *
 * stride arrives in bytes, is converted to a TXSample count and is applied
 * to the source reads only; the destination is indexed contiguously.
 */
#define DECL_COMP_IMDCT(N)                                                     \
1472
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,    \
1473
1.59M
                                                void *_src, ptrdiff_t stride)  \
1474
1.59M
{                                                                              \
1475
1.59M
    TXComplex fft##N##in[N];                                                   \
1476
1.59M
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
1.59M
    const TXSample *src = _src, *in1, *in2;                                    \
1478
1.59M
    const int len4 = s->len >> 2;                                              \
1479
1.59M
    const int len2 = s->len >> 1;                                              \
1480
1.59M
    const int m = s->sub->len;                                                 \
1481
1.59M
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
1.59M
    const int *sub_map = s->sub->map;                                          \
1483
1.59M
                                                                               \
1484
1.59M
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
1.59M
    in1 = src;                                                                 \
1486
1.59M
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
1.59M
                                                                               \
1488
60.9M
    for (int i = 0; i < len2; i += N) {                                        \
1489
343M
        for (int j = 0; j < N; j++) {                                          \
1490
284M
            const int k = in_map[j];                                           \
1491
284M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
284M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
284M
        }                                                                      \
1494
59.4M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
59.4M
        exp += N;                                                              \
1496
59.4M
        in_map += N;                                                           \
1497
59.4M
    }                                                                          \
1498
1.59M
                                                                               \
1499
12.3M
    for (int i = 0; i < N; i++)                                                \
1500
10.8M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
1.59M
                                                                               \
1502
143M
    for (int i = 0; i < len4; i++) {                                           \
1503
142M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
142M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
142M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
142M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
142M
                                                                               \
1508
142M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
142M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
142M
    }                                                                          \
1511
1.59M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_3xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_5xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_7xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_9xM_inv_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_15xM_inv_double_c
tx_float.c:ff_tx_mdct_pfa_3xM_inv_float_c
Line
Count
Source
1473
418k
                                                void *_src, ptrdiff_t stride)  \
1474
418k
{                                                                              \
1475
418k
    TXComplex fft##N##in[N];                                                   \
1476
418k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
418k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
418k
    const int len4 = s->len >> 2;                                              \
1479
418k
    const int len2 = s->len >> 1;                                              \
1480
418k
    const int m = s->sub->len;                                                 \
1481
418k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
418k
    const int *sub_map = s->sub->map;                                          \
1483
418k
                                                                               \
1484
418k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
418k
    in1 = src;                                                                 \
1486
418k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
418k
                                                                               \
1488
29.4M
    for (int i = 0; i < len2; i += N) {                                        \
1489
116M
        for (int j = 0; j < N; j++) {                                          \
1490
87.0M
            const int k = in_map[j];                                           \
1491
87.0M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
87.0M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
87.0M
        }                                                                      \
1494
29.0M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
29.0M
        exp += N;                                                              \
1496
29.0M
        in_map += N;                                                           \
1497
29.0M
    }                                                                          \
1498
418k
                                                                               \
1499
1.67M
    for (int i = 0; i < N; i++)                                                \
1500
1.25M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
418k
                                                                               \
1502
43.9M
    for (int i = 0; i < len4; i++) {                                           \
1503
43.5M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
43.5M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
43.5M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
43.5M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
43.5M
                                                                               \
1508
43.5M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
43.5M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
43.5M
    }                                                                          \
1511
418k
}                                                                              \
tx_float.c:ff_tx_mdct_pfa_5xM_inv_float_c
Line
Count
Source
1473
808k
                                                void *_src, ptrdiff_t stride)  \
1474
808k
{                                                                              \
1475
808k
    TXComplex fft##N##in[N];                                                   \
1476
808k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
808k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
808k
    const int len4 = s->len >> 2;                                              \
1479
808k
    const int len2 = s->len >> 1;                                              \
1480
808k
    const int m = s->sub->len;                                                 \
1481
808k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
808k
    const int *sub_map = s->sub->map;                                          \
1483
808k
                                                                               \
1484
808k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
808k
    in1 = src;                                                                 \
1486
808k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
808k
                                                                               \
1488
26.6M
    for (int i = 0; i < len2; i += N) {                                        \
1489
155M
        for (int j = 0; j < N; j++) {                                          \
1490
129M
            const int k = in_map[j];                                           \
1491
129M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
129M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
129M
        }                                                                      \
1494
25.8M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
25.8M
        exp += N;                                                              \
1496
25.8M
        in_map += N;                                                           \
1497
25.8M
    }                                                                          \
1498
808k
                                                                               \
1499
4.84M
    for (int i = 0; i < N; i++)                                                \
1500
4.04M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
808k
                                                                               \
1502
65.4M
    for (int i = 0; i < len4; i++) {                                           \
1503
64.6M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
64.6M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
64.6M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
64.6M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
64.6M
                                                                               \
1508
64.6M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
64.6M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
64.6M
    }                                                                          \
1511
808k
}                                                                              \
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_7xM_inv_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_9xM_inv_float_c
tx_float.c:ff_tx_mdct_pfa_15xM_inv_float_c
Line
Count
Source
1473
352k
                                                void *_src, ptrdiff_t stride)  \
1474
352k
{                                                                              \
1475
352k
    TXComplex fft##N##in[N];                                                   \
1476
352k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
352k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
352k
    const int len4 = s->len >> 2;                                              \
1479
352k
    const int len2 = s->len >> 1;                                              \
1480
352k
    const int m = s->sub->len;                                                 \
1481
352k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
352k
    const int *sub_map = s->sub->map;                                          \
1483
352k
                                                                               \
1484
352k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
352k
    in1 = src;                                                                 \
1486
352k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
352k
                                                                               \
1488
4.65M
    for (int i = 0; i < len2; i += N) {                                        \
1489
68.8M
        for (int j = 0; j < N; j++) {                                          \
1490
64.5M
            const int k = in_map[j];                                           \
1491
64.5M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
64.5M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
64.5M
        }                                                                      \
1494
4.30M
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
4.30M
        exp += N;                                                              \
1496
4.30M
        in_map += N;                                                           \
1497
4.30M
    }                                                                          \
1498
352k
                                                                               \
1499
5.64M
    for (int i = 0; i < N; i++)                                                \
1500
5.29M
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
352k
                                                                               \
1502
32.6M
    for (int i = 0; i < len4; i++) {                                           \
1503
32.2M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
32.2M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
32.2M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
32.2M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
32.2M
                                                                               \
1508
32.2M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
32.2M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
32.2M
    }                                                                          \
1511
352k
}                                                                              \
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_3xM_inv_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_5xM_inv_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_7xM_inv_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_9xM_inv_int32_c
tx_int32.c:ff_tx_mdct_pfa_15xM_inv_int32_c
Line
Count
Source
1473
14.0k
                                                void *_src, ptrdiff_t stride)  \
1474
14.0k
{                                                                              \
1475
14.0k
    TXComplex fft##N##in[N];                                                   \
1476
14.0k
    TXComplex *z = _dst, *exp = s->exp;                                        \
1477
14.0k
    const TXSample *src = _src, *in1, *in2;                                    \
1478
14.0k
    const int len4 = s->len >> 2;                                              \
1479
14.0k
    const int len2 = s->len >> 1;                                              \
1480
14.0k
    const int m = s->sub->len;                                                 \
1481
14.0k
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1482
14.0k
    const int *sub_map = s->sub->map;                                          \
1483
14.0k
                                                                               \
1484
14.0k
    stride /= sizeof(*src); /* To convert it from bytes */                     \
1485
14.0k
    in1 = src;                                                                 \
1486
14.0k
    in2 = src + ((N*m*2) - 1) * stride;                                        \
1487
14.0k
                                                                               \
1488
230k
    for (int i = 0; i < len2; i += N) {                                        \
1489
3.46M
        for (int j = 0; j < N; j++) {                                          \
1490
3.24M
            const int k = in_map[j];                                           \
1491
3.24M
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
1492
3.24M
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
1493
3.24M
        }                                                                      \
1494
216k
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
1495
216k
        exp += N;                                                              \
1496
216k
        in_map += N;                                                           \
1497
216k
    }                                                                          \
1498
14.0k
                                                                               \
1499
224k
    for (int i = 0; i < N; i++)                                                \
1500
210k
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1501
14.0k
                                                                               \
1502
1.63M
    for (int i = 0; i < len4; i++) {                                           \
1503
1.62M
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
1504
1.62M
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1505
1.62M
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
1506
1.62M
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
1507
1.62M
                                                                               \
1508
1.62M
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
1509
1.62M
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
1510
1.62M
    }                                                                          \
1511
14.0k
}                                                                              \
1512
                                                                               \
1513
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = {           \
1514
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_inv"),                        \
1515
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv),                         \
1516
    .type       = TX_TYPE(MDCT),                                               \
1517
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,   \
1518
    .factors    = { N, TX_FACTOR_ANY },                                        \
1519
    .nb_factors = 2,                                                           \
1520
    .min_len    = N*2,                                                         \
1521
    .max_len    = TX_LEN_UNLIMITED,                                            \
1522
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
1523
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1524
    .prio       = FF_TX_PRIO_BASE,                                             \
1525
};
1526
1527
DECL_COMP_IMDCT(3)
1528
DECL_COMP_IMDCT(5)
1529
DECL_COMP_IMDCT(7)
1530
DECL_COMP_IMDCT(9)
1531
DECL_COMP_IMDCT(15)
1532
1533
/*
 * DECL_COMP_MDCT(N): emits the C forward-MDCT routine
 * ff_tx_mdct_pfa_<N>xM_fwd together with its FFTXCodelet descriptor
 * (forward-only, factors { N, TX_FACTOR_ANY }, minimum length N*2,
 * initialised by ff_tx_mdct_pfa_init).
 *
 * With m = s->sub->len, len4 = N*m, len3 = 3*len4 and len8 = s->len >> 2,
 * the generated function runs three passes:
 *   1. Folding: for each of the m groups of N map entries,
 *      k = in_map[i*N + j] selects four source samples that FOLD combines
 *      into tmp.re / tmp.im; a different index pattern is used for
 *      k < len4 and for k >= len4. The folded value is multiplied by
 *      exp[k >> 1] with CMUL (outputs are given as .im first, then .re) and
 *      the N-point fft stores the group into s->tmp + sub_map[i].
 *   2. The subtransform s->fn[0] is run in place on each of the N m-sized
 *      slices of s->tmp.
 *   3. For i in [0, len8) the pair i0 = len8 + i, i1 = len8 - i - 1 is
 *      looked up through out_map (= s->map + N*m), multiplied by exp[i0] /
 *      exp[i1] with CMUL, and written to dst at element offsets
 *      2*i0*stride, 2*i1*stride and the same offsets plus one stride.
 *
 * stride arrives in bytes, is converted to a TXSample count and is applied
 * to the destination writes only; the source is indexed contiguously.
 *
 * NOTE(review): exp[k >> 1] presumably undoes the "s->map[i] <<= 1" doubling
 * performed on the map by ff_tx_mdct_pfa_init - confirm against the init.
 */
#define DECL_COMP_MDCT(N)                                                      \
1534
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst,    \
1535
0
                                                void *_src, ptrdiff_t stride)  \
1536
0
{                                                                              \
1537
0
    TXComplex fft##N##in[N];                                                   \
1538
0
    TXSample *src = _src, *dst = _dst;                                         \
1539
0
    TXComplex *exp = s->exp, tmp;                                              \
1540
0
    const int m = s->sub->len;                                                 \
1541
0
    const int len4 = N*m;                                                      \
1542
0
    const int len3 = len4 * 3;                                                 \
1543
0
    const int len8 = s->len >> 2;                                              \
1544
0
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
1545
0
    const int *sub_map = s->sub->map;                                          \
1546
0
                                                                               \
1547
0
    stride /= sizeof(*dst);                                                    \
1548
0
                                                                               \
1549
0
    for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */             \
1550
0
        for (int j = 0; j < N; j++) {                                          \
1551
0
            const int k = in_map[i*N + j];                                     \
1552
0
            if (k < len4) {                                                    \
1553
0
                tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);          \
1554
0
                tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);          \
1555
0
            } else {                                                           \
1556
0
                tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);          \
1557
0
                tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);          \
1558
0
            }                                                                  \
1559
0
            CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im,           \
1560
0
                 exp[k >> 1].re, exp[k >> 1].im);                              \
1561
0
        }                                                                      \
1562
0
        fft##N(s->tmp + sub_map[i], fft##N##in, m);                            \
1563
0
    }                                                                          \
1564
0
                                                                               \
1565
0
    for (int i = 0; i < N; i++)                                                \
1566
0
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
1567
0
                                                                               \
1568
0
    for (int i = 0; i < len8; i++) {                                           \
1569
0
        const int i0 = len8 + i, i1 = len8 - i - 1;                            \
1570
0
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
1571
0
        TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im };                     \
1572
0
        TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im };                     \
1573
0
                                                                               \
1574
0
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,    \
1575
0
             exp[i0].im, exp[i0].re);                                          \
1576
0
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,    \
1577
0
             exp[i1].im, exp[i1].re);                                          \
1578
0
    }                                                                          \
1579
0
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_3xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_5xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_7xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_9xM_fwd_double_c
Unexecuted instantiation: tx_double.c:ff_tx_mdct_pfa_15xM_fwd_double_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_3xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_5xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_7xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_9xM_fwd_float_c
Unexecuted instantiation: tx_float.c:ff_tx_mdct_pfa_15xM_fwd_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_3xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_5xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_7xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_9xM_fwd_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_mdct_pfa_15xM_fwd_int32_c
1580
                                                                               \
1581
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = {           \
1582
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"),                        \
1583
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd),                         \
1584
    .type       = TX_TYPE(MDCT),                                               \
1585
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,   \
1586
    .factors    = { N, TX_FACTOR_ANY },                                        \
1587
    .nb_factors = 2,                                                           \
1588
    .min_len    = N*2,                                                         \
1589
    .max_len    = TX_LEN_UNLIMITED,                                            \
1590
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
1591
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1592
    .prio       = FF_TX_PRIO_BASE,                                             \
1593
};
1594
1595
DECL_COMP_MDCT(3)
1596
DECL_COMP_MDCT(5)
1597
DECL_COMP_MDCT(7)
1598
DECL_COMP_MDCT(9)
1599
DECL_COMP_MDCT(15)
1600
1601
/*
 * Init routine for the RDFT transforms: sets up a half-length FFT
 * subtransform and fills s->exp with 8 scaling factors followed by two
 * trigonometric tables of len4 entries each (read back by the
 * ff_tx_rdft_* functions as fact, tcos and tsin).
 *
 * s      transform context; scale_d, scale_f and exp are written here
 * cd     codelet descriptor (not referenced in this function)
 * flags  transform flags; AV_TX_REAL_TO_REAL is remembered locally, then it
 *        and AV_TX_REAL_TO_IMAGINARY are cleared before the subtransform
 *        is initialised
 * opts   codelet options (not referenced in this function)
 * len    transform length
 * inv    non-zero for the inverse transform
 * scale  pointer to a SCALE_TYPE scaling factor
 *
 * Returns 0 on success, the error from ff_tx_init_subtx(), or
 * AVERROR(ENOMEM) if the table cannot be allocated.
 *
 * NOTE(review): the second table is placed at offset 8 + FFALIGN(len, 4) / 4
 * here, while DECL_RDFT locates it at 8 + (s->len >> 2); the two agree only
 * when len is a multiple of 4 - confirm the codelets enforce that.
 */
static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
1602
                                            const FFTXCodelet *cd,
1603
                                            uint64_t flags,
1604
                                            FFTXCodeletOptions *opts,
1605
                                            int len, int inv,
1606
                                            const void *scale)
1607
5.95k
{
1608
5.95k
    int ret;
1609
5.95k
    double f, m;
1610
5.95k
    TXSample *tab;
1611
5.95k
    /* Sample the real-to-real flag now; it is cleared from flags below. */
    uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
1612
5.95k
    /* Quarter of the length, rounded up to a whole number of entries. */
    int len4 = FFALIGN(len, 4) / 4;
1613
1614
5.95k
    s->scale_d = *((SCALE_TYPE *)scale);
1615
5.95k
    s->scale_f = s->scale_d;
1616
1617
5.95k
    /* The subtransform is a plain FFT, so drop the real-only flags. */
    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
1618
1619
5.95k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1620
0
        return ret;
1621
1622
5.95k
    /* Zeroed storage sized as 8 + 2*len4 elements of *s->exp; it is filled
     * below through a TXSample pointer. */
    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
1623
0
        return AVERROR(ENOMEM);
1624
1625
5.95k
    tab = (TXSample *)s->exp;
1626
1627
5.95k
    /* Angular step between consecutive table entries. */
    f = 2*M_PI/len;
1628
1629
5.95k
    /* The inverse transform uses twice the user scale as its base factor. */
    m = (inv ? 2*s->scale_d : s->scale_d);
1630
1631
5.95k
    /* Slots 0-7: constant factors derived from m. */
    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1632
5.95k
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
1633
5.95k
    *tab++ = RESCALE( m);
1634
5.95k
    *tab++ = RESCALE(-m);
1635
1636
5.95k
    *tab++ = RESCALE( (0.5 - 0.0) * m);
1637
5.95k
    /* Slot 5 holds the reciprocal of the scale in real-to-real mode; unlike
     * every other entry it is stored without going through RESCALE. */
    if (r2r)
1638
780
        *tab++ = 1 / s->scale_f;
1639
5.17k
    else
1640
5.17k
        *tab++ = RESCALE( (0.0 - 0.5) * m);
1641
5.95k
    *tab++ = RESCALE( (0.5 - inv) * m);
1642
5.95k
    *tab++ = RESCALE(-(0.5 - inv) * m);
1643
1644
783k
    /* First table: cos(i*f) for i in [0, len4). */
    for (int i = 0; i < len4; i++)
1645
777k
        *tab++ = RESCALE(cos(i*f));
1646
1647
5.95k
    /* Second table starts right after the 8 factors and the first table. */
    tab = ((TXSample *)s->exp) + len4 + 8;
1648
1649
783k
    /* ((len - i*4)/4.0)*f equals pi/2 - i*f, so each entry is sin(i*f),
     * negated for the forward transform. */
    for (int i = 0; i < len4; i++)
1650
777k
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
1651
1652
5.95k
    return 0;
1653
5.95k
}
Unexecuted instantiation: tx_double.c:ff_tx_rdft_init_double_c
tx_float.c:ff_tx_rdft_init_float_c
Line
Count
Source
1607
5.95k
{
1608
5.95k
    int ret;
1609
5.95k
    double f, m;
1610
5.95k
    TXSample *tab;
1611
5.95k
    uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
1612
5.95k
    int len4 = FFALIGN(len, 4) / 4;
1613
1614
5.95k
    s->scale_d = *((SCALE_TYPE *)scale);
1615
5.95k
    s->scale_f = s->scale_d;
1616
1617
5.95k
    flags &= ~(AV_TX_REAL_TO_REAL | AV_TX_REAL_TO_IMAGINARY);
1618
1619
5.95k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1620
0
        return ret;
1621
1622
5.95k
    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
1623
0
        return AVERROR(ENOMEM);
1624
1625
5.95k
    tab = (TXSample *)s->exp;
1626
1627
5.95k
    f = 2*M_PI/len;
1628
1629
5.95k
    m = (inv ? 2*s->scale_d : s->scale_d);
1630
1631
5.95k
    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1632
5.95k
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
1633
5.95k
    *tab++ = RESCALE( m);
1634
5.95k
    *tab++ = RESCALE(-m);
1635
1636
5.95k
    *tab++ = RESCALE( (0.5 - 0.0) * m);
1637
5.95k
    if (r2r)
1638
780
        *tab++ = 1 / s->scale_f;
1639
5.17k
    else
1640
5.17k
        *tab++ = RESCALE( (0.0 - 0.5) * m);
1641
5.95k
    *tab++ = RESCALE( (0.5 - inv) * m);
1642
5.95k
    *tab++ = RESCALE(-(0.5 - inv) * m);
1643
1644
783k
    for (int i = 0; i < len4; i++)
1645
777k
        *tab++ = RESCALE(cos(i*f));
1646
1647
5.95k
    tab = ((TXSample *)s->exp) + len4 + 8;
1648
1649
783k
    for (int i = 0; i < len4; i++)
1650
777k
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
1651
1652
5.95k
    return 0;
1653
5.95k
}
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_init_int32_c
1654
1655
#define DECL_RDFT(n, inv)                                                      \
1656
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
1657
10.0M
                                     void *_src, ptrdiff_t stride)             \
1658
10.0M
{                                                                              \
1659
10.0M
    const int len2 = s->len >> 1;                                              \
1660
10.0M
    const int len4 = s->len >> 2;                                              \
1661
10.0M
    const TXSample *fact = (void *)s->exp;                                     \
1662
10.0M
    const TXSample *tcos = fact + 8;                                           \
1663
10.0M
    const TXSample *tsin = tcos + len4;                                        \
1664
10.0M
    TXComplex *data = inv ? _src : _dst;                                       \
1665
10.0M
    TXComplex t[3];                                                            \
1666
10.0M
                                                                               \
1667
10.0M
    if (!inv)                                                                  \
1668
10.0M
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
1669
10.0M
    else                                                                       \
1670
10.0M
        data[0].im = data[len2].re;                                            \
1671
10.0M
                                                                               \
1672
10.0M
    /* The DC value's both components are real, but we need to change them     \
1673
10.0M
     * into complex values. Also, the middle of the array is special-cased.    \
1674
10.0M
     * These operations can be done before or after the loop. */               \
1675
10.0M
    t[0].re = data[0].re;                                                      \
1676
10.0M
    data[0].re = t[0].re + data[0].im;                                         \
1677
10.0M
    data[0].im = t[0].re - data[0].im;                                         \
1678
10.0M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1679
10.0M
    data[   0].im = MULT(fact[1], data[   0].im);                              \
1680
10.0M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1681
10.0M
    data[len4].im = MULT(fact[3], data[len4].im);                              \
1682
10.0M
                                                                               \
1683
496M
    for (int i = 1; i < len4; i++) {                                           \
1684
486M
        /* Separate even and odd FFTs */                                       \
1685
486M
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
1686
486M
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
1687
486M
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
1688
486M
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
1689
486M
                                                                               \
1690
486M
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
1691
486M
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
1692
486M
                                                                               \
1693
486M
        data[       i].re = t[0].re + t[2].re;                                 \
1694
486M
        data[       i].im = t[2].im - t[0].im;                                 \
1695
486M
        data[len2 - i].re = t[0].re - t[2].re;                                 \
1696
486M
        data[len2 - i].im = t[2].im + t[0].im;                                 \
1697
486M
    }                                                                          \
1698
10.0M
                                                                               \
1699
10.0M
    if (inv) {                                                                 \
1700
5.22M
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
1701
5.22M
    } else {                                                                   \
1702
4.85M
        /* Move [0].im to the last position, as convention requires */         \
1703
4.85M
        data[len2].re = data[0].im;                                            \
1704
4.85M
        data[   0].im = data[len2].im = 0;                                     \
1705
4.85M
    }                                                                          \
1706
10.0M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2c_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_c2r_double_c
tx_float.c:ff_tx_rdft_r2c_float_c
Line
Count
Source
1657
4.85M
                                     void *_src, ptrdiff_t stride)             \
1658
4.85M
{                                                                              \
1659
4.85M
    const int len2 = s->len >> 1;                                              \
1660
4.85M
    const int len4 = s->len >> 2;                                              \
1661
4.85M
    const TXSample *fact = (void *)s->exp;                                     \
1662
4.85M
    const TXSample *tcos = fact + 8;                                           \
1663
4.85M
    const TXSample *tsin = tcos + len4;                                        \
1664
4.85M
    TXComplex *data = inv ? _src : _dst;                                       \
1665
4.85M
    TXComplex t[3];                                                            \
1666
4.85M
                                                                               \
1667
4.85M
    if (!inv)                                                                  \
1668
4.85M
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
1669
4.85M
    else                                                                       \
1670
4.85M
        data[0].im = data[len2].re;                                            \
1671
4.85M
                                                                               \
1672
4.85M
    /* The DC value's both components are real, but we need to change them     \
1673
4.85M
     * into complex values. Also, the middle of the array is special-cased.    \
1674
4.85M
     * These operations can be done before or after the loop. */               \
1675
4.85M
    t[0].re = data[0].re;                                                      \
1676
4.85M
    data[0].re = t[0].re + data[0].im;                                         \
1677
4.85M
    data[0].im = t[0].re - data[0].im;                                         \
1678
4.85M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1679
4.85M
    data[   0].im = MULT(fact[1], data[   0].im);                              \
1680
4.85M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1681
4.85M
    data[len4].im = MULT(fact[3], data[len4].im);                              \
1682
4.85M
                                                                               \
1683
155M
    for (int i = 1; i < len4; i++) {                                           \
1684
150M
        /* Separate even and odd FFTs */                                       \
1685
150M
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
1686
150M
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
1687
150M
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
1688
150M
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
1689
150M
                                                                               \
1690
150M
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
1691
150M
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
1692
150M
                                                                               \
1693
150M
        data[       i].re = t[0].re + t[2].re;                                 \
1694
150M
        data[       i].im = t[2].im - t[0].im;                                 \
1695
150M
        data[len2 - i].re = t[0].re - t[2].re;                                 \
1696
150M
        data[len2 - i].im = t[2].im + t[0].im;                                 \
1697
150M
    }                                                                          \
1698
4.85M
                                                                               \
1699
4.85M
    if (inv) {                                                                 \
1700
0
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
1701
4.85M
    } else {                                                                   \
1702
4.85M
        /* Move [0].im to the last position, as convention requires */         \
1703
4.85M
        data[len2].re = data[0].im;                                            \
1704
4.85M
        data[   0].im = data[len2].im = 0;                                     \
1705
4.85M
    }                                                                          \
1706
4.85M
}                                                                              \
tx_float.c:ff_tx_rdft_c2r_float_c
Line
Count
Source
1657
5.22M
                                     void *_src, ptrdiff_t stride)             \
1658
5.22M
{                                                                              \
1659
5.22M
    const int len2 = s->len >> 1;                                              \
1660
5.22M
    const int len4 = s->len >> 2;                                              \
1661
5.22M
    const TXSample *fact = (void *)s->exp;                                     \
1662
5.22M
    const TXSample *tcos = fact + 8;                                           \
1663
5.22M
    const TXSample *tsin = tcos + len4;                                        \
1664
5.22M
    TXComplex *data = inv ? _src : _dst;                                       \
1665
5.22M
    TXComplex t[3];                                                            \
1666
5.22M
                                                                               \
1667
5.22M
    if (!inv)                                                                  \
1668
5.22M
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
1669
5.22M
    else                                                                       \
1670
5.22M
        data[0].im = data[len2].re;                                            \
1671
5.22M
                                                                               \
1672
5.22M
    /* The DC value's both components are real, but we need to change them     \
1673
5.22M
     * into complex values. Also, the middle of the array is special-cased.    \
1674
5.22M
     * These operations can be done before or after the loop. */               \
1675
5.22M
    t[0].re = data[0].re;                                                      \
1676
5.22M
    data[0].re = t[0].re + data[0].im;                                         \
1677
5.22M
    data[0].im = t[0].re - data[0].im;                                         \
1678
5.22M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1679
5.22M
    data[   0].im = MULT(fact[1], data[   0].im);                              \
1680
5.22M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1681
5.22M
    data[len4].im = MULT(fact[3], data[len4].im);                              \
1682
5.22M
                                                                               \
1683
341M
    for (int i = 1; i < len4; i++) {                                           \
1684
336M
        /* Separate even and odd FFTs */                                       \
1685
336M
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
1686
336M
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
1687
336M
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
1688
336M
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
1689
336M
                                                                               \
1690
336M
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
1691
336M
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
1692
336M
                                                                               \
1693
336M
        data[       i].re = t[0].re + t[2].re;                                 \
1694
336M
        data[       i].im = t[2].im - t[0].im;                                 \
1695
336M
        data[len2 - i].re = t[0].re - t[2].re;                                 \
1696
336M
        data[len2 - i].im = t[2].im + t[0].im;                                 \
1697
336M
    }                                                                          \
1698
5.22M
                                                                               \
1699
5.22M
    if (inv) {                                                                 \
1700
5.22M
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
1701
5.22M
    } else {                                                                   \
1702
0
        /* Move [0].im to the last position, as convention requires */         \
1703
0
        data[len2].re = data[0].im;                                            \
1704
0
        data[   0].im = data[len2].im = 0;                                     \
1705
0
    }                                                                          \
1706
5.22M
}                                                                              \
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2c_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_c2r_int32_c
1707
                                                                               \
1708
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
1709
    .name       = TX_NAME_STR("rdft_" #n),                                     \
1710
    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
1711
    .type       = TX_TYPE(RDFT),                                               \
1712
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
1713
                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),             \
1714
    .factors    = { 4, TX_FACTOR_ANY },                                        \
1715
    .nb_factors = 2,                                                           \
1716
    .min_len    = 4,                                                           \
1717
    .max_len    = TX_LEN_UNLIMITED,                                            \
1718
    .init       = TX_NAME(ff_tx_rdft_init),                                    \
1719
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1720
    .prio       = FF_TX_PRIO_BASE,                                             \
1721
};
1722
1723
DECL_RDFT(r2c,  0)
1724
DECL_RDFT(c2r,  1)
1725
1726
#define DECL_RDFT_HALF(n, mode, mod2)                                          \
1727
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
1728
3.23M
                                        void *_src, ptrdiff_t stride)          \
1729
3.23M
{                                                                              \
1730
3.23M
    const int len = s->len;                                                    \
1731
3.23M
    const int len2 = len >> 1;                                                 \
1732
3.23M
    const int len4 = len >> 2;                                                 \
1733
3.23M
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
1734
3.23M
    const TXSample *fact = (void *)s->exp;                                     \
1735
3.23M
    const TXSample *tcos = fact + 8;                                           \
1736
3.23M
    const TXSample *tsin = tcos + aligned_len4;                                \
1737
3.23M
    TXComplex *data = _dst;                                                    \
1738
3.23M
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
1739
3.23M
    TXSample tmp_dc;                                                           \
1740
3.23M
    av_unused TXSample tmp_mid;                                                \
1741
3.23M
    TXSample tmp[4];                                                           \
1742
3.23M
    TXComplex sf, sl;                                                          \
1743
3.23M
                                                                               \
1744
3.23M
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
1745
3.23M
                                                                               \
1746
3.23M
    tmp_dc = data[0].re;                                                       \
1747
3.23M
    data[   0].re = tmp_dc + data[0].im;                                       \
1748
3.23M
    tmp_dc        = tmp_dc - data[0].im;                                       \
1749
3.23M
                                                                               \
1750
3.23M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1751
3.23M
    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
1752
3.23M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1753
3.23M
                                                                               \
1754
3.23M
    if (!mod2) {                                                               \
1755
0
        data[len4].im = MULT(fact[3], data[len4].im);                          \
1756
3.23M
    } else {                                                                   \
1757
3.23M
        sf = data[len4];                                                       \
1758
3.23M
        sl = data[len4 + 1];                                                   \
1759
3.23M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1760
3.23M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1761
3.23M
        else                                                                   \
1762
3.23M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1763
3.23M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1764
3.23M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1765
3.23M
                                                                               \
1766
3.23M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1767
1.61M
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
1768
1.61M
            tmp_mid = (tmp[0] - tmp[3]);                                       \
1769
1.61M
        } else {                                                               \
1770
1.61M
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
1771
1.61M
            tmp_mid = (tmp[0] + tmp[3]);                                       \
1772
1.61M
        }                                                                      \
1773
3.23M
    }                                                                          \
1774
3.23M
                                                                               \
1775
3.23M
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
1776
105M
    for (int i = 1; i <= len4; i++) {                                          \
1777
101M
        TXSample tmp[4];                                                       \
1778
101M
        TXComplex sf = data[i];                                                \
1779
101M
        TXComplex sl = data[len2 - i];                                         \
1780
101M
                                                                               \
1781
101M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1782
101M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1783
101M
        else                                                                   \
1784
101M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1785
101M
                                                                               \
1786
101M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1787
101M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1788
101M
                                                                               \
1789
101M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1790
50.1M
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
1791
50.1M
            out[i]           = (tmp[0] + tmp[3]);                              \
1792
50.1M
            out[len - i]     = (tmp[0] - tmp[3]);                              \
1793
51.7M
        } else {                                                               \
1794
51.7M
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
1795
51.7M
            out[i - 1]       = (tmp[3] - tmp[0]);                              \
1796
51.7M
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
1797
51.7M
        }                                                                      \
1798
101M
    }                                                                          \
1799
3.23M
                                                                               \
1800
103M
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
1801
100M
        out[len2 - i] = out[len - i];                                          \
1802
3.23M
                                                                               \
1803
3.23M
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
1804
1.61M
        out[len2] = tmp_dc;                                                    \
1805
1.61M
        if (mod2)                                                              \
1806
1.61M
            out[len4 + 1] = tmp_mid * fact[5];                                 \
1807
1.61M
    } else if (mod2) {                                                         \
1808
1.61M
        out[len4] = tmp_mid;                                                   \
1809
1.61M
    }                                                                          \
1810
3.23M
}                                                                              \
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2r_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2r_mod2_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2i_double_c
Unexecuted instantiation: tx_double.c:ff_tx_rdft_r2i_mod2_double_c
Unexecuted instantiation: tx_float.c:ff_tx_rdft_r2r_float_c
tx_float.c:ff_tx_rdft_r2r_mod2_float_c
Line
Count
Source
1728
1.61M
                                        void *_src, ptrdiff_t stride)          \
1729
1.61M
{                                                                              \
1730
1.61M
    const int len = s->len;                                                    \
1731
1.61M
    const int len2 = len >> 1;                                                 \
1732
1.61M
    const int len4 = len >> 2;                                                 \
1733
1.61M
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
1734
1.61M
    const TXSample *fact = (void *)s->exp;                                     \
1735
1.61M
    const TXSample *tcos = fact + 8;                                           \
1736
1.61M
    const TXSample *tsin = tcos + aligned_len4;                                \
1737
1.61M
    TXComplex *data = _dst;                                                    \
1738
1.61M
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
1739
1.61M
    TXSample tmp_dc;                                                           \
1740
1.61M
    av_unused TXSample tmp_mid;                                                \
1741
1.61M
    TXSample tmp[4];                                                           \
1742
1.61M
    TXComplex sf, sl;                                                          \
1743
1.61M
                                                                               \
1744
1.61M
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
1745
1.61M
                                                                               \
1746
1.61M
    tmp_dc = data[0].re;                                                       \
1747
1.61M
    data[   0].re = tmp_dc + data[0].im;                                       \
1748
1.61M
    tmp_dc        = tmp_dc - data[0].im;                                       \
1749
1.61M
                                                                               \
1750
1.61M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1751
1.61M
    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
1752
1.61M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1753
1.61M
                                                                               \
1754
1.61M
    if (!mod2) {                                                               \
1755
0
        data[len4].im = MULT(fact[3], data[len4].im);                          \
1756
1.61M
    } else {                                                                   \
1757
1.61M
        sf = data[len4];                                                       \
1758
1.61M
        sl = data[len4 + 1];                                                   \
1759
1.61M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1760
1.61M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1761
1.61M
        else                                                                   \
1762
1.61M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1763
1.61M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1764
1.61M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1765
1.61M
                                                                               \
1766
1.61M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1767
1.61M
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
1768
1.61M
            tmp_mid = (tmp[0] - tmp[3]);                                       \
1769
1.61M
        } else {                                                               \
1770
0
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
1771
0
            tmp_mid = (tmp[0] + tmp[3]);                                       \
1772
0
        }                                                                      \
1773
1.61M
    }                                                                          \
1774
1.61M
                                                                               \
1775
1.61M
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
1776
51.7M
    for (int i = 1; i <= len4; i++) {                                          \
1777
50.1M
        TXSample tmp[4];                                                       \
1778
50.1M
        TXComplex sf = data[i];                                                \
1779
50.1M
        TXComplex sl = data[len2 - i];                                         \
1780
50.1M
                                                                               \
1781
50.1M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1782
50.1M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1783
50.1M
        else                                                                   \
1784
50.1M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1785
50.1M
                                                                               \
1786
50.1M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1787
50.1M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1788
50.1M
                                                                               \
1789
50.1M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1790
50.1M
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
1791
50.1M
            out[i]           = (tmp[0] + tmp[3]);                              \
1792
50.1M
            out[len - i]     = (tmp[0] - tmp[3]);                              \
1793
50.1M
        } else {                                                               \
1794
0
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
1795
0
            out[i - 1]       = (tmp[3] - tmp[0]);                              \
1796
0
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
1797
0
        }                                                                      \
1798
50.1M
    }                                                                          \
1799
1.61M
                                                                               \
1800
50.1M
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
1801
48.5M
        out[len2 - i] = out[len - i];                                          \
1802
1.61M
                                                                               \
1803
1.61M
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
1804
1.61M
        out[len2] = tmp_dc;                                                    \
1805
1.61M
        if (mod2)                                                              \
1806
1.61M
            out[len4 + 1] = tmp_mid * fact[5];                                 \
1807
1.61M
    } else if (mod2) {                                                         \
1808
0
        out[len4] = tmp_mid;                                                   \
1809
0
    }                                                                          \
1810
1.61M
}                                                                              \
Unexecuted instantiation: tx_float.c:ff_tx_rdft_r2i_float_c
tx_float.c:ff_tx_rdft_r2i_mod2_float_c
Line
Count
Source
1728
1.61M
                                        void *_src, ptrdiff_t stride)          \
1729
1.61M
{                                                                              \
1730
1.61M
    const int len = s->len;                                                    \
1731
1.61M
    const int len2 = len >> 1;                                                 \
1732
1.61M
    const int len4 = len >> 2;                                                 \
1733
1.61M
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
1734
1.61M
    const TXSample *fact = (void *)s->exp;                                     \
1735
1.61M
    const TXSample *tcos = fact + 8;                                           \
1736
1.61M
    const TXSample *tsin = tcos + aligned_len4;                                \
1737
1.61M
    TXComplex *data = _dst;                                                    \
1738
1.61M
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
1739
1.61M
    TXSample tmp_dc;                                                           \
1740
1.61M
    av_unused TXSample tmp_mid;                                                \
1741
1.61M
    TXSample tmp[4];                                                           \
1742
1.61M
    TXComplex sf, sl;                                                          \
1743
1.61M
                                                                               \
1744
1.61M
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
1745
1.61M
                                                                               \
1746
1.61M
    tmp_dc = data[0].re;                                                       \
1747
1.61M
    data[   0].re = tmp_dc + data[0].im;                                       \
1748
1.61M
    tmp_dc        = tmp_dc - data[0].im;                                       \
1749
1.61M
                                                                               \
1750
1.61M
    data[   0].re = MULT(fact[0], data[   0].re);                              \
1751
1.61M
    tmp_dc        = MULT(fact[1],        tmp_dc);                              \
1752
1.61M
    data[len4].re = MULT(fact[2], data[len4].re);                              \
1753
1.61M
                                                                               \
1754
1.61M
    if (!mod2) {                                                               \
1755
0
        data[len4].im = MULT(fact[3], data[len4].im);                          \
1756
1.61M
    } else {                                                                   \
1757
1.61M
        sf = data[len4];                                                       \
1758
1.61M
        sl = data[len4 + 1];                                                   \
1759
1.61M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1760
1.61M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1761
1.61M
        else                                                                   \
1762
1.61M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1763
1.61M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1764
1.61M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1765
1.61M
                                                                               \
1766
1.61M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1767
0
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                   \
1768
0
            tmp_mid = (tmp[0] - tmp[3]);                                       \
1769
1.61M
        } else {                                                               \
1770
1.61M
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                   \
1771
1.61M
            tmp_mid = (tmp[0] + tmp[3]);                                       \
1772
1.61M
        }                                                                      \
1773
1.61M
    }                                                                          \
1774
1.61M
                                                                               \
1775
1.61M
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
1776
53.4M
    for (int i = 1; i <= len4; i++) {                                          \
1777
51.7M
        TXSample tmp[4];                                                       \
1778
51.7M
        TXComplex sf = data[i];                                                \
1779
51.7M
        TXComplex sl = data[len2 - i];                                         \
1780
51.7M
                                                                               \
1781
51.7M
        if (mode == AV_TX_REAL_TO_REAL)                                        \
1782
51.7M
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
1783
51.7M
        else                                                                   \
1784
51.7M
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
1785
51.7M
                                                                               \
1786
51.7M
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
1787
51.7M
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
1788
51.7M
                                                                               \
1789
51.7M
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
1790
0
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];                \
1791
0
            out[i]           = (tmp[0] + tmp[3]);                              \
1792
0
            out[len - i]     = (tmp[0] - tmp[3]);                              \
1793
51.7M
        } else {                                                               \
1794
51.7M
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];                \
1795
51.7M
            out[i - 1]       = (tmp[3] - tmp[0]);                              \
1796
51.7M
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
1797
51.7M
        }                                                                      \
1798
51.7M
    }                                                                          \
1799
1.61M
                                                                               \
1800
53.4M
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
1801
51.7M
        out[len2 - i] = out[len - i];                                          \
1802
1.61M
                                                                               \
1803
1.61M
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
1804
0
        out[len2] = tmp_dc;                                                    \
1805
0
        if (mod2)                                                              \
1806
0
            out[len4 + 1] = tmp_mid * fact[5];                                 \
1807
1.61M
    } else if (mod2) {                                                         \
1808
1.61M
        out[len4] = tmp_mid;                                                   \
1809
1.61M
    }                                                                          \
1810
1.61M
}                                                                              \
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2r_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2r_mod2_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2i_int32_c
Unexecuted instantiation: tx_int32.c:ff_tx_rdft_r2i_mod2_int32_c
1811
                                                                               \
1812
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
1813
    .name       = TX_NAME_STR("rdft_" #n),                                     \
1814
    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
1815
    .type       = TX_TYPE(RDFT),                                               \
1816
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |                     \
1817
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,                     \
1818
    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY },                            \
1819
    .nb_factors = 2,                                                           \
1820
    .min_len    = 2 + 2*(!mod2),                                               \
1821
    .max_len    = TX_LEN_UNLIMITED,                                            \
1822
    .init       = TX_NAME(ff_tx_rdft_init),                                    \
1823
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
1824
    .prio       = FF_TX_PRIO_BASE,                                             \
1825
};
1826
1827
DECL_RDFT_HALF(r2r,      AV_TX_REAL_TO_REAL,      0)
1828
DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL,      1)
1829
DECL_RDFT_HALF(r2i,      AV_TX_REAL_TO_IMAGINARY, 0)
1830
DECL_RDFT_HALF(r2i_mod2, AV_TX_REAL_TO_IMAGINARY, 1)
1831
1832
static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s,
1833
                                           const FFTXCodelet *cd,
1834
                                           uint64_t flags,
1835
                                           FFTXCodeletOptions *opts,
1836
                                           int len, int inv,
1837
                                           const void *scale)
1838
792
{
1839
792
    int ret;
1840
792
    double freq;
1841
792
    TXSample *tab;
1842
792
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1843
1844
792
    if (inv) {
1845
792
        len *= 2;
1846
792
        s->len *= 2;
1847
792
        rsc *= 0.5;
1848
792
    }
1849
1850
792
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1851
0
        return ret;
1852
1853
792
    s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1854
792
    if (!s->exp)
1855
0
        return AVERROR(ENOMEM);
1856
1857
792
    tab = (TXSample *)s->exp;
1858
1859
792
    freq = M_PI/(len*2);
1860
1861
1.03M
    for (int i = 0; i < len; i++)
1862
1.03M
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
1863
1864
792
    if (inv) {
1865
516k
        for (int i = 0; i < len/2; i++)
1866
515k
            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1867
792
    } else {
1868
0
        for (int i = 0; i < len/2; i++)
1869
0
            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1870
0
    }
1871
1872
792
    return 0;
1873
792
}
Unexecuted instantiation: tx_double.c:ff_tx_dct_init_double_c
tx_float.c:ff_tx_dct_init_float_c
Line
Count
Source
1838
792
{
1839
792
    int ret;
1840
792
    double freq;
1841
792
    TXSample *tab;
1842
792
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1843
1844
792
    if (inv) {
1845
792
        len *= 2;
1846
792
        s->len *= 2;
1847
792
        rsc *= 0.5;
1848
792
    }
1849
1850
792
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1851
0
        return ret;
1852
1853
792
    s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1854
792
    if (!s->exp)
1855
0
        return AVERROR(ENOMEM);
1856
1857
792
    tab = (TXSample *)s->exp;
1858
1859
792
    freq = M_PI/(len*2);
1860
1861
1.03M
    for (int i = 0; i < len; i++)
1862
1.03M
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
1863
1864
792
    if (inv) {
1865
516k
        for (int i = 0; i < len/2; i++)
1866
515k
            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1867
792
    } else {
1868
0
        for (int i = 0; i < len/2; i++)
1869
0
            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1870
0
    }
1871
1872
792
    return 0;
1873
792
}
Unexecuted instantiation: tx_int32.c:ff_tx_dct_init_int32_c
1874
1875
static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
1876
                                 void *_src, ptrdiff_t stride)
1877
0
{
1878
0
    TXSample *dst = _dst;
1879
0
    TXSample *src = _src;
1880
0
    const int len = s->len;
1881
0
    const int len2 = len >> 1;
1882
0
    const TXSample *exp = (void *)s->exp;
1883
0
    TXSample next;
1884
#ifdef TX_INT32
1885
    int64_t tmp1, tmp2;
1886
#else
1887
    TXSample tmp1, tmp2;
1888
#endif
1889
1890
0
    for (int i = 0; i < len2; i++) {
1891
0
        TXSample in1 = src[i];
1892
0
        TXSample in2 = src[len - i - 1];
1893
0
        TXSample s    = exp[len + i];
1894
1895
#ifdef TX_INT32
1896
        tmp1 = in1 + in2;
1897
        tmp2 = in1 - in2;
1898
1899
        tmp1 >>= 1;
1900
        tmp2 *= s;
1901
1902
        tmp2 = (tmp2 + 0x40000000) >> 31;
1903
#else
1904
        tmp1 = (in1 + in2)*0.5;
1905
        tmp2 = (in1 - in2)*s;
1906
#endif
1907
1908
0
        src[i]           = tmp1 + tmp2;
1909
0
        src[len - i - 1] = tmp1 - tmp2;
1910
0
    }
1911
1912
0
    s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));
1913
1914
0
    next = dst[len];
1915
1916
0
    for (int i = len - 2; i > 0; i -= 2) {
1917
0
        TXSample tmp;
1918
1919
0
        CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);
1920
1921
0
        dst[i + 1] = next;
1922
1923
0
        next += tmp;
1924
0
    }
1925
1926
#ifdef TX_INT32
1927
    tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
1928
    dst[0] = (tmp1 + 0x40000000) >> 31;
1929
#else
1930
    dst[0] = exp[0] * dst[0];
1931
#endif
1932
0
    dst[1] = next;
1933
0
}
Unexecuted instantiation: tx_double.c:ff_tx_dctII_double_c
Unexecuted instantiation: tx_float.c:ff_tx_dctII_float_c
Unexecuted instantiation: tx_int32.c:ff_tx_dctII_int32_c
1934
1935
static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
1936
                                  void *_src, ptrdiff_t stride)
1937
376k
{
1938
376k
    TXSample *dst = _dst;
1939
376k
    TXSample *src = _src;
1940
376k
    const int len = s->len;
1941
376k
    const int len2 = len >> 1;
1942
376k
    const TXSample *exp = (void *)s->exp;
1943
#ifdef TX_INT32
1944
    int64_t  tmp1, tmp2 = src[len - 1];
1945
    tmp2 = (2*tmp2 + 0x40000000) >> 31;
1946
#else
1947
    TXSample tmp1, tmp2 = 2*src[len - 1];
1948
#endif
1949
1950
376k
    src[len] = tmp2;
1951
1952
146M
    for (int i = len - 2; i >= 2; i -= 2) {
1953
146M
        TXSample val1 = src[i - 0];
1954
146M
        TXSample val2 = src[i - 1] - src[i + 1];
1955
1956
146M
        CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1957
146M
    }
1958
1959
376k
    s->fn[0](&s->sub[0], dst, src, sizeof(float));
1960
1961
147M
    for (int i = 0; i < len2; i++) {
1962
146M
        TXSample in1 = dst[i];
1963
146M
        TXSample in2 = dst[len - i - 1];
1964
146M
        TXSample c   = exp[len + i];
1965
1966
146M
        tmp1 = in1 + in2;
1967
146M
        tmp2 = in1 - in2;
1968
146M
        tmp2 *= c;
1969
#ifdef TX_INT32
1970
        tmp2 = (tmp2 + 0x40000000) >> 31;
1971
#endif
1972
1973
146M
        dst[i]            = tmp1 + tmp2;
1974
146M
        dst[len - i - 1]  = tmp1 - tmp2;
1975
146M
    }
1976
376k
}
Unexecuted instantiation: tx_double.c:ff_tx_dctIII_double_c
tx_float.c:ff_tx_dctIII_float_c
Line
Count
Source
1937
376k
{
1938
376k
    TXSample *dst = _dst;
1939
376k
    TXSample *src = _src;
1940
376k
    const int len = s->len;
1941
376k
    const int len2 = len >> 1;
1942
376k
    const TXSample *exp = (void *)s->exp;
1943
#ifdef TX_INT32
1944
    int64_t  tmp1, tmp2 = src[len - 1];
1945
    tmp2 = (2*tmp2 + 0x40000000) >> 31;
1946
#else
1947
376k
    TXSample tmp1, tmp2 = 2*src[len - 1];
1948
376k
#endif
1949
1950
376k
    src[len] = tmp2;
1951
1952
146M
    for (int i = len - 2; i >= 2; i -= 2) {
1953
146M
        TXSample val1 = src[i - 0];
1954
146M
        TXSample val2 = src[i - 1] - src[i + 1];
1955
1956
146M
        CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1957
146M
    }
1958
1959
376k
    s->fn[0](&s->sub[0], dst, src, sizeof(float));
1960
1961
147M
    for (int i = 0; i < len2; i++) {
1962
146M
        TXSample in1 = dst[i];
1963
146M
        TXSample in2 = dst[len - i - 1];
1964
146M
        TXSample c   = exp[len + i];
1965
1966
146M
        tmp1 = in1 + in2;
1967
146M
        tmp2 = in1 - in2;
1968
146M
        tmp2 *= c;
1969
#ifdef TX_INT32
1970
        tmp2 = (tmp2 + 0x40000000) >> 31;
1971
#endif
1972
1973
146M
        dst[i]            = tmp1 + tmp2;
1974
146M
        dst[len - i - 1]  = tmp1 - tmp2;
1975
146M
    }
1976
376k
}
Unexecuted instantiation: tx_int32.c:ff_tx_dctIII_int32_c
1977
1978
static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
1979
    .name       = TX_NAME_STR("dctII"),
1980
    .function   = TX_NAME(ff_tx_dctII),
1981
    .type       = TX_TYPE(DCT),
1982
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
1983
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,
1984
    .factors    = { 2, TX_FACTOR_ANY },
1985
    .min_len    = 2,
1986
    .max_len    = TX_LEN_UNLIMITED,
1987
    .init       = TX_NAME(ff_tx_dct_init),
1988
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
1989
    .prio       = FF_TX_PRIO_BASE,
1990
};
1991
1992
static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
1993
    .name       = TX_NAME_STR("dctIII"),
1994
    .function   = TX_NAME(ff_tx_dctIII),
1995
    .type       = TX_TYPE(DCT),
1996
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
1997
                  FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,
1998
    .factors    = { 2, TX_FACTOR_ANY },
1999
    .min_len    = 2,
2000
    .max_len    = TX_LEN_UNLIMITED,
2001
    .init       = TX_NAME(ff_tx_dct_init),
2002
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
2003
    .prio       = FF_TX_PRIO_BASE,
2004
};
2005
2006
static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s,
2007
                                             const FFTXCodelet *cd,
2008
                                             uint64_t flags,
2009
                                             FFTXCodeletOptions *opts,
2010
                                             int len, int inv,
2011
                                             const void *scale)
2012
1.56k
{
2013
1.56k
    int ret;
2014
1.56k
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
2015
2016
1.56k
    if (inv) {
2017
0
        len *= 2;
2018
0
        s->len *= 2;
2019
0
        rsc *= 0.5;
2020
0
    }
2021
2022
    /* We want a half-complex RDFT */
2023
1.56k
    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
2024
1.56k
                                          AV_TX_REAL_TO_IMAGINARY;
2025
2026
1.56k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
2027
1.56k
                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
2028
1.56k
                                0, &rsc)))
2029
0
        return ret;
2030
2031
1.56k
    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
2032
1.56k
    if (!s->tmp)
2033
0
        return AVERROR(ENOMEM);
2034
2035
1.56k
    return 0;
2036
1.56k
}
Unexecuted instantiation: tx_double.c:ff_tx_dcstI_init_double_c
tx_float.c:ff_tx_dcstI_init_float_c
Line
Count
Source
2012
1.56k
{
2013
1.56k
    int ret;
2014
1.56k
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
2015
2016
1.56k
    if (inv) {
2017
0
        len *= 2;
2018
0
        s->len *= 2;
2019
0
        rsc *= 0.5;
2020
0
    }
2021
2022
    /* We want a half-complex RDFT */
2023
1.56k
    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
2024
1.56k
                                          AV_TX_REAL_TO_IMAGINARY;
2025
2026
1.56k
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
2027
1.56k
                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
2028
1.56k
                                0, &rsc)))
2029
0
        return ret;
2030
2031
1.56k
    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
2032
1.56k
    if (!s->tmp)
2033
0
        return AVERROR(ENOMEM);
2034
2035
1.56k
    return 0;
2036
1.56k
}
Unexecuted instantiation: tx_int32.c:ff_tx_dcstI_init_int32_c
2037
2038
static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
2039
                                void *_src, ptrdiff_t stride)
2040
1.61M
{
2041
1.61M
    TXSample *dst = _dst;
2042
1.61M
    TXSample *src = _src;
2043
1.61M
    const int len = s->len - 1;
2044
1.61M
    TXSample *tmp = (TXSample *)s->tmp;
2045
2046
1.61M
    stride /= sizeof(TXSample);
2047
2048
103M
    for (int i = 0; i < len; i++)
2049
101M
        tmp[i] = tmp[2*len - i] = src[i * stride];
2050
2051
1.61M
    tmp[len] = src[len * stride]; /* Middle */
2052
2053
1.61M
    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2054
1.61M
}
Unexecuted instantiation: tx_double.c:ff_tx_dctI_double_c
tx_float.c:ff_tx_dctI_float_c
Line
Count
Source
2040
1.61M
{
2041
1.61M
    TXSample *dst = _dst;
2042
1.61M
    TXSample *src = _src;
2043
1.61M
    const int len = s->len - 1;
2044
1.61M
    TXSample *tmp = (TXSample *)s->tmp;
2045
2046
1.61M
    stride /= sizeof(TXSample);
2047
2048
103M
    for (int i = 0; i < len; i++)
2049
101M
        tmp[i] = tmp[2*len - i] = src[i * stride];
2050
2051
1.61M
    tmp[len] = src[len * stride]; /* Middle */
2052
2053
1.61M
    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2054
1.61M
}
Unexecuted instantiation: tx_int32.c:ff_tx_dctI_int32_c
2055
2056
static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
2057
                                void *_src, ptrdiff_t stride)
2058
1.61M
{
2059
1.61M
    TXSample *dst = _dst;
2060
1.61M
    TXSample *src = _src;
2061
1.61M
    const int len = s->len + 1;
2062
1.61M
    TXSample *tmp = (void *)s->tmp;
2063
2064
1.61M
    stride /= sizeof(TXSample);
2065
2066
1.61M
    tmp[0] = 0;
2067
2068
105M
    for (int i = 1; i < len; i++) {
2069
103M
        TXSample a = src[(i - 1) * stride];
2070
103M
        tmp[i] = -a;
2071
103M
        tmp[2*len - i] = a;
2072
103M
    }
2073
2074
1.61M
    tmp[len] = 0; /* i == n, Nyquist */
2075
2076
1.61M
    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2077
1.61M
}
Unexecuted instantiation: tx_double.c:ff_tx_dstI_double_c
tx_float.c:ff_tx_dstI_float_c
Line
Count
Source
2058
1.61M
{
2059
1.61M
    TXSample *dst = _dst;
2060
1.61M
    TXSample *src = _src;
2061
1.61M
    const int len = s->len + 1;
2062
1.61M
    TXSample *tmp = (void *)s->tmp;
2063
2064
1.61M
    stride /= sizeof(TXSample);
2065
2066
1.61M
    tmp[0] = 0;
2067
2068
105M
    for (int i = 1; i < len; i++) {
2069
103M
        TXSample a = src[(i - 1) * stride];
2070
103M
        tmp[i] = -a;
2071
103M
        tmp[2*len - i] = a;
2072
103M
    }
2073
2074
1.61M
    tmp[len] = 0; /* i == n, Nyquist */
2075
2076
1.61M
    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2077
1.61M
}
Unexecuted instantiation: tx_int32.c:ff_tx_dstI_int32_c
2078
2079
static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
2080
    .name       = TX_NAME_STR("dctI"),
2081
    .function   = TX_NAME(ff_tx_dctI),
2082
    .type       = TX_TYPE(DCT_I),
2083
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
2084
    .factors    = { 2, TX_FACTOR_ANY },
2085
    .nb_factors = 2,
2086
    .min_len    = 2,
2087
    .max_len    = TX_LEN_UNLIMITED,
2088
    .init       = TX_NAME(ff_tx_dcstI_init),
2089
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
2090
    .prio       = FF_TX_PRIO_BASE,
2091
};
2092
2093
static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
2094
    .name       = TX_NAME_STR("dstI"),
2095
    .function   = TX_NAME(ff_tx_dstI),
2096
    .type       = TX_TYPE(DST_I),
2097
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE,
2098
    .factors    = { 2, TX_FACTOR_ANY },
2099
    .nb_factors = 2,
2100
    .min_len    = 2,
2101
    .max_len    = TX_LEN_UNLIMITED,
2102
    .init       = TX_NAME(ff_tx_dcstI_init),
2103
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,
2104
    .prio       = FF_TX_PRIO_BASE,
2105
};
2106
2107
int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
2108
1.07M
{
2109
1.07M
    int off = 0;
2110
1.07M
    int len4 = s->len >> 1;
2111
1.07M
    double scale = s->scale_d;
2112
1.07M
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2113
1.07M
    size_t alloc = pre_tab ? 2*len4 : len4;
2114
2115
1.07M
    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2116
0
        return AVERROR(ENOMEM);
2117
2118
1.07M
    scale = sqrt(fabs(scale));
2119
2120
1.07M
    if (pre_tab)
2121
1.05M
        off = len4;
2122
2123
232M
    for (int i = 0; i < len4; i++) {
2124
230M
        const double alpha = M_PI_2 * (i + theta) / len4;
2125
230M
        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2126
230M
                                       RESCALE(sin(alpha) * scale) };
2127
230M
    }
2128
2129
1.07M
    if (pre_tab)
2130
218M
        for (int i = 0; i < len4; i++)
2131
217M
            s->exp[i] = s->exp[len4 + pre_tab[i]];
2132
2133
1.07M
    return 0;
2134
1.07M
}
Unexecuted instantiation: ff_tx_mdct_gen_exp_double
ff_tx_mdct_gen_exp_float
Line
Count
Source
2108
950k
{
2109
950k
    int off = 0;
2110
950k
    int len4 = s->len >> 1;
2111
950k
    double scale = s->scale_d;
2112
950k
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2113
950k
    size_t alloc = pre_tab ? 2*len4 : len4;
2114
2115
950k
    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2116
0
        return AVERROR(ENOMEM);
2117
2118
950k
    scale = sqrt(fabs(scale));
2119
2120
950k
    if (pre_tab)
2121
928k
        off = len4;
2122
2123
214M
    for (int i = 0; i < len4; i++) {
2124
213M
        const double alpha = M_PI_2 * (i + theta) / len4;
2125
213M
        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2126
213M
                                       RESCALE(sin(alpha) * scale) };
2127
213M
    }
2128
2129
950k
    if (pre_tab)
2130
203M
        for (int i = 0; i < len4; i++)
2131
202M
            s->exp[i] = s->exp[len4 + pre_tab[i]];
2132
2133
950k
    return 0;
2134
950k
}
ff_tx_mdct_gen_exp_int32
Line
Count
Source
2108
129k
{
2109
129k
    int off = 0;
2110
129k
    int len4 = s->len >> 1;
2111
129k
    double scale = s->scale_d;
2112
129k
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2113
129k
    size_t alloc = pre_tab ? 2*len4 : len4;
2114
2115
129k
    if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2116
0
        return AVERROR(ENOMEM);
2117
2118
129k
    scale = sqrt(fabs(scale));
2119
2120
129k
    if (pre_tab)
2121
124k
        off = len4;
2122
2123
17.3M
    for (int i = 0; i < len4; i++) {
2124
17.2M
        const double alpha = M_PI_2 * (i + theta) / len4;
2125
17.2M
        s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2126
17.2M
                                       RESCALE(sin(alpha) * scale) };
2127
17.2M
    }
2128
2129
129k
    if (pre_tab)
2130
14.5M
        for (int i = 0; i < len4; i++)
2131
14.4M
            s->exp[i] = s->exp[len4 + pre_tab[i]];
2132
2133
129k
    return 0;
2134
129k
}
2135
2136
const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
2137
    /* Split-Radix codelets */
2138
    &TX_NAME(ff_tx_fft2_ns_def),
2139
    &TX_NAME(ff_tx_fft4_ns_def),
2140
    &TX_NAME(ff_tx_fft8_ns_def),
2141
    &TX_NAME(ff_tx_fft16_ns_def),
2142
    &TX_NAME(ff_tx_fft32_ns_def),
2143
    &TX_NAME(ff_tx_fft64_ns_def),
2144
    &TX_NAME(ff_tx_fft128_ns_def),
2145
    &TX_NAME(ff_tx_fft256_ns_def),
2146
    &TX_NAME(ff_tx_fft512_ns_def),
2147
    &TX_NAME(ff_tx_fft1024_ns_def),
2148
    &TX_NAME(ff_tx_fft2048_ns_def),
2149
    &TX_NAME(ff_tx_fft4096_ns_def),
2150
    &TX_NAME(ff_tx_fft8192_ns_def),
2151
    &TX_NAME(ff_tx_fft16384_ns_def),
2152
    &TX_NAME(ff_tx_fft32768_ns_def),
2153
    &TX_NAME(ff_tx_fft65536_ns_def),
2154
    &TX_NAME(ff_tx_fft131072_ns_def),
2155
2156
    /* Prime factor codelets */
2157
    &TX_NAME(ff_tx_fft3_ns_def),
2158
    &TX_NAME(ff_tx_fft5_ns_def),
2159
    &TX_NAME(ff_tx_fft7_ns_def),
2160
    &TX_NAME(ff_tx_fft9_ns_def),
2161
    &TX_NAME(ff_tx_fft15_ns_def),
2162
2163
    /* We get these for free */
2164
    &TX_NAME(ff_tx_fft3_fwd_def),
2165
    &TX_NAME(ff_tx_fft5_fwd_def),
2166
    &TX_NAME(ff_tx_fft7_fwd_def),
2167
    &TX_NAME(ff_tx_fft9_fwd_def),
2168
2169
    /* Standalone transforms */
2170
    &TX_NAME(ff_tx_fft_def),
2171
    &TX_NAME(ff_tx_fft_inplace_def),
2172
    &TX_NAME(ff_tx_fft_inplace_small_def),
2173
    &TX_NAME(ff_tx_fft_pfa_def),
2174
    &TX_NAME(ff_tx_fft_pfa_ns_def),
2175
    &TX_NAME(ff_tx_fft_naive_def),
2176
    &TX_NAME(ff_tx_fft_naive_small_def),
2177
    &TX_NAME(ff_tx_mdct_fwd_def),
2178
    &TX_NAME(ff_tx_mdct_inv_def),
2179
    &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
2180
    &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
2181
    &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
2182
    &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
2183
    &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
2184
    &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
2185
    &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
2186
    &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
2187
    &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
2188
    &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
2189
    &TX_NAME(ff_tx_mdct_naive_fwd_def),
2190
    &TX_NAME(ff_tx_mdct_naive_inv_def),
2191
    &TX_NAME(ff_tx_mdct_inv_full_def),
2192
    &TX_NAME(ff_tx_rdft_r2c_def),
2193
    &TX_NAME(ff_tx_rdft_r2r_def),
2194
    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
2195
    &TX_NAME(ff_tx_rdft_r2i_def),
2196
    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
2197
    &TX_NAME(ff_tx_rdft_c2r_def),
2198
    &TX_NAME(ff_tx_dctII_def),
2199
    &TX_NAME(ff_tx_dctIII_def),
2200
    &TX_NAME(ff_tx_dctI_def),
2201
    &TX_NAME(ff_tx_dstI_def),
2202
2203
    NULL,
2204
};