Coverage Report

Created: 2024-07-27 06:04

/src/openexr/src/lib/OpenEXR/ImfDwaCompressorSimd.h
Line
Count
Source
1
//
2
// SPDX-License-Identifier: BSD-3-Clause
3
// Copyright (c) Contributors to the OpenEXR Project.
4
//
5
6
#ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
7
#define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
8
9
//
10
// Various SSE accelerated functions, used by Imf::DwaCompressor.
11
// These have been split into a separate .h file, as the fast
12
// paths are done with template specialization.
13
//
14
// Unless otherwise noted, all pointers are assumed to be 32-byte
15
// aligned. Unaligned pointers may risk seg-faulting.
16
//
17
18
#include "ImfNamespace.h"
19
#include "ImfSimd.h"
20
#include "ImfSystemSpecific.h"
21
#include "OpenEXRConfig.h"
22
#include "OpenEXRConfigInternal.h"
23
24
#include <assert.h>
25
#include <half.h>
26
27
#include <algorithm>
28
29
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
30
31
158k
#define _SSE_ALIGNMENT 32
32
316k
#define _SSE_ALIGNMENT_MASK 0x0F
33
#define _AVX_ALIGNMENT_MASK 0x1F
34
35
//
36
// A simple 64-element array, aligned properly for SIMD access.
37
//
38
39
template <class T> class SimdAlignedBuffer64
40
{
41
public:
42
8.82k
    SimdAlignedBuffer64 () : _buffer (0), _handle (0) { alloc (); }
Imf_3_3::SimdAlignedBuffer64<unsigned short>::SimdAlignedBuffer64()
Line
Count
Source
42
4.41k
    SimdAlignedBuffer64 () : _buffer (0), _handle (0) { alloc (); }
Imf_3_3::SimdAlignedBuffer64<float>::SimdAlignedBuffer64()
Line
Count
Source
42
4.41k
    SimdAlignedBuffer64 () : _buffer (0), _handle (0) { alloc (); }
43
44
    SimdAlignedBuffer64 (const SimdAlignedBuffer64& rhs)
45
        : _buffer (0), _handle (0)
46
    {
47
        alloc ();
48
        memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
49
    }
50
51
    SimdAlignedBuffer64& operator= (const SimdAlignedBuffer64& rhs)
52
    {
53
        memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
54
        return *this;
55
    }
56
57
#if __cplusplus >= 201103L
58
    SimdAlignedBuffer64 (SimdAlignedBuffer64&& rhs) noexcept
59
        : _buffer (rhs._buffer), _handle (rhs._handle)
60
0
    {
61
0
        rhs._handle = nullptr;
62
0
        rhs._buffer = nullptr;
63
0
    }
64
65
    SimdAlignedBuffer64& operator= (SimdAlignedBuffer64&& rhs) noexcept
66
    {
67
        std::swap (_handle, rhs._handle);
68
        std::swap (_buffer, rhs._buffer);
69
        return *this;
70
    }
71
#endif
72
    ~SimdAlignedBuffer64 ()
73
8.82k
    {
74
8.82k
        if (_handle) EXRFreeAligned (_handle);
75
8.82k
        _handle = 0;
76
8.82k
        _buffer = 0;
77
8.82k
    }
Imf_3_3::SimdAlignedBuffer64<float>::~SimdAlignedBuffer64()
Line
Count
Source
73
4.41k
    {
74
4.41k
        if (_handle) EXRFreeAligned (_handle);
75
4.41k
        _handle = 0;
76
4.41k
        _buffer = 0;
77
4.41k
    }
Imf_3_3::SimdAlignedBuffer64<unsigned short>::~SimdAlignedBuffer64()
Line
Count
Source
73
4.41k
    {
74
4.41k
        if (_handle) EXRFreeAligned (_handle);
75
4.41k
        _handle = 0;
76
4.41k
        _buffer = 0;
77
4.41k
    }
78
79
    void alloc ()
80
8.82k
    {
81
        //
82
        // Try EXRAllocAligned first - but it might fallback to
83
        // unaligned allocs. If so, overalloc.
84
        //
85
86
8.82k
        _handle = (char*) EXRAllocAligned (64 * sizeof (T), _SSE_ALIGNMENT);
87
88
8.82k
        if (((size_t) _handle & (_SSE_ALIGNMENT - 1)) == 0)
89
8.82k
        {
90
8.82k
            _buffer = (T*) _handle;
91
8.82k
            return;
92
8.82k
        }
93
94
0
        EXRFreeAligned (_handle);
95
0
        _handle = (char*) EXRAllocAligned (
96
0
            64 * sizeof (T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
97
98
0
        char* aligned = _handle;
99
100
0
        while ((size_t) aligned & (_SSE_ALIGNMENT - 1))
101
0
            aligned++;
102
103
0
        _buffer = (T*) aligned;
104
0
    }
Imf_3_3::SimdAlignedBuffer64<unsigned short>::alloc()
Line
Count
Source
80
4.41k
    {
81
        //
82
        // Try EXRAllocAligned first - but it might fallback to
83
        // unaligned allocs. If so, overalloc.
84
        //
85
86
4.41k
        _handle = (char*) EXRAllocAligned (64 * sizeof (T), _SSE_ALIGNMENT);
87
88
4.41k
        if (((size_t) _handle & (_SSE_ALIGNMENT - 1)) == 0)
89
4.41k
        {
90
4.41k
            _buffer = (T*) _handle;
91
4.41k
            return;
92
4.41k
        }
93
94
0
        EXRFreeAligned (_handle);
95
0
        _handle = (char*) EXRAllocAligned (
96
0
            64 * sizeof (T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
97
98
0
        char* aligned = _handle;
99
100
0
        while ((size_t) aligned & (_SSE_ALIGNMENT - 1))
101
0
            aligned++;
102
103
0
        _buffer = (T*) aligned;
104
0
    }
Imf_3_3::SimdAlignedBuffer64<float>::alloc()
Line
Count
Source
80
4.41k
    {
81
        //
82
        // Try EXRAllocAligned first - but it might fallback to
83
        // unaligned allocs. If so, overalloc.
84
        //
85
86
4.41k
        _handle = (char*) EXRAllocAligned (64 * sizeof (T), _SSE_ALIGNMENT);
87
88
4.41k
        if (((size_t) _handle & (_SSE_ALIGNMENT - 1)) == 0)
89
4.41k
        {
90
4.41k
            _buffer = (T*) _handle;
91
4.41k
            return;
92
4.41k
        }
93
94
0
        EXRFreeAligned (_handle);
95
0
        _handle = (char*) EXRAllocAligned (
96
0
            64 * sizeof (T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
97
98
0
        char* aligned = _handle;
99
100
0
        while ((size_t) aligned & (_SSE_ALIGNMENT - 1))
101
0
            aligned++;
102
103
0
        _buffer = (T*) aligned;
104
0
    }
105
106
    T* _buffer;
107
108
private:
109
    char* _handle;
110
};
111
112
typedef SimdAlignedBuffer64<float>          SimdAlignedBuffer64f;
113
typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us;
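A minimal usage sketch (illustrative only, not part of the header): _buffer always points at 64 elements in storage aligned to _SSE_ALIGNMENT, so an instance can be handed straight to the aligned SIMD routines below.

static void
simdAlignedBuffer64Example ()
{
    SimdAlignedBuffer64f scratch; // 64 floats in 32-byte aligned storage

    // alloc() above guarantees the alignment the SIMD paths assume.
    assert (((size_t) scratch._buffer & (_SSE_ALIGNMENT - 1)) == 0);

    for (int i = 0; i < 64; ++i)  // use it like any flat 64-element array
        scratch._buffer[i] = 0.f;
}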
114
115
namespace
116
{
117
118
//
119
// Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B'
120
//
121
122
void
123
csc709Inverse (float& comp0, float& comp1, float& comp2)
124
262
{
125
262
    float src[3];
126
127
262
    src[0] = comp0;
128
262
    src[1] = comp1;
129
262
    src[2] = comp2;
130
131
262
    comp0 = src[0] + 1.5747f * src[2];
132
262
    comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2];
133
262
    comp2 = src[0] + 1.8556f * src[1];
134
262
}
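In matrix form, the conversion above (with comp0 = Y', comp1 = Cb, comp2 = Cr on input and comp0 = R', comp1 = G', comp2 = B' on output) is:

\begin{bmatrix} R' \\ G' \\ B' \end{bmatrix} =
\begin{bmatrix}
1 & 0       & 1.5747  \\
1 & -0.1873 & -0.4682 \\
1 & 1.8556  & 0
\end{bmatrix}
\begin{bmatrix} Y' \\ C_B \\ C_R \end{bmatrix}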
135
136
#ifndef IMF_HAVE_SSE2
137
138
//
139
// Scalar color space conversion, based on 709 primary chromaticities.
140
// No scaling or offsets, just the matrix
141
//
142
143
void
144
csc709Inverse64 (float* comp0, float* comp1, float* comp2)
145
{
146
    for (int i = 0; i < 64; ++i)
147
        csc709Inverse (comp0[i], comp1[i], comp2[i]);
148
}
149
150
#else /* IMF_HAVE_SSE2 */
151
152
//
153
// SSE2 color space conversion
154
//
155
156
void
157
csc709Inverse64 (float* comp0, float* comp1, float* comp2)
158
17.5k
{
159
17.5k
    __m128 c0 = {1.5747f, 1.5747f, 1.5747f, 1.5747f};
160
17.5k
    __m128 c1 = {1.8556f, 1.8556f, 1.8556f, 1.8556f};
161
17.5k
    __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f};
162
17.5k
    __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f};
163
164
17.5k
    __m128* r = (__m128*) comp0;
165
17.5k
    __m128* g = (__m128*) comp1;
166
17.5k
    __m128* b = (__m128*) comp2;
167
17.5k
    __m128  src[3];
168
169
17.5k
#    define CSC_INVERSE_709_SSE2_LOOP(i)                                       \
170
281k
        src[0] = r[i];                                                         \
171
281k
        src[1] = g[i];                                                         \
172
281k
        src[2] = b[i];                                                         \
173
281k
                                                                               \
174
281k
        r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0));                     \
175
281k
                                                                               \
176
281k
        g[i]   = _mm_mul_ps (g[i], c2);                                        \
177
281k
        src[2] = _mm_mul_ps (src[2], c3);                                      \
178
281k
        g[i]   = _mm_add_ps (g[i], src[0]);                                    \
179
281k
        g[i]   = _mm_add_ps (g[i], src[2]);                                    \
180
281k
                                                                               \
181
281k
        b[i] = _mm_mul_ps (c1, src[1]);                                        \
182
281k
        b[i] = _mm_add_ps (b[i], src[0]);
183
184
17.5k
    CSC_INVERSE_709_SSE2_LOOP (0)
185
17.5k
    CSC_INVERSE_709_SSE2_LOOP (1)
186
17.5k
    CSC_INVERSE_709_SSE2_LOOP (2)
187
17.5k
    CSC_INVERSE_709_SSE2_LOOP (3)
188
189
17.5k
    CSC_INVERSE_709_SSE2_LOOP (4)
190
17.5k
    CSC_INVERSE_709_SSE2_LOOP (5)
191
17.5k
    CSC_INVERSE_709_SSE2_LOOP (6)
192
17.5k
    CSC_INVERSE_709_SSE2_LOOP (7)
193
194
17.5k
    CSC_INVERSE_709_SSE2_LOOP (8)
195
17.5k
    CSC_INVERSE_709_SSE2_LOOP (9)
196
17.5k
    CSC_INVERSE_709_SSE2_LOOP (10)
197
17.5k
    CSC_INVERSE_709_SSE2_LOOP (11)
198
199
17.5k
    CSC_INVERSE_709_SSE2_LOOP (12)
200
17.5k
    CSC_INVERSE_709_SSE2_LOOP (13)
201
17.5k
    CSC_INVERSE_709_SSE2_LOOP (14)
202
17.5k
    CSC_INVERSE_709_SSE2_LOOP (15)
203
17.5k
}
204
205
#endif /* IMF_HAVE_SSE2 */
206
207
//
208
// Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr
209
//
210
// Simple FPU color space conversion. Based on the 709
211
// primary chromaticities, with no scaling or offsets.
212
//
213
214
void
215
csc709Forward64 (float* comp0, float* comp1, float* comp2)
216
0
{
217
0
    float src[3];
218
219
0
    for (int i = 0; i < 64; ++i)
220
0
    {
221
0
        src[0] = comp0[i];
222
0
        src[1] = comp1[i];
223
0
        src[2] = comp2[i];
224
225
0
        comp0[i] = 0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2];
226
0
        comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2];
227
0
        comp2[i] = 0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2];
228
0
    }
229
0
}
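Written as a matrix, the forward conversion above is:

\begin{bmatrix} Y' \\ C_B \\ C_R \end{bmatrix} =
\begin{bmatrix}
0.2126  & 0.7152  & 0.0722  \\
-0.1146 & -0.3854 & 0.5000  \\
0.5000  & -0.4542 & -0.0458
\end{bmatrix}
\begin{bmatrix} R' \\ G' \\ B' \end{bmatrix}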
230
231
//
232
// Byte interleaving of 2 byte arrays:
233
//    src0 = AAAA
234
//    src1 = BBBB
235
//    dst  = ABABABAB
236
//
237
// numBytes is the size of each of the source buffers
238
//
239
240
#ifndef IMF_HAVE_SSE2
241
242
//
243
// Scalar default implementation
244
//
245
246
void
247
interleaveByte2 (char* dst, char* src0, char* src1, int numBytes)
248
{
249
    for (int x = 0; x < numBytes; ++x)
250
    {
251
        dst[2 * x]     = src0[x];
252
        dst[2 * x + 1] = src1[x];
253
    }
254
}
255
256
#else /* IMF_HAVE_SSE2 */
257
258
//
259
// SSE2 byte interleaving
260
//
261
262
void
263
interleaveByte2 (char* dst, char* src0, char* src1, int numBytes)
264
17.7k
{
265
17.7k
    int dstAlignment  = (size_t) dst % 16;
266
17.7k
    int src0Alignment = (size_t) src0 % 16;
267
17.7k
    int src1Alignment = (size_t) src1 % 16;
268
269
17.7k
    __m128i* dst_epi8  = (__m128i*) dst;
270
17.7k
    __m128i* src0_epi8 = (__m128i*) src0;
271
17.7k
    __m128i* src1_epi8 = (__m128i*) src1;
272
17.7k
    int      sseWidth  = numBytes / 16;
273
274
17.7k
    if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment))
275
3.49k
    {
276
3.49k
        __m128i tmp0, tmp1;
277
278
        //
279
        // Aligned loads and stores
280
        //
281
282
75.1k
        for (int x = 0; x < sseWidth; ++x)
283
71.6k
        {
284
71.6k
            tmp0 = src0_epi8[x];
285
71.6k
            tmp1 = src1_epi8[x];
286
287
71.6k
            _mm_stream_si128 (&dst_epi8[2 * x], _mm_unpacklo_epi8 (tmp0, tmp1));
288
289
71.6k
            _mm_stream_si128 (
290
71.6k
                &dst_epi8[2 * x + 1], _mm_unpackhi_epi8 (tmp0, tmp1));
291
71.6k
        }
292
293
        //
294
        // Then run the leftovers one at a time
295
        //
296
297
13.5k
        for (int x = 16 * sseWidth; x < numBytes; ++x)
298
10.0k
        {
299
10.0k
            dst[2 * x]     = src0[x];
300
10.0k
            dst[2 * x + 1] = src1[x];
301
10.0k
        }
302
3.49k
    }
303
14.2k
    else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8))
304
1.50k
    {
305
        //
306
        // Aligned stores, but catch up a few values so we can
307
        // use aligned loads
308
        //
309
310
10.9k
        for (int x = 0; x < std::min (numBytes, 8); ++x)
311
9.49k
        {
312
9.49k
            dst[2 * x]     = src0[x];
313
9.49k
            dst[2 * x + 1] = src1[x];
314
9.49k
        }
315
316
1.50k
        if (numBytes > 8)
317
734
        {
318
734
            dst_epi8  = (__m128i*) &dst[16];
319
734
            src0_epi8 = (__m128i*) &src0[8];
320
734
            src1_epi8 = (__m128i*) &src1[8];
321
734
            sseWidth  = (numBytes - 8) / 16;
322
323
42.4k
            for (int x = 0; x < sseWidth; ++x)
324
41.7k
            {
325
41.7k
                _mm_stream_si128 (
326
41.7k
                    &dst_epi8[2 * x],
327
41.7k
                    _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));
328
329
41.7k
                _mm_stream_si128 (
330
41.7k
                    &dst_epi8[2 * x + 1],
331
41.7k
                    _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
332
41.7k
            }
333
334
            //
335
            // Then run the leftovers one at a time
336
            //
337
338
4.26k
            for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
339
3.53k
            {
340
3.53k
                dst[2 * x]     = src0[x];
341
3.53k
                dst[2 * x + 1] = src1[x];
342
3.53k
            }
343
734
        }
344
1.50k
    }
345
12.7k
    else
346
12.7k
    {
347
        //
348
        // Unaligned everything
349
        //
350
351
344k
        for (int x = 0; x < sseWidth; ++x)
352
332k
        {
353
332k
            __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]);
354
332k
            __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]);
355
356
332k
            _mm_storeu_si128 (
357
332k
                &dst_epi8[2 * x],
358
332k
                _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
359
360
332k
            _mm_storeu_si128 (
361
332k
                &dst_epi8[2 * x + 1],
362
332k
                _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
363
332k
        }
364
365
        //
366
        // Then run the leftovers one at a time
367
        //
368
369
93.3k
        for (int x = 16 * sseWidth; x < numBytes; ++x)
370
80.5k
        {
371
80.5k
            dst[2 * x]     = src0[x];
372
80.5k
            dst[2 * x + 1] = src1[x];
373
80.5k
        }
374
12.7k
    }
375
17.7k
}
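The unpacklo/unpackhi pair used in all three branches above is what produces the interleave. A minimal sketch for a single 16-byte block from each source (assuming the SSE2 headers pulled in via ImfSimd.h):

static void
interleave16 (char* dst, const char* src0, const char* src1)
{
    __m128i a = _mm_loadu_si128 ((const __m128i*) src0); // A0 .. A15
    __m128i b = _mm_loadu_si128 ((const __m128i*) src1); // B0 .. B15

    // unpacklo -> A0 B0 A1 B1 ... A7 B7, unpackhi -> A8 B8 ... A15 B15,
    // so the two stores together write the 32-byte run A0 B0 ... A15 B15.
    _mm_storeu_si128 ((__m128i*) dst,     _mm_unpacklo_epi8 (a, b));
    _mm_storeu_si128 ((__m128i*) dst + 1, _mm_unpackhi_epi8 (a, b));
}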
376
377
#endif /* IMF_HAVE_SSE2 */
378
379
//
380
// Float -> half float conversion
381
//
382
// To enable F16C based conversion, we can't rely on compile-time
383
// detection, hence the multiple defined versions. Pick one based
384
// on runtime cpuid detection.
385
//
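The runtime dispatch described above amounts to picking a function pointer once, based on the cpuid check. A minimal sketch (cpuHasF16c is a hypothetical stand-in for that feature flag, and the two variants it selects between are defined further below):

typedef void (*FloatToHalf64Fn) (unsigned short* dst, float* src);

static FloatToHalf64Fn
pickFloatToHalf64 (bool cpuHasF16c)
{
    // Prefer the F16C routine when the CPU supports it; otherwise fall
    // back to the portable scalar conversion.
    return cpuHasF16c ? convertFloatToHalf64_f16c
                      : convertFloatToHalf64_scalar;
}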
386
387
//
388
// Default boring conversion
389
//
390
391
void
392
convertFloatToHalf64_scalar (unsigned short* dst, float* src)
393
0
{
394
0
    for (int i = 0; i < 64; ++i)
395
0
        dst[i] = ((half) src[i]).bits ();
396
0
}
397
398
#ifdef IMF_HAVE_NEON_AARCH64
399
400
void
401
convertFloatToHalf64_neon (unsigned short* dst, float* src)
402
{
403
    for (int i = 0; i < 64; i += 8)
404
    {
405
        float32x4x2_t vec_fp32 = vld1q_f32_x2 (src + i);
406
        vst1q_u16 (
407
            dst + i,
408
            vcombine_u16 (
409
                vreinterpret_u16_f16 (vcvt_f16_f32 (vec_fp32.val[0])),
410
                vreinterpret_u16_f16 (vcvt_f16_f32 (vec_fp32.val[1]))));
411
    }
412
}
413
#endif
414
415
//
416
// F16C conversion - Assumes aligned src and dst
417
//
418
419
void
420
convertFloatToHalf64_f16c (unsigned short* dst, float* src)
421
227k
{
422
    //
423
    // Ordinarily, I'd avoid using inline asm and prefer intrinsics.
424
    // However, in order to get the intrinsics, we need to tell
425
    // the compiler to generate VEX instructions.
426
    //
427
    // (On the GCC side, -mf16c goes ahead and activates -mavx,
428
    //  resulting in VEX code. Without -mf16c, no intrinsics..)
429
    //
430
    // Now, it's quite likely that we'll find ourselves in situations
431
    // where we want to build *without* VEX, in order to maintain
432
    // maximum compatibility. But to get there with intrinsics,
433
    // we'd need to break out code into a separate file. Bleh.
434
    // I'll take the asm.
435
    //
436
437
227k
#if defined IMF_HAVE_GCC_INLINEASM_X86
438
227k
    __asm__ ("vmovaps       (%0),     %%ymm0         \n"
439
227k
             "vmovaps   0x20(%0),     %%ymm1         \n"
440
227k
             "vmovaps   0x40(%0),     %%ymm2         \n"
441
227k
             "vmovaps   0x60(%0),     %%ymm3         \n"
442
227k
             "vcvtps2ph $0,           %%ymm0, %%xmm0 \n"
443
227k
             "vcvtps2ph $0,           %%ymm1, %%xmm1 \n"
444
227k
             "vcvtps2ph $0,           %%ymm2, %%xmm2 \n"
445
227k
             "vcvtps2ph $0,           %%ymm3, %%xmm3 \n"
446
227k
             "vmovdqa   %%xmm0,       0x00(%1)       \n"
447
227k
             "vmovdqa   %%xmm1,       0x10(%1)       \n"
448
227k
             "vmovdqa   %%xmm2,       0x20(%1)       \n"
449
227k
             "vmovdqa   %%xmm3,       0x30(%1)       \n"
450
227k
             "vmovaps   0x80(%0),     %%ymm0         \n"
451
227k
             "vmovaps   0xa0(%0),     %%ymm1         \n"
452
227k
             "vmovaps   0xc0(%0),     %%ymm2         \n"
453
227k
             "vmovaps   0xe0(%0),     %%ymm3         \n"
454
227k
             "vcvtps2ph $0,           %%ymm0, %%xmm0 \n"
455
227k
             "vcvtps2ph $0,           %%ymm1, %%xmm1 \n"
456
227k
             "vcvtps2ph $0,           %%ymm2, %%xmm2 \n"
457
227k
             "vcvtps2ph $0,           %%ymm3, %%xmm3 \n"
458
227k
             "vmovdqa   %%xmm0,       0x40(%1)       \n"
459
227k
             "vmovdqa   %%xmm1,       0x50(%1)       \n"
460
227k
             "vmovdqa   %%xmm2,       0x60(%1)       \n"
461
227k
             "vmovdqa   %%xmm3,       0x70(%1)       \n"
462
227k
#    ifndef __AVX__
463
227k
             "vzeroupper                             \n"
464
227k
#    endif     /* __AVX__ */
465
227k
             : /* Output  */
466
227k
             : /* Input   */ "r"(src), "r"(dst)
467
227k
#    ifndef __AVX__
468
227k
             : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
469
#    else
470
             : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"
471
#    endif /* __AVX__ */
472
227k
    );
473
#else
474
    convertFloatToHalf64_scalar (dst, src);
475
#endif /* IMF_HAVE_GCC_INLINEASM_X86 */
476
227k
}
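For reference, an intrinsics rendering of what the asm above does; this is only a sketch, since (as the comment explains) using these intrinsics would force the translation unit to be built with -mf16c and hence VEX encodings:

static void
convertFloatToHalf64_f16c_intrinsics (unsigned short* dst, float* src)
{
    // 8 floats per iteration: aligned 32-byte load, convert to 8 halfs
    // with round-to-nearest (the $0 immediate in the asm), aligned store.
    for (int i = 0; i < 64; i += 8)
    {
        __m256  f = _mm256_load_ps (src + i);
        __m128i h = _mm256_cvtps_ph (f, 0);
        _mm_store_si128 ((__m128i*) (dst + i), h);
    }
}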
477
478
//
479
// Convert an 8x8 block of HALF from zig-zag order to
480
// FLOAT in normal order. The order we want is:
481
//
482
//          src                           dst
483
//  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
484
//  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
485
// 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
486
// 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53
487
// 32 33 34 35 36 37 38 39      10 19 23 32 39 45 52 54
488
// 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
489
// 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
490
// 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
491
//
492
493
void
494
fromHalfZigZag_scalar (unsigned short* src, float* dst)
495
0
{
496
0
    half* srcHalf = (half*) src;
497
498
0
    dst[0] = (float) srcHalf[0];
499
0
    dst[1] = (float) srcHalf[1];
500
0
    dst[2] = (float) srcHalf[5];
501
0
    dst[3] = (float) srcHalf[6];
502
0
    dst[4] = (float) srcHalf[14];
503
0
    dst[5] = (float) srcHalf[15];
504
0
    dst[6] = (float) srcHalf[27];
505
0
    dst[7] = (float) srcHalf[28];
506
0
    dst[8] = (float) srcHalf[2];
507
0
    dst[9] = (float) srcHalf[4];
508
509
0
    dst[10] = (float) srcHalf[7];
510
0
    dst[11] = (float) srcHalf[13];
511
0
    dst[12] = (float) srcHalf[16];
512
0
    dst[13] = (float) srcHalf[26];
513
0
    dst[14] = (float) srcHalf[29];
514
0
    dst[15] = (float) srcHalf[42];
515
0
    dst[16] = (float) srcHalf[3];
516
0
    dst[17] = (float) srcHalf[8];
517
0
    dst[18] = (float) srcHalf[12];
518
0
    dst[19] = (float) srcHalf[17];
519
520
0
    dst[20] = (float) srcHalf[25];
521
0
    dst[21] = (float) srcHalf[30];
522
0
    dst[22] = (float) srcHalf[41];
523
0
    dst[23] = (float) srcHalf[43];
524
0
    dst[24] = (float) srcHalf[9];
525
0
    dst[25] = (float) srcHalf[11];
526
0
    dst[26] = (float) srcHalf[18];
527
0
    dst[27] = (float) srcHalf[24];
528
0
    dst[28] = (float) srcHalf[31];
529
0
    dst[29] = (float) srcHalf[40];
530
531
0
    dst[30] = (float) srcHalf[44];
532
0
    dst[31] = (float) srcHalf[53];
533
0
    dst[32] = (float) srcHalf[10];
534
0
    dst[33] = (float) srcHalf[19];
535
0
    dst[34] = (float) srcHalf[23];
536
0
    dst[35] = (float) srcHalf[32];
537
0
    dst[36] = (float) srcHalf[39];
538
0
    dst[37] = (float) srcHalf[45];
539
0
    dst[38] = (float) srcHalf[52];
540
0
    dst[39] = (float) srcHalf[54];
541
542
0
    dst[40] = (float) srcHalf[20];
543
0
    dst[41] = (float) srcHalf[22];
544
0
    dst[42] = (float) srcHalf[33];
545
0
    dst[43] = (float) srcHalf[38];
546
0
    dst[44] = (float) srcHalf[46];
547
0
    dst[45] = (float) srcHalf[51];
548
0
    dst[46] = (float) srcHalf[55];
549
0
    dst[47] = (float) srcHalf[60];
550
0
    dst[48] = (float) srcHalf[21];
551
0
    dst[49] = (float) srcHalf[34];
552
553
0
    dst[50] = (float) srcHalf[37];
554
0
    dst[51] = (float) srcHalf[47];
555
0
    dst[52] = (float) srcHalf[50];
556
0
    dst[53] = (float) srcHalf[56];
557
0
    dst[54] = (float) srcHalf[59];
558
0
    dst[55] = (float) srcHalf[61];
559
0
    dst[56] = (float) srcHalf[35];
560
0
    dst[57] = (float) srcHalf[36];
561
0
    dst[58] = (float) srcHalf[48];
562
0
    dst[59] = (float) srcHalf[49];
563
564
0
    dst[60] = (float) srcHalf[57];
565
0
    dst[61] = (float) srcHalf[58];
566
0
    dst[62] = (float) srcHalf[62];
567
0
    dst[63] = (float) srcHalf[63];
568
0
}
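The table above is the standard 8x8 zig-zag scan. A small sketch of a generator for the same mapping (a hypothetical helper, not part of the library) that can be used to cross-check the hand-unrolled assignments: dst[p] = src[zz[p]] for p = 0..63.

static void
buildZigZagIndices (int zz[64])
{
    int k = 0;

    for (int d = 0; d < 15; ++d) // walk the 15 anti-diagonals
    {
        int lo = std::max (0, d - 7);
        int hi = std::min (d, 7);

        if (d & 1) // odd diagonals run top-right to bottom-left
            for (int row = lo; row <= hi; ++row)
                zz[row * 8 + (d - row)] = k++;
        else       // even diagonals run bottom-left to top-right
            for (int row = hi; row >= lo; --row)
                zz[row * 8 + (d - row)] = k++;
    }
}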
569
570
//
571
// If we can form the correct ordering in xmm registers,
572
// we can use F16C to convert from HALF -> FLOAT. However,
573
// making the correct order isn't trivial.
574
//
575
// We want to re-order a source 8x8 matrix from:
576
//
577
//  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
578
//  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
579
// 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
580
// 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53   (A)
581
// 32 33 34 35 36 37 38 39  --> 10 19 23 32 39 45 52 54
582
// 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
583
// 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
584
// 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
585
//
586
// Which looks like a mess, right?
587
//
588
// Now, check out the NE/SW diagonals of (A). Along those lines,
589
// we have runs of contiguous values! If we rewrite (A) a bit, we get:
590
//
591
//  0
592
//  1  2
593
//  5  4  3
594
//  6  7  8  9
595
// 14 13 12 11 10
596
// 15 16 17 18 19 20
597
// 27 26 25 24 23 22 21            (B)
598
// 28 29 30 31 32 33 34 35
599
//    42 41 40 39 38 37 36
600
//       43 44 45 46 47 48
601
//          53 52 51 50 49
602
//             54 55 56 57
603
//                60 59 58
604
//                   61 62
605
//                      63
606
//
607
// In this ordering, the columns are the rows of (A). If we can 'transpose'
608
// (B), we'll achieve our goal. But we want this to fit nicely into
609
// xmm registers and still be able to load large runs efficiently.
610
// Also, notice that the odd rows are in ascending order, while
611
// the even rows are in descending order.
612
//
613
// If we 'fold' the bottom half up into the top, we can preserve ordered
614
// runs across rows, and still keep all the correct values in columns.
615
// After transposing, we'll need to rotate things back into place.
616
// This gives us:
617
//
618
//  0 | 42   41   40   39   38   37   36
619
//  1    2 | 43   44   45   46   47   48
620
//  5    4    3 | 53   52   51   50   49
621
//  6    7    8    9 | 54   55   56   57      (C)
622
// 14   13   12   11   10 | 60   59   58
623
// 15   16   17   18   19   20 | 61   62
624
// 27   26   25   24   23   22   21 | 63
625
// 28   29   30   31   32   33   34   35
626
//
627
// But hang on. We still have the backwards descending rows to deal with.
628
// Let's reverse the even rows so that all values are in ascending order
629
//
630
//  36   37  38   39   40   41   42 | 0
631
//  1    2 | 43   44   45   46   47   48
632
//  49   50  51   52   53 |  3    4    5
633
//  6    7    8    9 | 54   55   56   57      (D)
634
// 58   59   60 | 10   11   12   13   14
635
// 15   16   17   18   19   20 | 61   62
636
// 63 | 21   22   23   24   25   26   27
637
// 28   29   30   31   32   33   34   35
638
//
639
// If we can form (D), we will then:
640
//   1) Reverse the even rows
641
//   2) Transpose
642
//   3) Rotate the rows
643
//
644
// and we'll have (A).
645
//
646
647
void
648
fromHalfZigZag_f16c (unsigned short* src, float* dst)
649
226k
{
650
226k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
651
226k
    __asm__
652
653
        /* x3 <- 0                    
654
            * x8 <- [ 0- 7]              
655
            * x6 <- [56-63]              
656
            * x9 <- [21-28]              
657
            * x7 <- [28-35]              
658
            * x3 <- [ 6- 9] (lower half) */
659
660
226k
        ("vpxor   %%xmm3,  %%xmm3, %%xmm3   \n"
661
226k
         "vmovdqa    (%0), %%xmm8           \n"
662
226k
         "vmovdqa 112(%0), %%xmm6           \n"
663
226k
         "vmovdqu  42(%0), %%xmm9           \n"
664
226k
         "vmovdqu  56(%0), %%xmm7           \n"
665
226k
         "vmovq    12(%0), %%xmm3           \n"
666
667
         /* Setup rows 0-2 of A in xmm0-xmm2 
668
            * x1 <- x8 >> 16 (1 value)     
669
            * x2 <- x8 << 32 (2 values)    
670
            * x0 <- alignr([35-42], x8, 2) 
671
            * x1 <- blend(x1, [41-48])     
672
            * x2 <- blend(x2, [49-56])     */
673
674
226k
         "vpsrldq      $2, %%xmm8, %%xmm1   \n"
675
226k
         "vpslldq      $4, %%xmm8, %%xmm2   \n"
676
226k
         "vpalignr     $2, 70(%0), %%xmm8, %%xmm0 \n"
677
226k
         "vpblendw  $0xfc, 82(%0), %%xmm1, %%xmm1 \n"
678
226k
         "vpblendw  $0x1f, 98(%0), %%xmm2, %%xmm2 \n"
679
680
         /* Setup rows 4-6 of A in xmm4-xmm6 
681
            * x4 <- x6 >> 32 (2 values)   
682
            * x5 <- x6 << 16 (1 value)    
683
            * x6 <- alignr(x6,x9,14)      
684
            * x4 <- blend(x4, [ 7-14])    
685
            * x5 <- blend(x5, [15-22])    */
686
687
226k
         "vpsrldq      $4, %%xmm6, %%xmm4         \n"
688
226k
         "vpslldq      $2, %%xmm6, %%xmm5         \n"
689
226k
         "vpalignr    $14, %%xmm6, %%xmm9, %%xmm6 \n"
690
226k
         "vpblendw  $0xf8, 14(%0), %%xmm4, %%xmm4 \n"
691
226k
         "vpblendw  $0x3f, 30(%0), %%xmm5, %%xmm5 \n"
692
693
         /* Load the upper half of row 3 into xmm3 
694
            * x3 <- [54-57] (upper half) */
695
696
226k
         "vpinsrq      $1, 108(%0), %%xmm3, %%xmm3\n"
697
698
         /* Reverse the even rows. We're not using PSHUFB as
699
            * that requires loading an extra constant all the time,
700
            * and we're already pretty memory bound.
701
            */
702
703
226k
         "vpshuflw $0x1b, %%xmm0, %%xmm0          \n"
704
226k
         "vpshuflw $0x1b, %%xmm2, %%xmm2          \n"
705
226k
         "vpshuflw $0x1b, %%xmm4, %%xmm4          \n"
706
226k
         "vpshuflw $0x1b, %%xmm6, %%xmm6          \n"
707
708
226k
         "vpshufhw $0x1b, %%xmm0, %%xmm0          \n"
709
226k
         "vpshufhw $0x1b, %%xmm2, %%xmm2          \n"
710
226k
         "vpshufhw $0x1b, %%xmm4, %%xmm4          \n"
711
226k
         "vpshufhw $0x1b, %%xmm6, %%xmm6          \n"
712
713
226k
         "vpshufd $0x4e, %%xmm0, %%xmm0          \n"
714
226k
         "vpshufd $0x4e, %%xmm2, %%xmm2          \n"
715
226k
         "vpshufd $0x4e, %%xmm4, %%xmm4          \n"
716
226k
         "vpshufd $0x4e, %%xmm6, %%xmm6          \n"
717
718
         /* Transpose xmm0-xmm7 into xmm8-xmm15 */
719
720
226k
         "vpunpcklwd %%xmm1, %%xmm0, %%xmm8       \n"
721
226k
         "vpunpcklwd %%xmm3, %%xmm2, %%xmm9       \n"
722
226k
         "vpunpcklwd %%xmm5, %%xmm4, %%xmm10      \n"
723
226k
         "vpunpcklwd %%xmm7, %%xmm6, %%xmm11      \n"
724
226k
         "vpunpckhwd %%xmm1, %%xmm0, %%xmm12      \n"
725
226k
         "vpunpckhwd %%xmm3, %%xmm2, %%xmm13      \n"
726
226k
         "vpunpckhwd %%xmm5, %%xmm4, %%xmm14      \n"
727
226k
         "vpunpckhwd %%xmm7, %%xmm6, %%xmm15      \n"
728
729
226k
         "vpunpckldq  %%xmm9,  %%xmm8, %%xmm0     \n"
730
226k
         "vpunpckldq %%xmm11, %%xmm10, %%xmm1     \n"
731
226k
         "vpunpckhdq  %%xmm9,  %%xmm8, %%xmm2     \n"
732
226k
         "vpunpckhdq %%xmm11, %%xmm10, %%xmm3     \n"
733
226k
         "vpunpckldq %%xmm13, %%xmm12, %%xmm4     \n"
734
226k
         "vpunpckldq %%xmm15, %%xmm14, %%xmm5     \n"
735
226k
         "vpunpckhdq %%xmm13, %%xmm12, %%xmm6     \n"
736
226k
         "vpunpckhdq %%xmm15, %%xmm14, %%xmm7     \n"
737
738
226k
         "vpunpcklqdq %%xmm1,  %%xmm0, %%xmm8     \n"
739
226k
         "vpunpckhqdq %%xmm1,  %%xmm0, %%xmm9     \n"
740
226k
         "vpunpcklqdq %%xmm3,  %%xmm2, %%xmm10    \n"
741
226k
         "vpunpckhqdq %%xmm3,  %%xmm2, %%xmm11    \n"
742
226k
         "vpunpcklqdq %%xmm4,  %%xmm5, %%xmm12    \n"
743
226k
         "vpunpckhqdq %%xmm5,  %%xmm4, %%xmm13    \n"
744
226k
         "vpunpcklqdq %%xmm7,  %%xmm6, %%xmm14    \n"
745
226k
         "vpunpckhqdq %%xmm7,  %%xmm6, %%xmm15    \n"
746
747
         /* Rotate the rows to get the correct final order. 
748
            * Rotating xmm12 isn't needed, as we can handle
749
            * the rotation in the PUNPCKLQDQ above. Rotating
750
            * xmm8 isn't needed as it's already in the right order           
751
            */
752
753
226k
         "vpalignr  $2,  %%xmm9,  %%xmm9,  %%xmm9 \n"
754
226k
         "vpalignr  $4, %%xmm10, %%xmm10, %%xmm10 \n"
755
226k
         "vpalignr  $6, %%xmm11, %%xmm11, %%xmm11 \n"
756
226k
         "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n"
757
226k
         "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n"
758
226k
         "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n"
759
760
         /* Convert from half -> float */
761
762
226k
         "vcvtph2ps  %%xmm8, %%ymm8            \n"
763
226k
         "vcvtph2ps  %%xmm9, %%ymm9            \n"
764
226k
         "vcvtph2ps %%xmm10, %%ymm10           \n"
765
226k
         "vcvtph2ps %%xmm11, %%ymm11           \n"
766
226k
         "vcvtph2ps %%xmm12, %%ymm12           \n"
767
226k
         "vcvtph2ps %%xmm13, %%ymm13           \n"
768
226k
         "vcvtph2ps %%xmm14, %%ymm14           \n"
769
226k
         "vcvtph2ps %%xmm15, %%ymm15           \n"
770
771
         /* Move float values to dst */
772
773
226k
         "vmovaps    %%ymm8,    (%1)           \n"
774
226k
         "vmovaps    %%ymm9,  32(%1)           \n"
775
226k
         "vmovaps   %%ymm10,  64(%1)           \n"
776
226k
         "vmovaps   %%ymm11,  96(%1)           \n"
777
226k
         "vmovaps   %%ymm12, 128(%1)           \n"
778
226k
         "vmovaps   %%ymm13, 160(%1)           \n"
779
226k
         "vmovaps   %%ymm14, 192(%1)           \n"
780
226k
         "vmovaps   %%ymm15, 224(%1)           \n"
781
226k
#    ifndef __AVX__
782
226k
         "vzeroupper                          \n"
783
226k
#    endif /* __AVX__ */
784
226k
         : /* Output  */
785
226k
         : /* Input   */ "r"(src), "r"(dst)
786
226k
         : /* Clobber */ "memory",
787
226k
#    ifndef __AVX__
788
226k
           "%xmm0",
789
226k
           "%xmm1",
790
226k
           "%xmm2",
791
226k
           "%xmm3",
792
226k
           "%xmm4",
793
226k
           "%xmm5",
794
226k
           "%xmm6",
795
226k
           "%xmm7",
796
226k
           "%xmm8",
797
226k
           "%xmm9",
798
226k
           "%xmm10",
799
226k
           "%xmm11",
800
226k
           "%xmm12",
801
226k
           "%xmm13",
802
226k
           "%xmm14",
803
226k
           "%xmm15"
804
#    else
805
           "%ymm0",
806
           "%ymm1",
807
           "%ymm2",
808
           "%ymm3",
809
           "%ymm4",
810
           "%ymm5",
811
           "%ymm6",
812
           "%ymm7",
813
           "%ymm8",
814
           "%ymm9",
815
           "%ymm10",
816
           "%ymm11",
817
           "%ymm12",
818
           "%ymm13",
819
           "%ymm14",
820
           "%ymm15"
821
#    endif /* __AVX__ */
822
226k
        );
823
824
#else
825
    fromHalfZigZag_scalar (src, dst);
826
#endif /* defined IMF_HAVE_GCC_INLINEASM_X86_64 */
827
226k
}
828
829
#ifdef IMF_HAVE_NEON_AARCH64
830
831
void
832
fromHalfZigZag_neon (unsigned short* __restrict__ src, float* __restrict__ dst)
833
{
834
    uint8x16_t res_tbl[4] = {
835
        {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42},
836
        {3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53},
837
        {10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60},
838
        {21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}};
839
840
    uint8x16x4_t vec_input_l, vec_input_h;
841
842
    for (int i = 0; i < 4; i++)
843
    {
844
        uint8x16x2_t vec_in_u8 = vld2q_u8 ((unsigned char*) (src + 16 * i));
845
        vec_input_l.val[i]     = vec_in_u8.val[0];
846
        vec_input_h.val[i]     = vec_in_u8.val[1];
847
    }
848
849
#    pragma unroll(4)
850
    for (int i = 0; i < 4; i++)
851
    {
852
        uint8x16_t res_vec_l, res_vec_h;
853
        res_vec_l = vqtbl4q_u8 (vec_input_l, res_tbl[i]);
854
        res_vec_h = vqtbl4q_u8 (vec_input_h, res_tbl[i]);
855
        float16x8_t res_vec_l_f16 =
856
            vreinterpretq_f16_u8 (vzip1q_u8 (res_vec_l, res_vec_h));
857
        float16x8_t res_vec_h_f16 =
858
            vreinterpretq_f16_u8 (vzip2q_u8 (res_vec_l, res_vec_h));
859
        vst1q_f32 (dst + i * 16, vcvt_f32_f16 (vget_low_f16 (res_vec_l_f16)));
860
        vst1q_f32 (dst + i * 16 + 4, vcvt_high_f32_f16 (res_vec_l_f16));
861
        vst1q_f32 (
862
            dst + i * 16 + 8, vcvt_f32_f16 (vget_low_f16 (res_vec_h_f16)));
863
        vst1q_f32 (dst + i * 16 + 12, vcvt_high_f32_f16 (res_vec_h_f16));
864
    }
865
}
866
867
#endif // IMF_HAVE_NEON_AARCH64
868
869
//
870
// Inverse 8x8 DCT, only inverting the DC. This assumes that
871
// all AC frequencies are 0.
872
//
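The scale factor used below, 3.535536e-01, is approximately 1/\sqrt{8}, the DC term of the orthonormal 8-point iDCT basis. Applied once per dimension, a DC-only block therefore reconstructs to the same value in every sample:

x_{ij} = \frac{1}{\sqrt{8}} \cdot \frac{1}{\sqrt{8}} \cdot y_{00} = \frac{y_{00}}{8}, \qquad 0 \le i, j < 8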
873
874
#ifndef IMF_HAVE_SSE2
875
876
void
877
dctInverse8x8DcOnly (float* data)
878
{
879
    float val = data[0] * 3.535536e-01f * 3.535536e-01f;
880
881
    for (int i = 0; i < 64; ++i)
882
        data[i] = val;
883
}
884
885
#else /* IMF_HAVE_SSE2 */
886
887
void
888
dctInverse8x8DcOnly (float* data)
889
23.0k
{
890
23.0k
    __m128  src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f);
891
23.0k
    __m128* dst = (__m128*) data;
892
893
391k
    for (int i = 0; i < 16; ++i)
894
368k
        dst[i] = src;
895
23.0k
}
896
897
#endif /* IMF_HAVE_SSE2 */
898
899
//
900
// Full 8x8 Inverse DCT:
901
//
902
// Simple inverse DCT on an 8x8 block, with scalar ops only.
903
//  Operates on data in-place.
904
//
905
// This is based on the iDCT formulation (y = frequency domain,
906
//                                       x = spatial domain)
907
//
908
//    [x0]    [        ][y0]    [        ][y1]
909
//    [x1] =  [  M1    ][y2]  + [  M2    ][y3]
910
//    [x2]    [        ][y4]    [        ][y5]
911
//    [x3]    [        ][y6]    [        ][y7]
912
//
913
//    [x7]    [        ][y0]    [        ][y1]
914
//    [x6] =  [  M1    ][y2]  - [  M2    ][y3]
915
//    [x5]    [        ][y4]    [        ][y5]
916
//    [x4]    [        ][y6]    [        ][y7]
917
//
918
// where M1:             M2:
919
//
920
//   [a  c  a   f]     [b  d  e  g]
921
//   [a  f -a  -c]     [d -g -b -e]
922
//   [a -f -a   c]     [e -b  g  d]
923
//   [a -c  a  -f]     [g -e  d -b]
924
//
925
// and the constants are as defined below.
926
//
927
// If you know how many of the lower rows are zero, that can
928
// be passed in to help speed things up. If you don't know,
929
// just set zeroedRows=0.
930
//
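For reference, the constants used by M1 and M2 come from the DCT basis:

a = \tfrac{1}{2}\cos\tfrac{\pi}{4},\quad
b = \tfrac{1}{2}\cos\tfrac{\pi}{16},\quad
c = \tfrac{1}{2}\cos\tfrac{\pi}{8},\quad
d = \tfrac{1}{2}\cos\tfrac{3\pi}{16},\quad
e = \tfrac{1}{2}\cos\tfrac{5\pi}{16},\quad
f = \tfrac{1}{2}\cos\tfrac{3\pi}{8},\quad
g = \tfrac{1}{2}\cos\tfrac{7\pi}{16}

which the scalar path computes with cosf() and the SSE2 path hard-codes as 3.535536e-01, 4.903927e-01, 4.619398e-01, 4.157349e-01, 2.777855e-01, 1.913422e-01, and 9.754573e-02 respectively.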
931
932
//
933
// Default implementation
934
//
935
936
template <int zeroedRows>
937
void
938
dctInverse8x8_scalar (float* data)
939
0
{
940
0
    const float a = .5f * cosf (3.14159f / 4.0f);
941
0
    const float b = .5f * cosf (3.14159f / 16.0f);
942
0
    const float c = .5f * cosf (3.14159f / 8.0f);
943
0
    const float d = .5f * cosf (3.f * 3.14159f / 16.0f);
944
0
    const float e = .5f * cosf (5.f * 3.14159f / 16.0f);
945
0
    const float f = .5f * cosf (3.f * 3.14159f / 8.0f);
946
0
    const float g = .5f * cosf (7.f * 3.14159f / 16.0f);
947
948
0
    float alpha[4], beta[4], theta[4], gamma[4];
949
950
0
    float* rowPtr = NULL;
951
952
    //
953
    // First pass - row wise.
954
    //
955
    // This looks less-compact than the description above in
956
    // an attempt to fold together common sub-expressions.
957
    //
958
959
0
    for (int row = 0; row < 8 - zeroedRows; ++row)
960
0
    {
961
0
        rowPtr = data + row * 8;
962
963
0
        alpha[0] = c * rowPtr[2];
964
0
        alpha[1] = f * rowPtr[2];
965
0
        alpha[2] = c * rowPtr[6];
966
0
        alpha[3] = f * rowPtr[6];
967
968
0
        beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
969
0
        beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
970
0
        beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
971
0
        beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];
972
973
0
        theta[0] = a * (rowPtr[0] + rowPtr[4]);
974
0
        theta[3] = a * (rowPtr[0] - rowPtr[4]);
975
976
0
        theta[1] = alpha[0] + alpha[3];
977
0
        theta[2] = alpha[1] - alpha[2];
978
979
0
        gamma[0] = theta[0] + theta[1];
980
0
        gamma[1] = theta[3] + theta[2];
981
0
        gamma[2] = theta[3] - theta[2];
982
0
        gamma[3] = theta[0] - theta[1];
983
984
0
        rowPtr[0] = gamma[0] + beta[0];
985
0
        rowPtr[1] = gamma[1] + beta[1];
986
0
        rowPtr[2] = gamma[2] + beta[2];
987
0
        rowPtr[3] = gamma[3] + beta[3];
988
989
0
        rowPtr[4] = gamma[3] - beta[3];
990
0
        rowPtr[5] = gamma[2] - beta[2];
991
0
        rowPtr[6] = gamma[1] - beta[1];
992
0
        rowPtr[7] = gamma[0] - beta[0];
993
0
    }
994
995
    //
996
    // Second pass - column wise.
997
    //
998
999
0
    for (int column = 0; column < 8; ++column)
1000
0
    {
1001
0
        alpha[0] = c * data[16 + column];
1002
0
        alpha[1] = f * data[16 + column];
1003
0
        alpha[2] = c * data[48 + column];
1004
0
        alpha[3] = f * data[48 + column];
1005
1006
0
        beta[0] = b * data[8 + column] + d * data[24 + column] +
1007
0
                  e * data[40 + column] + g * data[56 + column];
1008
1009
0
        beta[1] = d * data[8 + column] - g * data[24 + column] -
1010
0
                  b * data[40 + column] - e * data[56 + column];
1011
1012
0
        beta[2] = e * data[8 + column] - b * data[24 + column] +
1013
0
                  g * data[40 + column] + d * data[56 + column];
1014
1015
0
        beta[3] = g * data[8 + column] - e * data[24 + column] +
1016
0
                  d * data[40 + column] - b * data[56 + column];
1017
1018
0
        theta[0] = a * (data[column] + data[32 + column]);
1019
0
        theta[3] = a * (data[column] - data[32 + column]);
1020
1021
0
        theta[1] = alpha[0] + alpha[3];
1022
0
        theta[2] = alpha[1] - alpha[2];
1023
1024
0
        gamma[0] = theta[0] + theta[1];
1025
0
        gamma[1] = theta[3] + theta[2];
1026
0
        gamma[2] = theta[3] - theta[2];
1027
0
        gamma[3] = theta[0] - theta[1];
1028
1029
0
        data[column]      = gamma[0] + beta[0];
1030
0
        data[8 + column]  = gamma[1] + beta[1];
1031
0
        data[16 + column] = gamma[2] + beta[2];
1032
0
        data[24 + column] = gamma[3] + beta[3];
1033
1034
0
        data[32 + column] = gamma[3] - beta[3];
1035
0
        data[40 + column] = gamma[2] - beta[2];
1036
0
        data[48 + column] = gamma[1] - beta[1];
1037
0
        data[56 + column] = gamma[0] - beta[0];
1038
0
    }
1039
0
}
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<0>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<1>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<2>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<3>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<4>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<5>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<6>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_scalar<7>(float*)
1040
1041
//
1042
// SSE2 Implementation
1043
//
1044
1045
template <int zeroedRows>
1046
void
1047
dctInverse8x8_sse2 (float* data)
1048
0
{
1049
0
#ifdef IMF_HAVE_SSE2
1050
0
    __m128 a = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
1051
0
    __m128 b = {4.903927e-01f, 4.903927e-01f, 4.903927e-01f, 4.903927e-01f};
1052
0
    __m128 c = {4.619398e-01f, 4.619398e-01f, 4.619398e-01f, 4.619398e-01f};
1053
0
    __m128 d = {4.157349e-01f, 4.157349e-01f, 4.157349e-01f, 4.157349e-01f};
1054
0
    __m128 e = {2.777855e-01f, 2.777855e-01f, 2.777855e-01f, 2.777855e-01f};
1055
0
    __m128 f = {1.913422e-01f, 1.913422e-01f, 1.913422e-01f, 1.913422e-01f};
1056
0
    __m128 g = {9.754573e-02f, 9.754573e-02f, 9.754573e-02f, 9.754573e-02f};
1057
1058
0
    __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
1059
0
    __m128 c1 = {4.619398e-01f, 1.913422e-01f, -1.913422e-01f, -4.619398e-01f};
1060
0
    __m128 c2 = {3.535536e-01f, -3.535536e-01f, -3.535536e-01f, 3.535536e-01f};
1061
0
    __m128 c3 = {1.913422e-01f, -4.619398e-01f, 4.619398e-01f, -1.913422e-01f};
1062
1063
0
    __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f};
1064
0
    __m128 c5 = {4.157349e-01f, -9.754573e-02f, -4.903927e-01f, -2.777855e-01f};
1065
0
    __m128 c6 = {2.777855e-01f, -4.903927e-01f, 9.754573e-02f, 4.157349e-01f};
1066
0
    __m128 c7 = {9.754573e-02f, -2.777855e-01f, 4.157349e-01f, -4.903927e-01f};
1067
1068
0
    __m128* srcVec = (__m128*) data;
1069
0
    __m128  x[8], evenSum, oddSum;
1070
0
    __m128  in[8], alpha[4], beta[4], theta[4], gamma[4];
1071
1072
    //
1073
    // Rows -
1074
    //
1075
    //  Treat this just like matrix-vector multiplication. The
1076
    //  trick is to note that:
1077
    //
1078
    //    [M00 M01 M02 M03][v0]   [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)]
1079
    //    [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)]
1080
    //    [M20 M21 M22 M23][v2]   [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)]
1081
    //    [M30 M31 M32 M33][v3]   [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)]
1082
    //
1083
    // Then, we can fill a register with v_i and multiply by the i-th column
1084
    // of M, accumulating across all i-s.
1085
    //
1086
    // The kids refer to the populating of a register with a single value
1087
    // "broadcasting", and it can be done with a shuffle instruction. It
1088
    // seems to be the slowest part of the whole ordeal.
1089
    //
1090
    // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and
1091
    // c4-7 are from M2.
1092
    //
1093
1094
0
#    define DCT_INVERSE_8x8_SS2_ROW_LOOP(i)                                    \
1095
        /*                                                              \
1096
             * Broadcast the components of the row                          \
1097
             */      \
1098
0
                                                                               \
1099
0
        x[0] = _mm_shuffle_ps (                                                \
1100
0
            srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (0, 0, 0, 0));           \
1101
0
                                                                               \
1102
0
        x[1] = _mm_shuffle_ps (                                                \
1103
0
            srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (1, 1, 1, 1));           \
1104
0
                                                                               \
1105
0
        x[2] = _mm_shuffle_ps (                                                \
1106
0
            srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (2, 2, 2, 2));           \
1107
0
                                                                               \
1108
0
        x[3] = _mm_shuffle_ps (                                                \
1109
0
            srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (3, 3, 3, 3));           \
1110
0
                                                                               \
1111
0
        x[4] = _mm_shuffle_ps (                                                \
1112
0
            srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (0, 0, 0, 0));   \
1113
0
                                                                               \
1114
0
        x[5] = _mm_shuffle_ps (                                                \
1115
0
            srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (1, 1, 1, 1));   \
1116
0
                                                                               \
1117
0
        x[6] = _mm_shuffle_ps (                                                \
1118
0
            srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (2, 2, 2, 2));   \
1119
0
                                                                               \
1120
0
        x[7] = _mm_shuffle_ps (                                                \
1121
0
            srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (3, 3, 3, 3));   \
1122
        /*                                                              \
1123
             * Multiply the components by each column of the matrix         \
1124
             */      \
1125
0
                                                                               \
1126
0
        x[0] = _mm_mul_ps (x[0], c0);                                          \
1127
0
        x[2] = _mm_mul_ps (x[2], c1);                                          \
1128
0
        x[4] = _mm_mul_ps (x[4], c2);                                          \
1129
0
        x[6] = _mm_mul_ps (x[6], c3);                                          \
1130
0
                                                                               \
1131
0
        x[1] = _mm_mul_ps (x[1], c4);                                          \
1132
0
        x[3] = _mm_mul_ps (x[3], c5);                                          \
1133
0
        x[5] = _mm_mul_ps (x[5], c6);                                          \
1134
0
        x[7] = _mm_mul_ps (x[7], c7);                                          \
1135
0
                                                                               \
1136
        /*                                                              \
1137
             * Add across                                                   \
1138
             */      \
1139
0
                                                                               \
1140
0
        evenSum = _mm_setzero_ps ();                                           \
1141
0
        evenSum = _mm_add_ps (evenSum, x[0]);                                  \
1142
0
        evenSum = _mm_add_ps (evenSum, x[2]);                                  \
1143
0
        evenSum = _mm_add_ps (evenSum, x[4]);                                  \
1144
0
        evenSum = _mm_add_ps (evenSum, x[6]);                                  \
1145
0
                                                                               \
1146
0
        oddSum = _mm_setzero_ps ();                                            \
1147
0
        oddSum = _mm_add_ps (oddSum, x[1]);                                    \
1148
0
        oddSum = _mm_add_ps (oddSum, x[3]);                                    \
1149
0
        oddSum = _mm_add_ps (oddSum, x[5]);                                    \
1150
0
        oddSum = _mm_add_ps (oddSum, x[7]);                                    \
1151
0
                                                                               \
1152
        /*                                                              \
1153
             * Final Sum:                                                   \
1154
             *    out [0, 1, 2, 3] = evenSum + oddSum                       \
1155
             *    out [7, 6, 5, 4] = evenSum - oddSum                       \
1156
             */      \
1157
0
                                                                               \
1158
0
        srcVec[2 * i]     = _mm_add_ps (evenSum, oddSum);                      \
1159
0
        srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum);                      \
1160
0
        srcVec[2 * i + 1] = _mm_shuffle_ps (                                   \
1161
0
            srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (0, 1, 2, 3))
1162
1163
0
    switch (zeroedRows)
1164
0
    {
1165
0
        case 0:
1166
0
        default:
1167
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1168
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1169
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2);
1170
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3);
1171
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4);
1172
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (5);
1173
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (6);
1174
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (7);
1175
0
            break;
1176
1177
0
        case 1:
1178
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1179
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1180
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2);
1181
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3);
1182
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4);
1183
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (5);
1184
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (6);
1185
0
            break;
1186
1187
0
        case 2:
1188
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1189
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1190
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2);
1191
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3);
1192
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4);
1193
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (5);
1194
0
            break;
1195
1196
0
        case 3:
1197
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1198
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1199
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2);
1200
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3);
1201
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4);
1202
0
            break;
1203
1204
0
        case 4:
1205
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1206
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1207
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2);
1208
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3);
1209
0
            break;
1210
1211
0
        case 5:
1212
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1213
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1214
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2);
1215
0
            break;
1216
1217
0
        case 6:
1218
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0);
1219
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1);
1220
0
            break;
1221
1222
0
        case 7: DCT_INVERSE_8x8_SS2_ROW_LOOP (0); break;
1223
0
    }
1224
1225
0
#    undef DCT_INVERSE_8x8_SS2_ROW_LOOP
1226
    //
1227
    // Columns -
1228
    //
1229
    // This is slightly more straightforward, if less readable. Here
1230
    // we just operate on 4 columns at a time, in two batches.
1231
    //
1232
    // The slight mess is to try and cache sub-expressions, which
1233
    // we ignore in the row-wise pass.
1234
    //
1235
1236
0
    for (int col = 0; col < 2; ++col)
1237
0
    {
1238
1239
0
        for (int i = 0; i < 8; ++i)
1240
0
            in[i] = srcVec[2 * i + col];
1241
1242
0
        alpha[0] = _mm_mul_ps (c, in[2]);
1243
0
        alpha[1] = _mm_mul_ps (f, in[2]);
1244
0
        alpha[2] = _mm_mul_ps (c, in[6]);
1245
0
        alpha[3] = _mm_mul_ps (f, in[6]);
1246
1247
0
        beta[0] = _mm_add_ps (
1248
0
            _mm_add_ps (_mm_mul_ps (in[1], b), _mm_mul_ps (in[3], d)),
1249
0
            _mm_add_ps (_mm_mul_ps (in[5], e), _mm_mul_ps (in[7], g)));
1250
1251
0
        beta[1] = _mm_sub_ps (
1252
0
            _mm_sub_ps (_mm_mul_ps (in[1], d), _mm_mul_ps (in[3], g)),
1253
0
            _mm_add_ps (_mm_mul_ps (in[5], b), _mm_mul_ps (in[7], e)));
1254
1255
0
        beta[2] = _mm_add_ps (
1256
0
            _mm_sub_ps (_mm_mul_ps (in[1], e), _mm_mul_ps (in[3], b)),
1257
0
            _mm_add_ps (_mm_mul_ps (in[5], g), _mm_mul_ps (in[7], d)));
1258
1259
0
        beta[3] = _mm_add_ps (
1260
0
            _mm_sub_ps (_mm_mul_ps (in[1], g), _mm_mul_ps (in[3], e)),
1261
0
            _mm_sub_ps (_mm_mul_ps (in[5], d), _mm_mul_ps (in[7], b)));
1262
1263
0
        theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4]));
1264
0
        theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4]));
1265
1266
0
        theta[1] = _mm_add_ps (alpha[0], alpha[3]);
1267
0
        theta[2] = _mm_sub_ps (alpha[1], alpha[2]);
1268
1269
0
        gamma[0] = _mm_add_ps (theta[0], theta[1]);
1270
0
        gamma[1] = _mm_add_ps (theta[3], theta[2]);
1271
0
        gamma[2] = _mm_sub_ps (theta[3], theta[2]);
1272
0
        gamma[3] = _mm_sub_ps (theta[0], theta[1]);
1273
1274
0
        srcVec[col]     = _mm_add_ps (gamma[0], beta[0]);
1275
0
        srcVec[2 + col] = _mm_add_ps (gamma[1], beta[1]);
1276
0
        srcVec[4 + col] = _mm_add_ps (gamma[2], beta[2]);
1277
0
        srcVec[6 + col] = _mm_add_ps (gamma[3], beta[3]);
1278
1279
0
        srcVec[8 + col]  = _mm_sub_ps (gamma[3], beta[3]);
1280
0
        srcVec[10 + col] = _mm_sub_ps (gamma[2], beta[2]);
1281
0
        srcVec[12 + col] = _mm_sub_ps (gamma[1], beta[1]);
1282
0
        srcVec[14 + col] = _mm_sub_ps (gamma[0], beta[0]);
1283
0
    }
1284
1285
#else /* IMF_HAVE_SSE2 */
1286
1287
    dctInverse8x8_scalar<zeroedRows> (data);
1288
1289
#endif /* IMF_HAVE_SSE2 */
1290
0
}
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<0>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<1>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<2>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<3>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<4>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<5>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<6>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_sse2<7>(float*)
1291
1292
//
1293
// AVX Implementation
1294
//
1295
1296
// clang-format off
1297
1298
#define STR(A) #A
1299
1300
#define IDCT_AVX_SETUP_2_ROWS(_DST0,  _DST1,  _TMP0,  _TMP1, \
1301
                              _OFF00, _OFF01, _OFF10, _OFF11) \
1302
    "vmovaps                 " STR(_OFF00) "(%0),  %%xmm" STR(_TMP0) "  \n" \
1303
    "vmovaps                 " STR(_OFF01) "(%0),  %%xmm" STR(_TMP1) "  \n" \
1304
    "                                                                                \n" \
1305
    "vinsertf128  $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) "  \n" \
1306
    "vinsertf128  $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) "  \n" \
1307
    "                                                                                \n" \
1308
    "vunpcklpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST0) "  \n" \
1309
    "vunpckhpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST1) "  \n" \
1310
    "                                                                                \n" \
1311
    "vunpcklps      %%ymm" STR(_DST1) ",  %%ymm" STR(_DST0) ",  %%ymm" STR(_TMP0) "  \n" \
1312
    "vunpckhps      %%ymm" STR(_DST1) ",  %%ymm" STR(_DST0) ",  %%ymm" STR(_TMP1) "  \n" \
1313
    "                                                                                \n" \
1314
    "vunpcklpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST0) "  \n" \
1315
    "vunpckhpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST1) "  \n" 
1316
1317
#define IDCT_AVX_MMULT_ROWS(_SRC)                       \
1318
    /* Broadcast the source values into y12-y15 */      \
1319
    "vpermilps $0x00, " STR(_SRC) ", %%ymm12       \n"  \
1320
    "vpermilps $0x55, " STR(_SRC) ", %%ymm13       \n"  \
1321
    "vpermilps $0xaa, " STR(_SRC) ", %%ymm14       \n"  \
1322
    "vpermilps $0xff, " STR(_SRC) ", %%ymm15       \n"  \
1323
                                                        \
1324
    /* Multiply the coefs by the broadcast values */        \
1325
    "vmulps    %%ymm12,  %%ymm8, %%ymm12     \n"        \
1326
    "vmulps    %%ymm13,  %%ymm9, %%ymm13     \n"        \
1327
    "vmulps    %%ymm14, %%ymm10, %%ymm14     \n"        \
1328
    "vmulps    %%ymm15, %%ymm11, %%ymm15     \n"        \
1329
                                                        \
1330
    /* Accumulate the result back into the source */    \
1331
    "vaddps    %%ymm13, %%ymm12, %%ymm12      \n"       \
1332
    "vaddps    %%ymm15, %%ymm14, %%ymm14      \n"       \
1333
    "vaddps    %%ymm14, %%ymm12, " STR(_SRC) "\n"     
1334
1335
#define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK)      \
1336
    "vsubps   " STR(_ODD) "," STR(_EVEN) "," STR(_BACK)  "\n"  \
1337
    "vaddps   " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n"  \
1338
    /* Reverse the back half                                */ \
1339
    "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n"  
1340
1341
/* In order to allow for fast paths when we know certain rows
1342
 * of the 8x8 block are zero, most of the body of the DCT is
1343
 * in the following macro. Statements are wrapped in a ROWn()
1344
 * macro, where n is the lowest row in the 8x8 block on which
1345
 * they depend.
1346
 *
1347
 * This should work for the cases where we have 2-8 full rows.
1348
 * The 1-row case is special, and we'll handle it separately.
1349
 */
1350
#define IDCT_AVX_BODY \
1351
    /* ==============================================               
1352
     *               Row 1D DCT                                     
1353
     * ----------------------------------------------
1354
     */                                                           \
1355
                                                                  \
1356
    /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds 
1357
     * the row-major 8x8 block, load ymm0-3 with the even columns
1358
     * and ymm4-7 with the odd columns. The lower half of the ymm
1359
     * holds one row, while the upper half holds the next row.
1360
     *
1361
     * If our source is:
1362
     *    a0 a1 a2 a3   a4 a5 a6 a7
1363
     *    b0 b1 b2 b3   b4 b5 b6 b7
1364
     *
1365
     * We'll be forming:
1366
     *    a0 a2 a4 a6   b0 b2 b4 b6
1367
     *    a1 a3 a5 a7   b1 b3 b5 b7
1368
     */                                                              \
1369
    ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15,    0,  16,  32,  48) ) \
1370
    ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13,   64,  80,  96, 112) ) \
1371
    ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11,  128, 144, 160, 176) ) \
1372
    ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7,  8,  9,  192, 208, 224, 240) ) \
1373
                                                                     \
1374
    /* Multiply the even columns (ymm0-3) by the matrix M1
1375
     * storing the results back in ymm0-3
1376
     *
1377
     * Assume that (%1) holds the matrix in column major order
1378
     */                                                              \
1379
    "vbroadcastf128   (%1),  %%ymm8         \n"                      \
1380
    "vbroadcastf128 16(%1),  %%ymm9         \n"                      \
1381
    "vbroadcastf128 32(%1), %%ymm10         \n"                      \
1382
    "vbroadcastf128 48(%1), %%ymm11         \n"                      \
1383
                                                                     \
1384
    ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) )                              \
1385
    ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) )                              \
1386
    ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) )                              \
1387
    ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) )                              \
1388
                                                                     \
1389
    /* Repeat, but with the odd columns (ymm4-7) and the 
1390
     * matrix M2
1391
     */                                                              \
1392
    "vbroadcastf128  64(%1),  %%ymm8         \n"                     \
1393
    "vbroadcastf128  80(%1),  %%ymm9         \n"                     \
1394
    "vbroadcastf128  96(%1), %%ymm10         \n"                     \
1395
    "vbroadcastf128 112(%1), %%ymm11         \n"                     \
1396
                                                                     \
1397
    ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) )                              \
1398
    ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) )                              \
1399
    ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) )                              \
1400
    ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) )                              \
1401
                                                                     \
1402
    /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the 
1403
     * front halves of the results, and difference to get the 
1404
     * back halves. The front halves end up in ymm0-3, the back
1405
     * halves end up in ymm12-15. 
1406
     */                                                                \
1407
    ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \
1408
    ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \
1409
    ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \
1410
    ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \
1411
                                                                       \
1412
    /* Reassemble the row halves into ymm0-7  */                       \
1413
    ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7   \n" )           \
1414
    ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6   \n" )           \
1415
    ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5   \n" )           \
1416
    ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4   \n" )           \
1417
    ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3   \n" )           \
1418
    ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2   \n" )           \
1419
    ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1   \n" )           \
1420
    ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n" )           \
1421
                                                                       \
1422
                                                                       \
1423
    /* ==============================================
1424
     *                Column 1D DCT 
1425
     * ----------------------------------------------
1426
     */                                                                \
1427
                                                                       \
1428
    /* Rows should be in ymm0-7, and M2 columns should still be 
1429
     * preserved in ymm8-11.  M2 has 4 unique values (and +- 
1430
     * versions of each), and all (positive) values appear in 
1431
     * the first column (and row), which is in ymm8.
1432
     *
1433
     * For the column-wise DCT, we need to:
1434
     *   1) Broadcast each element of a row of M2 into 4 vectors
1435
     *   2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts.
1436
     *   3) Accumulate into ymm12-15 for the odd outputs.
1437
     *
1438
     * Instead of doing 16 broadcasts for each element in M2, 
1439
     * do 4, filling y8-11 with:
1440
     *
1441
     *     ymm8:  [ b  b  b  b  | b  b  b  b ]
1442
     *     ymm9:  [ d  d  d  d  | d  d  d  d ]
1443
     *     ymm10: [ e  e  e  e  | e  e  e  e ]
1444
     *     ymm11: [ g  g  g  g  | g  g  g  g ]
1445
     * 
1446
     * And deal with the negative values by subtracting during accum.
1447
     */                                                                \
1448
    "vpermilps        $0xff,  %%ymm8, %%ymm11  \n"                     \
1449
    "vpermilps        $0xaa,  %%ymm8, %%ymm10  \n"                     \
1450
    "vpermilps        $0x55,  %%ymm8, %%ymm9   \n"                     \
1451
    "vpermilps        $0x00,  %%ymm8, %%ymm8   \n"                     \
1452
                                                                       \
1453
    /* This one is easy, since we have ymm12-15 open for scratch   
1454
     *    ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7 
1455
     */                                                                \
1456
    ROW1( "vmulps    %%ymm1,  %%ymm8, %%ymm12    \n" )                 \
1457
    ROW3( "vmulps    %%ymm3,  %%ymm9, %%ymm13    \n" )                 \
1458
    ROW5( "vmulps    %%ymm5, %%ymm10, %%ymm14    \n" )                 \
1459
    ROW7( "vmulps    %%ymm7, %%ymm11, %%ymm15    \n" )                 \
1460
                                                                       \
1461
    ROW3( "vaddps   %%ymm12, %%ymm13, %%ymm12    \n" )                 \
1462
    ROW7( "vaddps   %%ymm14, %%ymm15, %%ymm14    \n" )                 \
1463
    ROW5( "vaddps   %%ymm12, %%ymm14, %%ymm12    \n" )                 \
1464
                                                                       \
1465
    /* Trickier, since only y13-15 are open for scratch
1466
     *    ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7 
1467
     */                                                                \
1468
    ROW1( "vmulps    %%ymm1,   %%ymm9, %%ymm13   \n" )                 \
1469
    ROW3( "vmulps    %%ymm3,  %%ymm11, %%ymm14   \n" )                 \
1470
    ROW5( "vmulps    %%ymm5,   %%ymm8, %%ymm15   \n" )                 \
1471
                                                                       \
1472
    ROW5( "vaddps    %%ymm14, %%ymm15, %%ymm14   \n" )                 \
1473
    ROW3( "vsubps    %%ymm14, %%ymm13, %%ymm13   \n" )                 \
1474
                                                                       \
1475
    ROW7( "vmulps    %%ymm7,  %%ymm10, %%ymm15   \n" )                 \
1476
    ROW7( "vsubps    %%ymm15, %%ymm13, %%ymm13   \n" )                 \
1477
                                                                       \
1478
    /* Trickier still, as only y14-15 are open for scratch
1479
     *    ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7 
1480
     */                                                                \
1481
    ROW1( "vmulps     %%ymm1, %%ymm10,  %%ymm14  \n" )                 \
1482
    ROW3( "vmulps     %%ymm3,  %%ymm8,  %%ymm15  \n" )                 \
1483
                                                                       \
1484
    ROW3( "vsubps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1485
                                                                       \
1486
    ROW5( "vmulps     %%ymm5, %%ymm11, %%ymm15   \n" )                 \
1487
    ROW5( "vaddps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1488
                                                                       \
1489
    ROW7( "vmulps    %%ymm7,   %%ymm9, %%ymm15   \n" )                 \
1490
    ROW7( "vaddps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1491
                                                                       \
1492
                                                                       \
1493
    /* Easy, as we can blow away ymm1,3,5,7 for scratch
1494
     *    ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7 
1495
     */                                                                \
1496
    ROW1( "vmulps    %%ymm1, %%ymm11, %%ymm15    \n" )                 \
1497
    ROW3( "vmulps    %%ymm3, %%ymm10,  %%ymm3    \n" )                 \
1498
    ROW5( "vmulps    %%ymm5,  %%ymm9,  %%ymm5    \n" )                 \
1499
    ROW7( "vmulps    %%ymm7,  %%ymm8,  %%ymm7    \n" )                 \
1500
                                                                       \
1501
    ROW5( "vaddps   %%ymm15,  %%ymm5, %%ymm15    \n" )                 \
1502
    ROW7( "vaddps    %%ymm3,  %%ymm7,  %%ymm3    \n" )                 \
1503
    ROW3( "vsubps    %%ymm3, %%ymm15, %%ymm15    \n" )                 \
1504
                                                                       \
1505
                                                                       \
1506
    /* Load coefs for M1. Because we're going to broadcast
1507
     * coefs, we don't need to load the actual structure from
1508
     * M1. Instead, just load enough that we can broadcast.
1509
     * There are only 6 unique values in M1, but they're in +-
1510
     * pairs, leaving only 3 unique coefs if we add and subtract 
1511
     * properly.
1512
     *
1513
     * Fill      ymm1 with coef[2] = [ a  a  c  f | a  a  c  f ]
1514
     * Broadcast ymm5 with           [ f  f  f  f | f  f  f  f ]
1515
     * Broadcast ymm3 with           [ c  c  c  c | c  c  c  c ]
1516
     * Broadcast ymm1 with           [ a  a  a  a | a  a  a  a ]
1517
     */                                                                \
1518
    "vbroadcastf128   8(%1),  %%ymm1          \n"                      \
1519
    "vpermilps        $0xff,  %%ymm1, %%ymm5  \n"                      \
1520
    "vpermilps        $0xaa,  %%ymm1, %%ymm3  \n"                      \
1521
    "vpermilps        $0x00,  %%ymm1, %%ymm1  \n"                      \
1522
                                                                       \
1523
    /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following 
1524
     * common expressions:
1525
     *
1526
     *   E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) 
1527
     *   E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
1528
     * 
1529
     *   E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
1530
     *   E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1531
     *
1532
     * Afterwards, ymm8-11 will hold the even outputs.
1533
     */                                                                \
1534
                                                                       \
1535
    /*  ymm11 = (a ymm0 + a ymm4),   ymm1 = (a ymm0 - a ymm4) */       \
1536
    ROW0( "vmulps    %%ymm1,  %%ymm0, %%ymm11   \n" )                  \
1537
    ROW4( "vmulps    %%ymm1,  %%ymm4,  %%ymm4   \n" )                  \
1538
    ROW0( "vmovaps   %%ymm11, %%ymm1            \n" )                  \
1539
    ROW4( "vaddps    %%ymm4, %%ymm11, %%ymm11   \n" )                  \
1540
    ROW4( "vsubps    %%ymm4,  %%ymm1,  %%ymm1   \n" )                  \
1541
                                                                       \
1542
    /* ymm7 = (c ymm2 + f ymm6) */                                     \
1543
    ROW2( "vmulps    %%ymm3, %%ymm2,  %%ymm7    \n" )                  \
1544
    ROW6( "vmulps    %%ymm5, %%ymm6,  %%ymm9    \n" )                  \
1545
    ROW6( "vaddps    %%ymm9, %%ymm7,  %%ymm7    \n" )                  \
1546
                                                                       \
1547
    /* E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) 
1548
     * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6) 
1549
     */                                                                \
1550
    ROW0( "vmovaps   %%ymm11, %%ymm8            \n" )                  \
1551
    ROW2( "vaddps     %%ymm7, %%ymm8,  %%ymm8   \n" )                  \
1552
    ROW2( "vsubps     %%ymm7, %%ymm11, %%ymm11  \n" )                  \
1553
                                                                       \
1554
    /* ymm7 = (f ymm2 - c ymm6) */                                     \
1555
    ROW2( "vmulps     %%ymm5,  %%ymm2, %%ymm7   \n" )                  \
1556
    ROW6( "vmulps     %%ymm3,  %%ymm6, %%ymm9   \n" )                  \
1557
    ROW6( "vsubps     %%ymm9,  %%ymm7, %%ymm7   \n" )                  \
1558
                                                                       \
1559
    /* E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6) 
1560
     * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1561
     */                                                                \
1562
    ROW0( "vmovaps   %%ymm1,  %%ymm9            \n" )                  \
1563
    ROW0( "vmovaps   %%ymm1, %%ymm10            \n" )                  \
1564
    ROW2( "vaddps    %%ymm7,  %%ymm1,  %%ymm9   \n" )                  \
1565
    ROW2( "vsubps    %%ymm7,  %%ymm1,  %%ymm10  \n" )                  \
1566
                                                                       \
1567
    /* Add the even (ymm8-11) and the odds (ymm12-15), 
1568
     * placing the results into ymm0-7 
1569
     */                                                                \
1570
    "vaddps   %%ymm12,  %%ymm8, %%ymm0       \n"                       \
1571
    "vaddps   %%ymm13,  %%ymm9, %%ymm1       \n"                       \
1572
    "vaddps   %%ymm14, %%ymm10, %%ymm2       \n"                       \
1573
    "vaddps   %%ymm15, %%ymm11, %%ymm3       \n"                       \
1574
                                                                       \
1575
    "vsubps   %%ymm12,  %%ymm8, %%ymm7       \n"                       \
1576
    "vsubps   %%ymm13,  %%ymm9, %%ymm6       \n"                       \
1577
    "vsubps   %%ymm14, %%ymm10, %%ymm5       \n"                       \
1578
    "vsubps   %%ymm15, %%ymm11, %%ymm4       \n"                       \
1579
                                                                       \
1580
    /* Copy out the results from ymm0-7  */                            \
1581
    "vmovaps   %%ymm0,    (%0)                   \n"                   \
1582
    "vmovaps   %%ymm1,  32(%0)                   \n"                   \
1583
    "vmovaps   %%ymm2,  64(%0)                   \n"                   \
1584
    "vmovaps   %%ymm3,  96(%0)                   \n"                   \
1585
    "vmovaps   %%ymm4, 128(%0)                   \n"                   \
1586
    "vmovaps   %%ymm5, 160(%0)                   \n"                   \
1587
    "vmovaps   %%ymm6, 192(%0)                   \n"                   \
1588
    "vmovaps   %%ymm7, 224(%0)                   \n"            
1589
1590
/* Output, input, and clobber (OIC) sections of the inline asm */
1591
#define IDCT_AVX_OIC(_IN0)                          \
1592
226k
        : /* Output  */                            \
1593
226k
        : /* Input   */ "r"(_IN0), "r"(sAvxCoef)      \
1594
226k
        : /* Clobber */ "memory",                  \
1595
226k
                        "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", \
1596
226k
                        "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7", \
1597
226k
                        "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",\
1598
226k
                        "%xmm12", "%xmm13", "%xmm14", "%xmm15" 
1599
1600
/* Include vzeroupper for non-AVX builds                */
1601
#ifndef __AVX__ 
1602
    #define IDCT_AVX_ASM(_IN0)   \
1603
213k
        __asm__(                 \
1604
213k
            IDCT_AVX_BODY        \
1605
213k
            "vzeroupper      \n" \
1606
213k
            IDCT_AVX_OIC(_IN0)   \
1607
213k
        );                       
1608
#else /* __AVX__ */
1609
    #define IDCT_AVX_ASM(_IN0)   \
1610
        __asm__(                 \
1611
            IDCT_AVX_BODY        \
1612
            IDCT_AVX_OIC(_IN0)   \
1613
        );                       
1614
#endif /* __AVX__ */
1615
1616
// clang-format on
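The ROWn() wrapping described above works purely at preprocessing time: redefining a ROWn macro to expand to nothing makes every statement wrapped in it disappear from the emitted inline-asm string. The following tiny standalone demonstration was written for this report (the DEMO_ROW* names are hypothetical and not part of the header); it shows the same pruning trick on ordinary C++ statements.

#define DEMO_ROW5(_X) _X
#define DEMO_ROW6(_X) _X

static void demoFullBlock (float* acc)
{
    DEMO_ROW5 (acc[5] += 1.0f;)   // kept: row 5 participates
    DEMO_ROW6 (acc[6] += 1.0f;)   // kept: row 6 participates
}

#undef DEMO_ROW6
#define DEMO_ROW6(_X)             // rows 6-7 known to be zero: drop the work

static void demoPrunedBlock (float* acc)
{
    DEMO_ROW5 (acc[5] += 1.0f;)   // still kept
    DEMO_ROW6 (acc[6] += 1.0f;)   // expands to nothing
}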
1617
1618
template <int zeroedRows>
1619
void
1620
dctInverse8x8_avx (float* data)
1621
226k
{
1622
226k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
226k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
226k
        3.535536e-01,  3.535536e-01,
1634
226k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
226k
        4.619398e-01,  1.913422e-01,
1636
226k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
226k
        3.535536e-01,  -3.535536e-01,
1638
226k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
226k
        1.913422e-01,  -4.619398e-01,
1640
226k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
226k
        4.903927e-01,  4.157349e-01,
1643
226k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
226k
        4.157349e-01,  -9.754573e-02,
1645
226k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
226k
        2.777855e-01,  -4.903927e-01,
1647
226k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
226k
        9.754573e-02,  -2.777855e-01,
1649
226k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
226k
    };
1651
1652
226k
#    define ROW0(_X) _X
1653
226k
#    define ROW1(_X) _X
1654
226k
#    define ROW2(_X) _X
1655
226k
#    define ROW3(_X) _X
1656
226k
#    define ROW4(_X) _X
1657
226k
#    define ROW5(_X) _X
1658
226k
#    define ROW6(_X) _X
1659
226k
#    define ROW7(_X) _X
1660
1661
226k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
24.2k
    else if (zeroedRows == 1)
1663
1.71k
    {
1664
1665
1.71k
#    undef ROW7
1666
1.71k
#    define ROW7(_X)
1667
1.71k
        IDCT_AVX_ASM (data)
1668
1.71k
    }
1669
22.5k
    else if (zeroedRows == 2)
1670
127
    {
1671
1672
127
#    undef ROW6
1673
127
#    define ROW6(_X)
1674
127
        IDCT_AVX_ASM (data)
1675
127
    }
1676
22.3k
    else if (zeroedRows == 3)
1677
1.61k
    {
1678
1679
1.61k
#    undef ROW5
1680
1.61k
#    define ROW5(_X)
1681
1.61k
        IDCT_AVX_ASM (data)
1682
1.61k
    }
1683
20.7k
    else if (zeroedRows == 4)
1684
713
    {
1685
1686
713
#    undef ROW4
1687
713
#    define ROW4(_X)
1688
713
        IDCT_AVX_ASM (data)
1689
713
    }
1690
20.0k
    else if (zeroedRows == 5)
1691
6.10k
    {
1692
1693
6.10k
#    undef ROW3
1694
6.10k
#    define ROW3(_X)
1695
6.10k
        IDCT_AVX_ASM (data)
1696
6.10k
    }
1697
13.9k
    else if (zeroedRows == 6)
1698
1.02k
    {
1699
1700
1.02k
#    undef ROW2
1701
1.02k
#    define ROW2(_X)
1702
1.02k
        IDCT_AVX_ASM (data)
1703
1.02k
    }
1704
12.9k
    else if (zeroedRows == 7)
1705
12.9k
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
12.9k
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
12.9k
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
12.9k
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
12.9k
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
12.9k
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
12.9k
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
12.9k
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
12.9k
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
12.9k
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
12.9k
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
12.9k
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
12.9k
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
12.9k
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
12.9k
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
12.9k
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
12.9k
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
12.9k
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
12.9k
                    "vmovaps %%ymm0,    (%0)          \n"
1749
12.9k
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
12.9k
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
12.9k
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
12.9k
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
12.9k
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
12.9k
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
12.9k
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
12.9k
#    ifndef __AVX__
1758
12.9k
                    "vzeroupper                   \n"
1759
12.9k
#    endif /* __AVX__ */
1760
12.9k
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
12.9k
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
226k
}
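The per-instantiation counts that follow come from a caller in ImfDwaCompressor.cpp, which presumably selects one of the eight instantiations at run time based on how many trailing rows of the 8x8 block are zero. A hypothetical dispatch sketch is shown below; the function name and table are illustrative only, not the library's actual caller.

typedef void (*IdctFunction) (float*);

static void
idctForZeroedRows (float* block, int zeroedRows)
{
    // One entry per compile-time instantiation of dctInverse8x8_avx.
    static const IdctFunction table[8] = {
        dctInverse8x8_avx<0>, dctInverse8x8_avx<1>,
        dctInverse8x8_avx<2>, dctInverse8x8_avx<3>,
        dctInverse8x8_avx<4>, dctInverse8x8_avx<5>,
        dctInverse8x8_avx<6>, dctInverse8x8_avx<7>
    };

    assert (zeroedRows >= 0 && zeroedRows <= 7);
    table[zeroedRows] (block);
}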
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<0>(float*)
Line
Count
Source
1621
202k
{
1622
202k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
202k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
202k
        3.535536e-01,  3.535536e-01,
1634
202k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
202k
        4.619398e-01,  1.913422e-01,
1636
202k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
202k
        3.535536e-01,  -3.535536e-01,
1638
202k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
202k
        1.913422e-01,  -4.619398e-01,
1640
202k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
202k
        4.903927e-01,  4.157349e-01,
1643
202k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
202k
        4.157349e-01,  -9.754573e-02,
1645
202k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
202k
        2.777855e-01,  -4.903927e-01,
1647
202k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
202k
        9.754573e-02,  -2.777855e-01,
1649
202k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
202k
    };
1651
1652
202k
#    define ROW0(_X) _X
1653
202k
#    define ROW1(_X) _X
1654
202k
#    define ROW2(_X) _X
1655
202k
#    define ROW3(_X) _X
1656
202k
#    define ROW4(_X) _X
1657
202k
#    define ROW5(_X) _X
1658
202k
#    define ROW6(_X) _X
1659
202k
#    define ROW7(_X) _X
1660
1661
202k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
0
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
0
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
0
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
0
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
0
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
0
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
202k
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<1>(float*)
Line
Count
Source
1621
1.71k
{
1622
1.71k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
1.71k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
1.71k
        3.535536e-01,  3.535536e-01,
1634
1.71k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
1.71k
        4.619398e-01,  1.913422e-01,
1636
1.71k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
1.71k
        3.535536e-01,  -3.535536e-01,
1638
1.71k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
1.71k
        1.913422e-01,  -4.619398e-01,
1640
1.71k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
1.71k
        4.903927e-01,  4.157349e-01,
1643
1.71k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
1.71k
        4.157349e-01,  -9.754573e-02,
1645
1.71k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
1.71k
        2.777855e-01,  -4.903927e-01,
1647
1.71k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
1.71k
        9.754573e-02,  -2.777855e-01,
1649
1.71k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
1.71k
    };
1651
1652
1.71k
#    define ROW0(_X) _X
1653
1.71k
#    define ROW1(_X) _X
1654
1.71k
#    define ROW2(_X) _X
1655
1.71k
#    define ROW3(_X) _X
1656
1.71k
#    define ROW4(_X) _X
1657
1.71k
#    define ROW5(_X) _X
1658
1.71k
#    define ROW6(_X) _X
1659
1.71k
#    define ROW7(_X) _X
1660
1661
1.71k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
1.71k
    else if (zeroedRows == 1)
1663
1.71k
    {
1664
1665
1.71k
#    undef ROW7
1666
1.71k
#    define ROW7(_X)
1667
1.71k
        IDCT_AVX_ASM (data)
1668
1.71k
    }
1669
0
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
0
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
0
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
0
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
0
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
1.71k
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<2>(float*)
Line
Count
Source
1621
127
{
1622
127
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
127
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
127
        3.535536e-01,  3.535536e-01,
1634
127
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
127
        4.619398e-01,  1.913422e-01,
1636
127
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
127
        3.535536e-01,  -3.535536e-01,
1638
127
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
127
        1.913422e-01,  -4.619398e-01,
1640
127
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
127
        4.903927e-01,  4.157349e-01,
1643
127
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
127
        4.157349e-01,  -9.754573e-02,
1645
127
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
127
        2.777855e-01,  -4.903927e-01,
1647
127
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
127
        9.754573e-02,  -2.777855e-01,
1649
127
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
127
    };
1651
1652
127
#    define ROW0(_X) _X
1653
127
#    define ROW1(_X) _X
1654
127
#    define ROW2(_X) _X
1655
127
#    define ROW3(_X) _X
1656
127
#    define ROW4(_X) _X
1657
127
#    define ROW5(_X) _X
1658
127
#    define ROW6(_X) _X
1659
127
#    define ROW7(_X) _X
1660
1661
127
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
127
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
127
    else if (zeroedRows == 2)
1670
127
    {
1671
1672
127
#    undef ROW6
1673
127
#    define ROW6(_X)
1674
127
        IDCT_AVX_ASM (data)
1675
127
    }
1676
0
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
0
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
0
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
0
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
127
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<3>(float*)
Line
Count
Source
1621
1.61k
{
1622
1.61k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
1.61k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
1.61k
        3.535536e-01,  3.535536e-01,
1634
1.61k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
1.61k
        4.619398e-01,  1.913422e-01,
1636
1.61k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
1.61k
        3.535536e-01,  -3.535536e-01,
1638
1.61k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
1.61k
        1.913422e-01,  -4.619398e-01,
1640
1.61k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
1.61k
        4.903927e-01,  4.157349e-01,
1643
1.61k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
1.61k
        4.157349e-01,  -9.754573e-02,
1645
1.61k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
1.61k
        2.777855e-01,  -4.903927e-01,
1647
1.61k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
1.61k
        9.754573e-02,  -2.777855e-01,
1649
1.61k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
1.61k
    };
1651
1652
1.61k
#    define ROW0(_X) _X
1653
1.61k
#    define ROW1(_X) _X
1654
1.61k
#    define ROW2(_X) _X
1655
1.61k
#    define ROW3(_X) _X
1656
1.61k
#    define ROW4(_X) _X
1657
1.61k
#    define ROW5(_X) _X
1658
1.61k
#    define ROW6(_X) _X
1659
1.61k
#    define ROW7(_X) _X
1660
1661
1.61k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
1.61k
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
1.61k
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
1.61k
    else if (zeroedRows == 3)
1677
1.61k
    {
1678
1679
1.61k
#    undef ROW5
1680
1.61k
#    define ROW5(_X)
1681
1.61k
        IDCT_AVX_ASM (data)
1682
1.61k
    }
1683
0
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
0
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
0
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
1.61k
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<4>(float*)
Line
Count
Source
1621
713
{
1622
713
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
713
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
713
        3.535536e-01,  3.535536e-01,
1634
713
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
713
        4.619398e-01,  1.913422e-01,
1636
713
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
713
        3.535536e-01,  -3.535536e-01,
1638
713
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
713
        1.913422e-01,  -4.619398e-01,
1640
713
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
713
        4.903927e-01,  4.157349e-01,
1643
713
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
713
        4.157349e-01,  -9.754573e-02,
1645
713
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
713
        2.777855e-01,  -4.903927e-01,
1647
713
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
713
        9.754573e-02,  -2.777855e-01,
1649
713
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
713
    };
1651
1652
713
#    define ROW0(_X) _X
1653
713
#    define ROW1(_X) _X
1654
713
#    define ROW2(_X) _X
1655
713
#    define ROW3(_X) _X
1656
713
#    define ROW4(_X) _X
1657
713
#    define ROW5(_X) _X
1658
713
#    define ROW6(_X) _X
1659
713
#    define ROW7(_X) _X
1660
1661
713
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
713
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
713
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
713
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
713
    else if (zeroedRows == 4)
1684
713
    {
1685
1686
713
#    undef ROW4
1687
713
#    define ROW4(_X)
1688
713
        IDCT_AVX_ASM (data)
1689
713
    }
1690
0
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
0
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
713
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<5>(float*)
Line
Count
Source
1621
6.10k
{
1622
6.10k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
6.10k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
6.10k
        3.535536e-01,  3.535536e-01,
1634
6.10k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
6.10k
        4.619398e-01,  1.913422e-01,
1636
6.10k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
6.10k
        3.535536e-01,  -3.535536e-01,
1638
6.10k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
6.10k
        1.913422e-01,  -4.619398e-01,
1640
6.10k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
6.10k
        4.903927e-01,  4.157349e-01,
1643
6.10k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
6.10k
        4.157349e-01,  -9.754573e-02,
1645
6.10k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
6.10k
        2.777855e-01,  -4.903927e-01,
1647
6.10k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
6.10k
        9.754573e-02,  -2.777855e-01,
1649
6.10k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
6.10k
    };
1651
1652
6.10k
#    define ROW0(_X) _X
1653
6.10k
#    define ROW1(_X) _X
1654
6.10k
#    define ROW2(_X) _X
1655
6.10k
#    define ROW3(_X) _X
1656
6.10k
#    define ROW4(_X) _X
1657
6.10k
#    define ROW5(_X) _X
1658
6.10k
#    define ROW6(_X) _X
1659
6.10k
#    define ROW7(_X) _X
1660
1661
6.10k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
6.10k
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
6.10k
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
6.10k
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
6.10k
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
6.10k
    else if (zeroedRows == 5)
1691
6.10k
    {
1692
1693
6.10k
#    undef ROW3
1694
6.10k
#    define ROW3(_X)
1695
6.10k
        IDCT_AVX_ASM (data)
1696
6.10k
    }
1697
0
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
6.10k
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<6>(float*)
Line
Count
Source
1621
1.02k
{
1622
1.02k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
1.02k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
1.02k
        3.535536e-01,  3.535536e-01,
1634
1.02k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
1.02k
        4.619398e-01,  1.913422e-01,
1636
1.02k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
1.02k
        3.535536e-01,  -3.535536e-01,
1638
1.02k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
1.02k
        1.913422e-01,  -4.619398e-01,
1640
1.02k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
1.02k
        4.903927e-01,  4.157349e-01,
1643
1.02k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
1.02k
        4.157349e-01,  -9.754573e-02,
1645
1.02k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
1.02k
        2.777855e-01,  -4.903927e-01,
1647
1.02k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
1.02k
        9.754573e-02,  -2.777855e-01,
1649
1.02k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
1.02k
    };
1651
1652
1.02k
#    define ROW0(_X) _X
1653
1.02k
#    define ROW1(_X) _X
1654
1.02k
#    define ROW2(_X) _X
1655
1.02k
#    define ROW3(_X) _X
1656
1.02k
#    define ROW4(_X) _X
1657
1.02k
#    define ROW5(_X) _X
1658
1.02k
#    define ROW6(_X) _X
1659
1.02k
#    define ROW7(_X) _X
1660
1661
1.02k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
1.02k
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
1.02k
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
1.02k
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
1.02k
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
1.02k
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
1.02k
    else if (zeroedRows == 6)
1698
1.02k
    {
1699
1700
1.02k
#    undef ROW2
1701
1.02k
#    define ROW2(_X)
1702
1.02k
        IDCT_AVX_ASM (data)
1703
1.02k
    }
1704
0
    else if (zeroedRows == 7)
1705
0
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
0
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
0
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
0
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
0
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
0
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
0
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
0
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
0
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
0
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
0
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
0
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
0
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
0
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
0
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
0
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
0
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
0
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
0
                    "vmovaps %%ymm0,    (%0)          \n"
1749
0
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
0
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
0
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
0
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
0
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
0
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
0
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
0
#    ifndef __AVX__
1758
0
                    "vzeroupper                   \n"
1759
0
#    endif /* __AVX__ */
1760
0
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
0
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
1.02k
}
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<7>(float*)
Line
Count
Source
1621
12.9k
{
1622
12.9k
#if defined IMF_HAVE_GCC_INLINEASM_X86_64
1623
1624
    /* The column-major version of M1, followed by the 
1625
     * column-major version of M2:
1626
     *   
1627
     *          [ a  c  a  f ]          [ b  d  e  g ]
1628
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1629
     *          [ a -f -a  c ]          [ e -b  g  d ]
1630
     *          [ a -c  a -f ]          [ g -e  d -b ]
1631
     */
1632
12.9k
    const float sAvxCoef[32] __attribute__ ((aligned (32))) = {
1633
12.9k
        3.535536e-01,  3.535536e-01,
1634
12.9k
        3.535536e-01,  3.535536e-01, /* a  a  a  a */
1635
12.9k
        4.619398e-01,  1.913422e-01,
1636
12.9k
        -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1637
12.9k
        3.535536e-01,  -3.535536e-01,
1638
12.9k
        -3.535536e-01, 3.535536e-01, /* a -a -a  a */
1639
12.9k
        1.913422e-01,  -4.619398e-01,
1640
12.9k
        4.619398e-01,  -1.913422e-01, /* f -c  c -f */
1641
1642
12.9k
        4.903927e-01,  4.157349e-01,
1643
12.9k
        2.777855e-01,  9.754573e-02, /* b  d  e  g */
1644
12.9k
        4.157349e-01,  -9.754573e-02,
1645
12.9k
        -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1646
12.9k
        2.777855e-01,  -4.903927e-01,
1647
12.9k
        9.754573e-02,  4.157349e-01, /* e -b  g  d */
1648
12.9k
        9.754573e-02,  -2.777855e-01,
1649
12.9k
        4.157349e-01,  -4.903927e-01 /* g -e  d -b */
1650
12.9k
    };
1651
1652
12.9k
#    define ROW0(_X) _X
1653
12.9k
#    define ROW1(_X) _X
1654
12.9k
#    define ROW2(_X) _X
1655
12.9k
#    define ROW3(_X) _X
1656
12.9k
#    define ROW4(_X) _X
1657
12.9k
#    define ROW5(_X) _X
1658
12.9k
#    define ROW6(_X) _X
1659
12.9k
#    define ROW7(_X) _X
1660
1661
12.9k
    if (zeroedRows == 0) { IDCT_AVX_ASM (data) }
1662
12.9k
    else if (zeroedRows == 1)
1663
0
    {
1664
1665
0
#    undef ROW7
1666
0
#    define ROW7(_X)
1667
0
        IDCT_AVX_ASM (data)
1668
0
    }
1669
12.9k
    else if (zeroedRows == 2)
1670
0
    {
1671
1672
0
#    undef ROW6
1673
0
#    define ROW6(_X)
1674
0
        IDCT_AVX_ASM (data)
1675
0
    }
1676
12.9k
    else if (zeroedRows == 3)
1677
0
    {
1678
1679
0
#    undef ROW5
1680
0
#    define ROW5(_X)
1681
0
        IDCT_AVX_ASM (data)
1682
0
    }
1683
12.9k
    else if (zeroedRows == 4)
1684
0
    {
1685
1686
0
#    undef ROW4
1687
0
#    define ROW4(_X)
1688
0
        IDCT_AVX_ASM (data)
1689
0
    }
1690
12.9k
    else if (zeroedRows == 5)
1691
0
    {
1692
1693
0
#    undef ROW3
1694
0
#    define ROW3(_X)
1695
0
        IDCT_AVX_ASM (data)
1696
0
    }
1697
12.9k
    else if (zeroedRows == 6)
1698
0
    {
1699
1700
0
#    undef ROW2
1701
0
#    define ROW2(_X)
1702
0
        IDCT_AVX_ASM (data)
1703
0
    }
1704
12.9k
    else if (zeroedRows == 7)
1705
12.9k
    {
1706
        // autoformatting wants to add a space between the doubled %%
1707
        // clang-format off
1708
1709
12.9k
        __asm__(
1710
1711
            /* ==============================================
1712
                 *                Row 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */
1715
12.9k
            IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48)
1716
1717
12.9k
                "vbroadcastf128   (%1),  %%ymm8         \n"
1718
12.9k
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1719
12.9k
                "vbroadcastf128 32(%1), %%ymm10         \n"
1720
12.9k
                "vbroadcastf128 48(%1), %%ymm11         \n"
1721
1722
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1723
12.9k
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1724
1725
12.9k
            IDCT_AVX_MMULT_ROWS (%% ymm0)
1726
1727
12.9k
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1728
12.9k
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1729
12.9k
                "vbroadcastf128  96(%1), %%ymm10         \n"
1730
12.9k
                "vbroadcastf128 112(%1), %%ymm11         \n"
1731
1732
12.9k
            IDCT_AVX_MMULT_ROWS (%% ymm4)
1733
1734
12.9k
                IDCT_AVX_EO_TO_ROW_HALVES (
1735
12.9k
                    %% ymm0, %% ymm4, %% ymm0, %% ymm12)
1736
1737
12.9k
                    "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1738
1739
                    /* ==============================================
1740
                 *                Column 1D DCT 
1741
                 * ----------------------------------------------
1742
                 */
1743
1744
                    /* DC only, so multiply by a and we're done */
1745
12.9k
                    "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1746
1747
                    /* Copy out results  */
1748
12.9k
                    "vmovaps %%ymm0,    (%0)          \n"
1749
12.9k
                    "vmovaps %%ymm0,  32(%0)          \n"
1750
12.9k
                    "vmovaps %%ymm0,  64(%0)          \n"
1751
12.9k
                    "vmovaps %%ymm0,  96(%0)          \n"
1752
12.9k
                    "vmovaps %%ymm0, 128(%0)          \n"
1753
12.9k
                    "vmovaps %%ymm0, 160(%0)          \n"
1754
12.9k
                    "vmovaps %%ymm0, 192(%0)          \n"
1755
12.9k
                    "vmovaps %%ymm0, 224(%0)          \n"
1756
1757
12.9k
#    ifndef __AVX__
1758
12.9k
                    "vzeroupper                   \n"
1759
12.9k
#    endif /* __AVX__ */
1760
12.9k
            IDCT_AVX_OIC (data));
1761
        // clang-format on
1762
12.9k
    }
1763
0
    else
1764
0
    {
1765
0
        assert (false); // Invalid template instance parameter
1766
0
    }
1767
#else /* IMF_HAVE_GCC_INLINEASM_X86_64 */
1768
1769
    dctInverse8x8_scalar<zeroedRows> (data);
1770
1771
#endif /*  IMF_HAVE_GCC_INLINEASM_X86_64 */
1772
12.9k
}
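For reference, the even/odd factorization that the AVX paths above implement (the M1/M2 matrices shown in the coefficient comments) can be written out as a small scalar routine. The following is a standalone sketch with hypothetical names, assuming the same orthonormal DCT convention as the scalar paths; it is not part of the library source being measured:

#include <cmath>

// Sketch only: reconstruct one 8-sample row from its DCT coefficients by
// sending the even coefficients through M1, the odd ones through M2, and
// recombining the halves with a butterfly (outputs mirror about the center).
static void idct8_even_odd (const float X[8], float x[8])
{
    const float pi = 3.14159265f;
    const float a = .5f * std::cos (4 * pi / 16); // 3.535536e-01
    const float b = .5f * std::cos (1 * pi / 16); // 4.903927e-01
    const float c = .5f * std::cos (2 * pi / 16); // 4.619398e-01
    const float d = .5f * std::cos (3 * pi / 16); // 4.157349e-01
    const float e = .5f * std::cos (5 * pi / 16); // 2.777855e-01
    const float f = .5f * std::cos (6 * pi / 16); // 1.913422e-01
    const float g = .5f * std::cos (7 * pi / 16); // 9.754573e-02

    const float even[4] = { // M1 * [X0 X2 X4 X6]
        a * X[0] + c * X[2] + a * X[4] + f * X[6],
        a * X[0] + f * X[2] - a * X[4] - c * X[6],
        a * X[0] - f * X[2] - a * X[4] + c * X[6],
        a * X[0] - c * X[2] + a * X[4] - f * X[6]};

    const float odd[4] = { // M2 * [X1 X3 X5 X7]
        b * X[1] + d * X[3] + e * X[5] + g * X[7],
        d * X[1] - g * X[3] - b * X[5] - e * X[7],
        e * X[1] - b * X[3] + g * X[5] + d * X[7],
        g * X[1] - e * X[3] + d * X[5] - b * X[7]};

    for (int n = 0; n < 4; ++n)
    {
        x[n]     = even[n] + odd[n];
        x[7 - n] = even[n] - odd[n];
    }
}

The 2D inverse applies this 1D transform to the rows and then to the columns; the zeroedRows template parameter simply lets the AVX code skip rows that are known to be all zero.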
1773
1774
#undef IDCT_AVX_SETUP_2_ROWS
1775
#undef IDCT_AVX_MMULT_ROWS
1776
#undef IDCT_AVX_EO_TO_ROW_HALVES
1777
#undef IDCT_AVX_BODY
1778
#undef IDCT_AVX_OIC
1779
1780
//
1781
// Full 8x8 Forward DCT:
1782
//
1783
// Base forward 8x8 DCT implementation. Works on the data in-place
1784
//
1785
// The implementation is described in Pennebaker + Mitchell,
1786
//  section 4.3.2, and illustrated in figure 4-7
1787
//
1788
// The basic idea is that the 1D DCT math reduces to:
1789
//
1790
//   2*out_0            = c_4 [(s_07 + s_34) + (s_12 + s_56)]
1791
//   2*out_4            = c_4 [(s_07 + s_34) - (s_12 + s_56)]
1792
//
1793
//   {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34))
1794
//
1795
//   {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56),
1796
//                                d_34 - c_4 (d_12 + d_56))
1797
//
1798
//   {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56),
1799
//                               -d_34 - c_4 (d_12 + d_56))
1800
//
1801
// where:
1802
//
1803
//    c_i  = cos(i*pi/16)
1804
//    s_i  = sin(i*pi/16)
1805
//
1806
//    s_ij = in_i + in_j
1807
//    d_ij = in_i - in_j
1808
//
1809
//    rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y}
1810
//
1811
// We'll run the DCT in two passes. First, run the 1D DCT on
1812
// the rows, in-place. Then, run over the columns in-place,
1813
// and be done with it.
1814
//
1815
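Before the implementations, here is a small standalone check (a sketch, not part of the library; names such as dct8_direct and dct8_factored are hypothetical) that the factorization above agrees with the plain orthonormal DCT-II definition for a single 8-sample row:

#include <cmath>
#include <cstdio>

// Direct orthonormal DCT-II of one 8-sample row.
static void dct8_direct (const float in[8], float out[8])
{
    for (int k = 0; k < 8; ++k)
    {
        const float scale = (k == 0) ? std::sqrt (1.f / 8.f) : std::sqrt (2.f / 8.f);
        float acc = 0.f;
        for (int n = 0; n < 8; ++n)
            acc += in[n] * std::cos ((2 * n + 1) * k * 3.14159265f / 16.f);
        out[k] = scale * acc;
    }
}

// The same transform via the butterfly/rotation factorization above.
static void dct8_factored (const float in[8], float out[8])
{
    const float pi = 3.14159265f;
    const float c1 = std::cos (1 * pi / 16), c2 = std::cos (2 * pi / 16);
    const float c3 = std::cos (3 * pi / 16), c4 = std::cos (4 * pi / 16);
    const float c5 = std::cos (5 * pi / 16), c6 = std::cos (6 * pi / 16);
    const float c7 = std::cos (7 * pi / 16);

    const float s07 = in[0] + in[7], d07 = in[0] - in[7];
    const float s12 = in[1] + in[2], d12 = in[1] - in[2];
    const float s34 = in[3] + in[4], d34 = in[3] - in[4];
    const float s56 = in[5] + in[6], d56 = in[5] - in[6];

    float k0 = c4 * (s07 + s34), k1 = c4 * (s12 + s56);
    out[0] = .5f * (k0 + k1);
    out[4] = .5f * (k0 - k1);

    float rx = d12 - d56, ry = s07 - s34;              // rot_6
    out[2] = .5f * (c6 * rx + c2 * ry);
    out[6] = .5f * (c6 * ry - c2 * rx);

    k0 = c4 * (s12 - s56);
    k1 = -c4 * (d12 + d56);

    rx = d07 - k0; ry = d34 + k1;                      // rot_-3
    out[3] = .5f * (c3 * rx - c5 * ry);
    out[5] = .5f * (c5 * rx + c3 * ry);

    rx = d07 + k0; ry = k1 - d34;                      // rot_-1
    out[1] = .5f * (c1 * rx - c7 * ry);
    out[7] = .5f * (c7 * rx + c1 * ry);
}

int main ()
{
    const float in[8] = {1, 2, 3, 5, 8, 13, 21, 34};
    float a[8], b[8];
    dct8_direct (in, a);
    dct8_factored (in, b);
    for (int i = 0; i < 8; ++i)
        std::printf ("out[%d]: direct %f  factored %f\n", i, a[i], b[i]);
    return 0;
}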
1816
#ifndef IMF_HAVE_SSE2
1817
1818
//
1819
// Default implementation
1820
//
1821
1822
void
1823
dctForward8x8 (float* data)
1824
{
1825
    float A0, A1, A2, A3, A4, A5, A6, A7;
1826
    float K0, K1, rot_x, rot_y;
1827
1828
    float* srcPtr = data;
1829
    float* dstPtr = data;
1830
1831
    const float c1 = cosf (3.14159f * 1.0f / 16.0f);
1832
    const float c2 = cosf (3.14159f * 2.0f / 16.0f);
1833
    const float c3 = cosf (3.14159f * 3.0f / 16.0f);
1834
    const float c4 = cosf (3.14159f * 4.0f / 16.0f);
1835
    const float c5 = cosf (3.14159f * 5.0f / 16.0f);
1836
    const float c6 = cosf (3.14159f * 6.0f / 16.0f);
1837
    const float c7 = cosf (3.14159f * 7.0f / 16.0f);
1838
1839
    const float c1Half = .5f * c1;
1840
    const float c2Half = .5f * c2;
1841
    const float c3Half = .5f * c3;
1842
    const float c5Half = .5f * c5;
1843
    const float c6Half = .5f * c6;
1844
    const float c7Half = .5f * c7;
1845
1846
    //
1847
    // First pass - do a 1D DCT over the rows and write the
1848
    //              results back in place
1849
    //
1850
1851
    for (int row = 0; row < 8; ++row)
1852
    {
1853
        float* srcRowPtr = srcPtr + 8 * row;
1854
        float* dstRowPtr = dstPtr + 8 * row;
1855
1856
        A0 = srcRowPtr[0] + srcRowPtr[7];
1857
        A1 = srcRowPtr[1] + srcRowPtr[2];
1858
        A2 = srcRowPtr[1] - srcRowPtr[2];
1859
        A3 = srcRowPtr[3] + srcRowPtr[4];
1860
        A4 = srcRowPtr[3] - srcRowPtr[4];
1861
        A5 = srcRowPtr[5] + srcRowPtr[6];
1862
        A6 = srcRowPtr[5] - srcRowPtr[6];
1863
        A7 = srcRowPtr[0] - srcRowPtr[7];
1864
1865
        K0 = c4 * (A0 + A3);
1866
        K1 = c4 * (A1 + A5);
1867
1868
        dstRowPtr[0] = .5f * (K0 + K1);
1869
        dstRowPtr[4] = .5f * (K0 - K1);
1870
1871
        //
1872
        // (2*dst2, 2*dst6) = rot 6 (d12 - d56,  s07 - s34)
1873
        //
1874
1875
        rot_x = A2 - A6;
1876
        rot_y = A0 - A3;
1877
1878
        dstRowPtr[2] = c6Half * rot_x + c2Half * rot_y;
1879
        dstRowPtr[6] = c6Half * rot_y - c2Half * rot_x;
1880
1881
        //
1882
        // K0 and K1 remain active until dst[1], dst[7],
1883
        //  dst[3], and dst[5] have all been computed.
1884
        //
1885
1886
        K0 = c4 * (A1 - A5);
1887
        K1 = -1 * c4 * (A2 + A6);
1888
1889
        //
1890
        // Two ways to do a rotation:
1891
        //
1892
        //  rot i (x, y) =
1893
        //           X =  c_i*x + s_i*y
1894
        //           Y = -s_i*x + c_i*y
1895
        //
1896
        //        OR
1897
        //
1898
        //           X = c_i*(x+y) + (s_i-c_i)*y
1899
        //           Y = c_i*(x+y) - (s_i+c_i)*x
1900
        //
1901
        // The first case has 4 multiplies but fewer constants, while the
1902
        // 2nd case has fewer multiplies but needs more precomputed
        // constants (a small standalone sketch of both forms follows
        // this function).
1903
1904
        //
1905
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0,  d34 + K1 )
1906
        //
1907
1908
        rot_x = A7 - K0;
1909
        rot_y = A4 + K1;
1910
1911
        dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y;
1912
        dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y;
1913
1914
        //
1915
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0,  K1  - d34 )
1916
        //
1917
1918
        rot_x = A7 + K0;
1919
        rot_y = K1 - A4;
1920
1921
        //
1922
        // A: 4, 7 are inactive. All A's are inactive
1923
        //
1924
1925
        dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y;
1926
        dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y;
1927
    }
1928
1929
    //
1930
    // Second pass - do the same, but on the columns
1931
    //
1932
1933
    for (int column = 0; column < 8; ++column)
1934
    {
1935
1936
        A0 = srcPtr[column] + srcPtr[56 + column];
1937
        A7 = srcPtr[column] - srcPtr[56 + column];
1938
1939
        A1 = srcPtr[8 + column] + srcPtr[16 + column];
1940
        A2 = srcPtr[8 + column] - srcPtr[16 + column];
1941
1942
        A3 = srcPtr[24 + column] + srcPtr[32 + column];
1943
        A4 = srcPtr[24 + column] - srcPtr[32 + column];
1944
1945
        A5 = srcPtr[40 + column] + srcPtr[48 + column];
1946
        A6 = srcPtr[40 + column] - srcPtr[48 + column];
1947
1948
        K0 = c4 * (A0 + A3);
1949
        K1 = c4 * (A1 + A5);
1950
1951
        dstPtr[column]      = .5f * (K0 + K1);
1952
        dstPtr[32 + column] = .5f * (K0 - K1);
1953
1954
        //
1955
        // (2*dst2, 2*dst6) = rot 6 ( d12 - d56,  s07 - s34 )
1956
        //
1957
1958
        rot_x = A2 - A6;
1959
        rot_y = A0 - A3;
1960
1961
        dstPtr[16 + column] = .5f * (c6 * rot_x + c2 * rot_y);
1962
        dstPtr[48 + column] = .5f * (c6 * rot_y - c2 * rot_x);
1963
1964
        //
1965
        // K0 and K1 remain active until dst[1], dst[7],
1966
        //  dst[3], and dst[5] have all been computed.
1967
        //
1968
1969
        K0 = c4 * (A1 - A5);
1970
        K1 = -1 * c4 * (A2 + A6);
1971
1972
        //
1973
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0,  d34 + K1 )
1974
        //
1975
1976
        rot_x = A7 - K0;
1977
        rot_y = A4 + K1;
1978
1979
        dstPtr[24 + column] = .5f * (c3 * rot_x - c5 * rot_y);
1980
        dstPtr[40 + column] = .5f * (c5 * rot_x + c3 * rot_y);
1981
1982
        //
1983
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0,  K1  - d34 )
1984
        //
1985
1986
        rot_x = A7 + K0;
1987
        rot_y = K1 - A4;
1988
1989
        dstPtr[8 + column]  = .5f * (c1 * rot_x - c7 * rot_y);
1990
        dstPtr[56 + column] = .5f * (c7 * rot_x + c1 * rot_y);
1991
    }
1992
}
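The rotation identity mentioned in the comments above can be spelled out as a pair of tiny helpers. This is an illustrative sketch with hypothetical names, not code used by the compressor:

// Both helpers compute rot_i(x, y) = (c*x + s*y, -s*x + c*y).
static inline void rot4mul (float c, float s, float x, float y, float& X, float& Y)
{
    X =  c * x + s * y; // four multiplies, only c and s as constants
    Y = -s * x + c * y;
}

static inline void rot3mul (float c, float s, float x, float y, float& X, float& Y)
{
    const float t = c * (x + y);  // shared product
    X = t + (s - c) * y;          // three multiplies when (s - c) and
    Y = t - (s + c) * x;          // (s + c) are precomputed constants
}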
1993
1994
#else /* IMF_HAVE_SSE2 */
1995
1996
//
1997
// SSE2 implementation
1998
//
1999
// Here, we're always doing a column-wise operation
2000
// plus transposes. This might be faster if handled differently
2001
// for the row-wise and column-wise passes.
2002
//
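The transposes are done per 4x4 block with the usual four-shuffle pattern. The standalone sketch below (a hypothetical helper, not part of the library) shows the same _mm_shuffle_ps masks that appear in the code that follows:

#include <xmmintrin.h>

// Transpose one 4x4 block held as four __m128 rows, in place.
static inline void transpose4x4 (__m128 r[4])
{
    // Pair up the low and high halves of adjacent rows ...
    const __m128 t0 = _mm_shuffle_ps (r[0], r[1], 0x44); // r0[0] r0[1] r1[0] r1[1]
    const __m128 t1 = _mm_shuffle_ps (r[2], r[3], 0x44); // r2[0] r2[1] r3[0] r3[1]
    const __m128 t2 = _mm_shuffle_ps (r[0], r[1], 0xEE); // r0[2] r0[3] r1[2] r1[3]
    const __m128 t3 = _mm_shuffle_ps (r[2], r[3], 0xEE); // r2[2] r2[3] r3[2] r3[3]

    // ... then gather even and odd lanes to form the transposed rows.
    r[0] = _mm_shuffle_ps (t0, t1, 0x88); // r0[0] r1[0] r2[0] r3[0]
    r[1] = _mm_shuffle_ps (t0, t1, 0xDD); // r0[1] r1[1] r2[1] r3[1]
    r[2] = _mm_shuffle_ps (t2, t3, 0x88); // r0[2] r1[2] r2[2] r3[2]
    r[3] = _mm_shuffle_ps (t2, t3, 0xDD); // r0[3] r1[3] r2[3] r3[3]
}

The 8x8 case below applies this pattern to the four 4x4 sub-blocks, transposing the diagonal blocks in place and swapping the off-diagonal pair.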
2003
2004
void
2005
dctForward8x8 (float* data)
2006
0
{
2007
0
    __m128* srcVec = (__m128*) data;
2008
0
    __m128  a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec;
2009
0
    __m128  k0Vec, k1Vec, rotXVec, rotYVec;
2010
0
    __m128  transTmp[4], transTmp2[4];
2011
2012
0
    __m128 c4Vec    = {.70710678f, .70710678f, .70710678f, .70710678f};
2013
0
    __m128 c4NegVec = {-.70710678f, -.70710678f, -.70710678f, -.70710678f};
2014
2015
0
    __m128 c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f};
2016
0
    __m128 c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f};
2017
0
    __m128 c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f};
2018
0
    __m128 c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f};
2019
0
    __m128 c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f};
2020
0
    __m128 c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f};
2021
2022
0
    __m128 halfVec = {.5f, .5f, .5f, .5f};
2023
2024
0
    for (int iter = 0; iter < 2; ++iter)
2025
0
    {
2026
        //
2027
        //  Operate on 4 columns at a time. The
2028
        //    offsets into our row-major array are:
2029
        //                  0:  0      1
2030
        //                  1:  2      3
2031
        //                  2:  4      5
2032
        //                  3:  6      7
2033
        //                  4:  8      9
2034
        //                  5: 10     11
2035
        //                  6: 12     13
2036
        //                  7: 14     15
2037
        //
2038
2039
0
        for (int pass = 0; pass < 2; ++pass)
2040
0
        {
2041
0
            a0Vec = _mm_add_ps (srcVec[0 + pass], srcVec[14 + pass]);
2042
0
            a1Vec = _mm_add_ps (srcVec[2 + pass], srcVec[4 + pass]);
2043
0
            a3Vec = _mm_add_ps (srcVec[6 + pass], srcVec[8 + pass]);
2044
0
            a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]);
2045
2046
0
            a7Vec = _mm_sub_ps (srcVec[0 + pass], srcVec[14 + pass]);
2047
0
            a2Vec = _mm_sub_ps (srcVec[2 + pass], srcVec[4 + pass]);
2048
0
            a4Vec = _mm_sub_ps (srcVec[6 + pass], srcVec[8 + pass]);
2049
0
            a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]);
2050
2051
            //
2052
            // First stage; Compute out_0 and out_4
2053
            //
2054
2055
0
            k0Vec = _mm_add_ps (a0Vec, a3Vec);
2056
0
            k1Vec = _mm_add_ps (a1Vec, a5Vec);
2057
2058
0
            k0Vec = _mm_mul_ps (c4Vec, k0Vec);
2059
0
            k1Vec = _mm_mul_ps (c4Vec, k1Vec);
2060
2061
0
            srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec);
2062
0
            srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec);
2063
2064
0
            srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec);
2065
0
            srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec);
2066
2067
            //
2068
            // Second stage; Compute out_2 and out_6
2069
            //
2070
2071
0
            k0Vec = _mm_sub_ps (a2Vec, a6Vec);
2072
0
            k1Vec = _mm_sub_ps (a0Vec, a3Vec);
2073
2074
0
            srcVec[4 + pass] = _mm_add_ps (
2075
0
                _mm_mul_ps (c6HalfVec, k0Vec), _mm_mul_ps (c2HalfVec, k1Vec));
2076
2077
0
            srcVec[12 + pass] = _mm_sub_ps (
2078
0
                _mm_mul_ps (c6HalfVec, k1Vec), _mm_mul_ps (c2HalfVec, k0Vec));
2079
2080
            //
2081
            // Precompute K0 and K1 for the remaining stages
2082
            //
2083
2084
0
            k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec);
2085
0
            k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec);
2086
2087
            //
2088
            // Third Stage, compute out_3 and out_5
2089
            //
2090
2091
0
            rotXVec = _mm_sub_ps (a7Vec, k0Vec);
2092
0
            rotYVec = _mm_add_ps (a4Vec, k1Vec);
2093
2094
0
            srcVec[6 + pass] = _mm_sub_ps (
2095
0
                _mm_mul_ps (c3HalfVec, rotXVec),
2096
0
                _mm_mul_ps (c5HalfVec, rotYVec));
2097
2098
0
            srcVec[10 + pass] = _mm_add_ps (
2099
0
                _mm_mul_ps (c5HalfVec, rotXVec),
2100
0
                _mm_mul_ps (c3HalfVec, rotYVec));
2101
2102
            //
2103
            // Fourth Stage, compute out_1 and out_7
2104
            //
2105
2106
0
            rotXVec = _mm_add_ps (a7Vec, k0Vec);
2107
0
            rotYVec = _mm_sub_ps (k1Vec, a4Vec);
2108
2109
0
            srcVec[2 + pass] = _mm_sub_ps (
2110
0
                _mm_mul_ps (c1HalfVec, rotXVec),
2111
0
                _mm_mul_ps (c7HalfVec, rotYVec));
2112
2113
0
            srcVec[14 + pass] = _mm_add_ps (
2114
0
                _mm_mul_ps (c7HalfVec, rotXVec),
2115
0
                _mm_mul_ps (c1HalfVec, rotYVec));
2116
0
        }
2117
2118
        //
2119
        // Transpose the matrix, in 4x4 blocks. So, if we have our
2120
        // 8x8 matrix divided into 4x4 blocks:
2121
        //
2122
        //         M0 | M1         M0t | M2t
2123
        //        ----+---   -->  -----+------
2124
        //         M2 | M3         M1t | M3t
2125
        //
2126
2127
        //
2128
        // M0t, done in place, the first half.
2129
        //
2130
2131
0
        transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44);
2132
0
        transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44);
2133
0
        transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE);
2134
0
        transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE);
2135
2136
        //
2137
        // M3t, also done in place, the first half.
2138
        //
2139
2140
0
        transTmp2[0] = _mm_shuffle_ps (srcVec[9], srcVec[11], 0x44);
2141
0
        transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44);
2142
0
        transTmp2[2] = _mm_shuffle_ps (srcVec[9], srcVec[11], 0xEE);
2143
0
        transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE);
2144
2145
        //
2146
        // M0t, the second half.
2147
        //
2148
2149
0
        srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
2150
0
        srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
2151
0
        srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
2152
0
        srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
2153
2154
        //
2155
        // M3t, the second half.
2156
        //
2157
2158
0
        srcVec[9]  = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
2159
0
        srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
2160
0
        srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
2161
0
        srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
2162
2163
        //
2164
        // M1 and M2 need to be done at the same time, because we're
2165
        //  swapping.
2166
        //
2167
        // First, the first half of M1t
2168
        //
2169
2170
0
        transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44);
2171
0
        transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44);
2172
0
        transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE);
2173
0
        transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE);
2174
2175
        //
2176
        // And the first half of M2t
2177
        //
2178
2179
0
        transTmp2[0] = _mm_shuffle_ps (srcVec[8], srcVec[10], 0x44);
2180
0
        transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44);
2181
0
        transTmp2[2] = _mm_shuffle_ps (srcVec[8], srcVec[10], 0xEE);
2182
0
        transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE);
2183
2184
        //
2185
        // Second half of M1t
2186
        //
2187
2188
0
        srcVec[8]  = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
2189
0
        srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
2190
0
        srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
2191
0
        srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
2192
2193
        //
2194
        // Second half of M2
2195
        //
2196
2197
0
        srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
2198
0
        srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
2199
0
        srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
2200
0
        srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
2201
0
    }
2202
0
}
2203
2204
#endif /* IMF_HAVE_SSE2 */
2205
2206
} // namespace
2207
2208
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
2209
2210
#endif