/src/opencv/3rdparty/openexr/IlmImf/ImfDwaCompressorSimd.h
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////////// |
2 | | // |
3 | | // Copyright (c) 2009-2014 DreamWorks Animation LLC. |
4 | | // |
5 | | // All rights reserved. |
6 | | // |
7 | | // Redistribution and use in source and binary forms, with or without |
8 | | // modification, are permitted provided that the following conditions are |
9 | | // met: |
10 | | // * Redistributions of source code must retain the above copyright |
11 | | // notice, this list of conditions and the following disclaimer. |
12 | | // * Redistributions in binary form must reproduce the above |
13 | | // copyright notice, this list of conditions and the following disclaimer |
14 | | // in the documentation and/or other materials provided with the |
15 | | // distribution. |
16 | | // * Neither the name of DreamWorks Animation nor the names of |
17 | | // its contributors may be used to endorse or promote products derived |
18 | | // from this software without specific prior written permission. |
19 | | // |
20 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
21 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
22 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
23 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
24 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
25 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
26 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
27 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
28 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
29 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
30 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
31 | | // |
32 | | /////////////////////////////////////////////////////////////////////////// |
33 | | |
34 | | #ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED |
35 | | #define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED |
36 | | |
37 | | // |
38 | | // Various SSE accelerated functions, used by Imf::DwaCompressor. |
39 | | // These have been split out into a separate .h file, as the fast |
40 | | // paths are done with template specialization. |
41 | | // |
42 | | // Unless otherwise noted, all pointers are assumed to be 32-byte |
43 | | // aligned. Unaligned pointers may risk seg-faulting. |
44 | | // |
45 | | |
46 | | #include "ImfNamespace.h" |
47 | | #include "ImfSimd.h" |
48 | | #include "ImfSystemSpecific.h" |
49 | | #include "OpenEXRConfig.h" |
50 | | |
51 | | #include <half.h> |
52 | | #include <assert.h> |
53 | | |
54 | | #include <algorithm> |
55 | | |
56 | | OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER |
57 | | |
58 | 0 | #define _SSE_ALIGNMENT 32 |
59 | 0 | #define _SSE_ALIGNMENT_MASK 0x0F |
60 | | #define _AVX_ALIGNMENT_MASK 0x1F |
61 | | |
62 | | // |
63 | | // Test if we should enable GCC inline asm paths for AVX |
64 | | // |
65 | | |
66 | | #ifdef OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX |
67 | | |
68 | | #define IMF_HAVE_GCC_INLINEASM |
69 | | |
70 | | #ifdef __LP64__ |
71 | | #define IMF_HAVE_GCC_INLINEASM_64 |
72 | | #endif /* __LP64__ */ |
73 | | |
74 | | #endif /* OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX */ |
75 | | |
76 | | // |
77 | | // A simple 64-element array, aligned properly for SIMD access. |
78 | | // |
79 | | |
80 | | template <class T> |
81 | | class SimdAlignedBuffer64 |
82 | | { |
83 | | public: |
84 | | |
85 | 0 | SimdAlignedBuffer64(): _buffer (0), _handle (0) |
86 | 0 | { |
87 | 0 | alloc(); |
88 | 0 | } Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<unsigned short>::SimdAlignedBuffer64() Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<float>::SimdAlignedBuffer64() |
89 | | |
90 | | SimdAlignedBuffer64(const SimdAlignedBuffer64 &rhs): _handle(0) |
91 | | { |
92 | | alloc(); |
93 | | memcpy (_buffer, rhs._buffer, 64 * sizeof (T)); |
94 | | } |
95 | | |
96 | | SimdAlignedBuffer64 &operator=(const SimdAlignedBuffer64 &rhs) |
97 | | { |
98 | | memcpy (_buffer, rhs._buffer, 64 * sizeof (T)); |
99 | | return *this; |
100 | | } |
101 | | |
102 | | #if __cplusplus >= 201103L |
103 | | SimdAlignedBuffer64(SimdAlignedBuffer64 &&rhs) noexcept |
104 | 0 | : _handle(rhs._handle), _buffer(rhs._buffer) |
105 | 0 | { |
106 | 0 | rhs._handle = nullptr; |
107 | 0 | rhs._buffer = nullptr; |
108 | 0 | } |
109 | | |
110 | | SimdAlignedBuffer64 &operator=(SimdAlignedBuffer64 &&rhs) noexcept |
111 | | { |
112 | | std::swap(_handle, rhs._handle); |
113 | | std::swap(_buffer, rhs._buffer); |
114 | | return *this; |
115 | | } |
116 | | #endif |
117 | | ~SimdAlignedBuffer64 () |
118 | 0 | { |
119 | 0 | if (_handle) |
120 | 0 | EXRFreeAligned (_handle); |
121 | 0 | _handle = 0; |
122 | 0 | _buffer = 0; |
123 | 0 | } Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<float>::~SimdAlignedBuffer64() Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<unsigned short>::~SimdAlignedBuffer64() |
124 | | |
125 | | void alloc() |
126 | 0 | { |
127 | | // |
128 | | // Try EXRAllocAligned first - but it might fallback to |
129 | | // unaligned allocs. If so, overalloc. |
130 | | // |
131 | |
|
132 | 0 | _handle = (char *) EXRAllocAligned |
133 | 0 | (64 * sizeof(T), _SSE_ALIGNMENT); |
134 | |
|
135 | 0 | if (((size_t)_handle & (_SSE_ALIGNMENT - 1)) == 0) |
136 | 0 | { |
137 | 0 | _buffer = (T *)_handle; |
138 | 0 | return; |
139 | 0 | } |
140 | | |
141 | 0 | EXRFreeAligned(_handle); |
142 | 0 | _handle = (char *) EXRAllocAligned |
143 | 0 | (64 * sizeof(T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT); |
144 | |
|
145 | 0 | char *aligned = _handle; |
146 | |
|
147 | 0 | while ((size_t)aligned & (_SSE_ALIGNMENT - 1)) |
148 | 0 | aligned++; |
149 | |
|
150 | 0 | _buffer = (T *)aligned; |
151 | 0 | } Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<unsigned short>::alloc() Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<float>::alloc() |
152 | | |
153 | | T *_buffer; |
154 | | |
155 | | private: |
156 | | |
157 | | char *_handle; |
158 | | }; |
159 | | |
160 | | typedef SimdAlignedBuffer64<float> SimdAlignedBuffer64f; |
161 | | typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us; |
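The buffer exposes its payload directly through the public _buffer member. A minimal usage sketch (not from the original header; the function name exampleFill is hypothetical) showing an allocation and the alignment guarantee the rest of this file relies on:

    void exampleFill ()
    {
        SimdAlignedBuffer64f buf;        // 64 floats, 32-byte aligned storage

        // alloc() guarantees at least _SSE_ALIGNMENT (32-byte) alignment,
        // so the AVX alignment mask must come out clean.
        assert (((size_t) buf._buffer & _AVX_ALIGNMENT_MASK) == 0);

        for (int i = 0; i < 64; ++i)
            buf._buffer[i] = 0.0f;
    }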
162 | | |
163 | | namespace { |
164 | | |
165 | | // |
166 | | // Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B' |
167 | | // |
168 | | |
169 | | void |
170 | | csc709Inverse (float &comp0, float &comp1, float &comp2) |
171 | 0 | { |
172 | 0 | float src[3]; |
173 | |
|
174 | 0 | src[0] = comp0; |
175 | 0 | src[1] = comp1; |
176 | 0 | src[2] = comp2; |
177 | |
|
178 | 0 | comp0 = src[0] + 1.5747f * src[2]; |
179 | 0 | comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2]; |
180 | 0 | comp2 = src[0] + 1.8556f * src[1]; |
181 | 0 | } |
182 | | |
183 | | #ifndef IMF_HAVE_SSE2 |
184 | | |
185 | | |
186 | | // |
187 | | // Scalar color space conversion, based on the 709 primary chromaticities. |
188 | | // No scaling or offsets, just the matrix |
189 | | // |
190 | | |
191 | | void |
192 | | csc709Inverse64 (float *comp0, float *comp1, float *comp2) |
193 | | { |
194 | | for (int i = 0; i < 64; ++i) |
195 | | csc709Inverse (comp0[i], comp1[i], comp2[i]); |
196 | | } |
197 | | |
198 | | #else /* IMF_HAVE_SSE2 */ |
199 | | |
200 | | // |
201 | | // SSE2 color space conversion |
202 | | // |
203 | | |
204 | | void |
205 | | csc709Inverse64 (float *comp0, float *comp1, float *comp2) |
206 | 0 | { |
207 | 0 | __m128 c0 = { 1.5747f, 1.5747f, 1.5747f, 1.5747f}; |
208 | 0 | __m128 c1 = { 1.8556f, 1.8556f, 1.8556f, 1.8556f}; |
209 | 0 | __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f}; |
210 | 0 | __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f}; |
211 | |
|
212 | 0 | __m128 *r = (__m128 *)comp0; |
213 | 0 | __m128 *g = (__m128 *)comp1; |
214 | 0 | __m128 *b = (__m128 *)comp2; |
215 | 0 | __m128 src[3]; |
216 | |
|
217 | 0 | #define CSC_INVERSE_709_SSE2_LOOP(i) \ |
218 | 0 | src[0] = r[i]; \ |
219 | 0 | src[1] = g[i]; \ |
220 | 0 | src[2] = b[i]; \ |
221 | 0 | \ |
222 | 0 | r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0)); \ |
223 | 0 | \ |
224 | 0 | g[i] = _mm_mul_ps (g[i], c2); \ |
225 | 0 | src[2] = _mm_mul_ps (src[2], c3); \ |
226 | 0 | g[i] = _mm_add_ps (g[i], src[0]); \ |
227 | 0 | g[i] = _mm_add_ps (g[i], src[2]); \ |
228 | 0 | \ |
229 | 0 | b[i] = _mm_mul_ps (c1, src[1]); \ |
230 | 0 | b[i] = _mm_add_ps (b[i], src[0]); |
231 | |
|
232 | 0 | CSC_INVERSE_709_SSE2_LOOP (0) |
233 | 0 | CSC_INVERSE_709_SSE2_LOOP (1) |
234 | 0 | CSC_INVERSE_709_SSE2_LOOP (2) |
235 | 0 | CSC_INVERSE_709_SSE2_LOOP (3) |
236 | |
|
237 | 0 | CSC_INVERSE_709_SSE2_LOOP (4) |
238 | 0 | CSC_INVERSE_709_SSE2_LOOP (5) |
239 | 0 | CSC_INVERSE_709_SSE2_LOOP (6) |
240 | 0 | CSC_INVERSE_709_SSE2_LOOP (7) |
241 | |
|
242 | 0 | CSC_INVERSE_709_SSE2_LOOP (8) |
243 | 0 | CSC_INVERSE_709_SSE2_LOOP (9) |
244 | 0 | CSC_INVERSE_709_SSE2_LOOP (10) |
245 | 0 | CSC_INVERSE_709_SSE2_LOOP (11) |
246 | |
|
247 | 0 | CSC_INVERSE_709_SSE2_LOOP (12) |
248 | 0 | CSC_INVERSE_709_SSE2_LOOP (13) |
249 | 0 | CSC_INVERSE_709_SSE2_LOOP (14) |
250 | 0 | CSC_INVERSE_709_SSE2_LOOP (15) |
251 | 0 | } |
252 | | |
253 | | #endif /* IMF_HAVE_SSE2 */ |
254 | | |
255 | | |
256 | | // |
257 | | // Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr |
258 | | // |
259 | | // Simple FPU color space conversion. Based on the 709 |
260 | | // primary chromaticities, with no scaling or offsets. |
261 | | // |
262 | | |
263 | | void |
264 | | csc709Forward64 (float *comp0, float *comp1, float *comp2) |
265 | 0 | { |
266 | 0 | float src[3]; |
267 | |
|
268 | 0 | for (int i = 0; i<64; ++i) |
269 | 0 | { |
270 | 0 | src[0] = comp0[i]; |
271 | 0 | src[1] = comp1[i]; |
272 | 0 | src[2] = comp2[i]; |
273 | |
|
274 | 0 | comp0[i] = 0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2]; |
275 | 0 | comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2]; |
276 | 0 | comp2[i] = 0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2]; |
277 | 0 | } |
278 | 0 | } |
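Since the forward and inverse matrices above are (approximate) inverses of each other, a forward pass followed by an inverse pass should reproduce the input to within rounding error. A small round-trip sketch, not part of the header; the buffer contents and the 1e-3 tolerance are arbitrary choices:

    static bool closeTo (float a, float b) { return (a - b) < 1e-3f && (b - a) < 1e-3f; }

    bool csc709RoundTripOk ()
    {
        SimdAlignedBuffer64f r, g, b;

        for (int i = 0; i < 64; ++i)
        {
            r._buffer[i] = 0.25f;
            g._buffer[i] = 0.50f;
            b._buffer[i] = 0.75f;
        }

        csc709Forward64 (r._buffer, g._buffer, b._buffer);   // R'G'B' -> Y'CbCr
        csc709Inverse64 (r._buffer, g._buffer, b._buffer);   // Y'CbCr -> R'G'B'

        for (int i = 0; i < 64; ++i)
            if (!closeTo (r._buffer[i], 0.25f) ||
                !closeTo (g._buffer[i], 0.50f) ||
                !closeTo (b._buffer[i], 0.75f))
                return false;

        return true;
    }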
279 | | |
280 | | |
281 | | // |
282 | | // Byte interleaving of 2 byte arrays: |
283 | | // src0 = AAAA |
284 | | // src1 = BBBB |
285 | | // dst = ABABABAB |
286 | | // |
287 | | // numBytes is the size of each of the source buffers |
288 | | // |
289 | | |
290 | | #ifndef IMF_HAVE_SSE2 |
291 | | |
292 | | // |
293 | | // Scalar default implementation |
294 | | // |
295 | | |
296 | | void |
297 | | interleaveByte2 (char *dst, char *src0, char *src1, int numBytes) |
298 | | { |
299 | | for (int x = 0; x < numBytes; ++x) |
300 | | { |
301 | | dst[2 * x] = src0[x]; |
302 | | dst[2 * x + 1] = src1[x]; |
303 | | } |
304 | | } |
305 | | |
306 | | #else /* IMF_HAVE_SSE2 */ |
307 | | |
308 | | // |
309 | | // SSE2 byte interleaving |
310 | | // |
311 | | |
312 | | void |
313 | | interleaveByte2 (char *dst, char *src0, char *src1, int numBytes) |
314 | 0 | { |
315 | 0 | int dstAlignment = (size_t)dst % 16; |
316 | 0 | int src0Alignment = (size_t)src0 % 16; |
317 | 0 | int src1Alignment = (size_t)src1 % 16; |
318 | |
|
319 | 0 | __m128i *dst_epi8 = (__m128i*)dst; |
320 | 0 | __m128i *src0_epi8 = (__m128i*)src0; |
321 | 0 | __m128i *src1_epi8 = (__m128i*)src1; |
322 | 0 | int sseWidth = numBytes / 16; |
323 | |
|
324 | 0 | if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment)) |
325 | 0 | { |
326 | 0 | __m128i tmp0, tmp1; |
327 | | |
328 | | // |
329 | | // Aligned loads and stores |
330 | | // |
331 | |
|
332 | 0 | for (int x = 0; x < sseWidth; ++x) |
333 | 0 | { |
334 | 0 | tmp0 = src0_epi8[x]; |
335 | 0 | tmp1 = src1_epi8[x]; |
336 | |
|
337 | 0 | _mm_stream_si128 (&dst_epi8[2 * x], |
338 | 0 | _mm_unpacklo_epi8 (tmp0, tmp1)); |
339 | |
|
340 | 0 | _mm_stream_si128 (&dst_epi8[2 * x + 1], |
341 | 0 | _mm_unpackhi_epi8 (tmp0, tmp1)); |
342 | 0 | } |
343 | | |
344 | | // |
345 | | // Then run the leftovers one at a time |
346 | | // |
347 | |
|
348 | 0 | for (int x = 16 * sseWidth; x < numBytes; ++x) |
349 | 0 | { |
350 | 0 | dst[2 * x] = src0[x]; |
351 | 0 | dst[2 * x + 1] = src1[x]; |
352 | 0 | } |
353 | 0 | } |
354 | 0 | else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8)) |
355 | 0 | { |
356 | | // |
357 | | // Aligned stores, but catch up a few values so we can |
358 | | // use aligned loads |
359 | | // |
360 | | |
361 | 0 | for (int x = 0; x < std::min (numBytes, 8); ++x) |
362 | 0 | { |
363 | 0 | dst[2 * x] = src0[x]; |
364 | 0 | dst[2 * x + 1] = src1[x]; |
365 | 0 | } |
366 | |
|
367 | 0 | if (numBytes > 8) |
368 | 0 | { |
369 | 0 | dst_epi8 = (__m128i*)&dst[16]; |
370 | 0 | src0_epi8 = (__m128i*)&src0[8]; |
371 | 0 | src1_epi8 = (__m128i*)&src1[8]; |
372 | 0 | sseWidth = (numBytes - 8) / 16; |
373 | |
|
374 | 0 | for (int x=0; x<sseWidth; ++x) |
375 | 0 | { |
376 | 0 | _mm_stream_si128 (&dst_epi8[2 * x], |
377 | 0 | _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x])); |
378 | |
|
379 | 0 | _mm_stream_si128 (&dst_epi8[2 * x + 1], |
380 | 0 | _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x])); |
381 | 0 | } |
382 | | |
383 | | // |
384 | | // Then run the leftovers one at a time |
385 | | // |
386 | |
|
387 | 0 | for (int x = 16 * sseWidth + 8; x < numBytes; ++x) |
388 | 0 | { |
389 | 0 | dst[2 * x] = src0[x]; |
390 | 0 | dst[2 * x + 1] = src1[x]; |
391 | 0 | } |
392 | 0 | } |
393 | 0 | } |
394 | 0 | else |
395 | 0 | { |
396 | | // |
397 | | // Unaligned everything |
398 | | // |
399 | |
|
400 | 0 | for (int x = 0; x < sseWidth; ++x) |
401 | 0 | { |
402 | 0 | __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]); |
403 | 0 | __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]); |
404 | |
|
405 | 0 | _mm_storeu_si128 (&dst_epi8[2 * x], |
406 | 0 | _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8)); |
407 | |
|
408 | 0 | _mm_storeu_si128 (&dst_epi8[2 * x + 1], |
409 | 0 | _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8)); |
410 | 0 | } |
411 | | |
412 | | // |
413 | | // Then run the leftovers one at a time |
414 | | // |
415 | |
|
416 | 0 | for (int x = 16 * sseWidth; x < numBytes; ++x) |
417 | 0 | { |
418 | 0 | dst[2 * x] = src0[x]; |
419 | 0 | dst[2 * x + 1] = src1[x]; |
420 | 0 | } |
421 | 0 | } |
422 | 0 | } |
423 | | |
424 | | #endif /* IMF_HAVE_SSE2 */ |
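A tiny usage sketch of the interleaver (not from the original header): with numBytes smaller than one SSE register, all of the SIMD branches fall through to the scalar leftover loop, so unaligned stack buffers are fine here.

    void interleaveExample ()
    {
        char src0[4] = { 'A', 'A', 'A', 'A' };
        char src1[4] = { 'B', 'B', 'B', 'B' };
        char dst[8];

        interleaveByte2 (dst, src0, src1, 4);

        // dst now holds the byte pattern A B A B A B A B.
        assert (dst[0] == 'A' && dst[1] == 'B' && dst[7] == 'B');
    }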
425 | | |
426 | | |
427 | | // |
428 | | // Float -> half float conversion |
429 | | // |
430 | | // To enable F16C based conversion, we can't rely on compile-time |
431 | | // detection, hence the multiple versions defined here. Pick one based |
432 | | // on runtime cpuid detection. |
433 | | // |
434 | | |
435 | | // |
436 | | // Default boring conversion |
437 | | // |
438 | | |
439 | | void |
440 | | convertFloatToHalf64_scalar (unsigned short *dst, float *src) |
441 | 0 | { |
442 | 0 | for (int i=0; i<64; ++i) |
443 | 0 | dst[i] = ((half)src[i]).bits(); |
444 | 0 | } |
445 | | |
446 | | |
447 | | // |
448 | | // F16C conversion - Assumes aligned src and dst |
449 | | // |
450 | | |
451 | | void |
452 | | convertFloatToHalf64_f16c (unsigned short *dst, float *src) |
453 | 0 | { |
454 | | // |
455 | | // Ordinarily, I'd avoid using inline asm and prefer intrinsics. |
456 | | // However, in order to get the intrinsics, we need to tell |
457 | | // the compiler to generate VEX instructions. |
458 | | // |
459 | | // (On the GCC side, -mf16c goes ahead and activates -mavx, |
460 | | // resulting in VEX code. Without -mf16c, no intrinsics.) |
461 | | // |
462 | | // Now, it's quite likely that we'll find ourselves in situations |
463 | | // where we want to build *without* VEX, in order to maintain |
464 | | // maximum compatibility. But to get there with intrinsics, |
465 | | // we'd need to break out code into a separate file. Bleh. |
466 | | // I'll take the asm. |
467 | | // |
468 | |
|
469 | | #if defined IMF_HAVE_GCC_INLINEASM |
470 | | __asm__ |
471 | | ("vmovaps (%0), %%ymm0 \n" |
472 | | "vmovaps 0x20(%0), %%ymm1 \n" |
473 | | "vmovaps 0x40(%0), %%ymm2 \n" |
474 | | "vmovaps 0x60(%0), %%ymm3 \n" |
475 | | "vcvtps2ph $0, %%ymm0, %%xmm0 \n" |
476 | | "vcvtps2ph $0, %%ymm1, %%xmm1 \n" |
477 | | "vcvtps2ph $0, %%ymm2, %%xmm2 \n" |
478 | | "vcvtps2ph $0, %%ymm3, %%xmm3 \n" |
479 | | "vmovdqa %%xmm0, 0x00(%1) \n" |
480 | | "vmovdqa %%xmm1, 0x10(%1) \n" |
481 | | "vmovdqa %%xmm2, 0x20(%1) \n" |
482 | | "vmovdqa %%xmm3, 0x30(%1) \n" |
483 | | "vmovaps 0x80(%0), %%ymm0 \n" |
484 | | "vmovaps 0xa0(%0), %%ymm1 \n" |
485 | | "vmovaps 0xc0(%0), %%ymm2 \n" |
486 | | "vmovaps 0xe0(%0), %%ymm3 \n" |
487 | | "vcvtps2ph $0, %%ymm0, %%xmm0 \n" |
488 | | "vcvtps2ph $0, %%ymm1, %%xmm1 \n" |
489 | | "vcvtps2ph $0, %%ymm2, %%xmm2 \n" |
490 | | "vcvtps2ph $0, %%ymm3, %%xmm3 \n" |
491 | | "vmovdqa %%xmm0, 0x40(%1) \n" |
492 | | "vmovdqa %%xmm1, 0x50(%1) \n" |
493 | | "vmovdqa %%xmm2, 0x60(%1) \n" |
494 | | "vmovdqa %%xmm3, 0x70(%1) \n" |
495 | | #ifndef __AVX__ |
496 | | "vzeroupper \n" |
497 | | #endif /* __AVX__ */ |
498 | | : /* Output */ |
499 | | : /* Input */ "r"(src), "r"(dst) |
500 | | #ifndef __AVX__ |
501 | | : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory" |
502 | | #else |
503 | | : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory" |
504 | | #endif /* __AVX__ */ |
505 | | ); |
506 | | #else |
507 | 0 | convertFloatToHalf64_scalar (dst, src); |
508 | 0 | #endif /* IMF_HAVE_GCC_INLINEASM */ |
509 | 0 | } |
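The selection between the scalar and F16C variants happens at runtime in the DwaCompressor itself; a hedged sketch of that dispatch idea (not the actual caller; hasF16c is a hypothetical stand-in for whatever cpuid query the build provides, e.g. via ImfSystemSpecific.h):

    typedef void (*FloatToHalf64Func) (unsigned short *dst, float *src);

    FloatToHalf64Func
    pickFloatToHalf64 (bool hasF16c)
    {
        // Prefer the F16C/AVX path when the CPU reports support at runtime,
        // otherwise fall back to the plain half-library conversion.
        return hasF16c ? convertFloatToHalf64_f16c
                       : convertFloatToHalf64_scalar;
    }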
510 | | |
511 | | |
512 | | // |
513 | | // Convert an 8x8 block of HALF from zig-zag order to |
514 | | // FLOAT in normal order. The order we want is: |
515 | | // |
516 | | // src dst |
517 | | // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28 |
518 | | // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42 |
519 | | // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43 |
520 | | // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53 |
521 | | // 32 33 34 35 36 37 38 39 10 19 23 32 39 45 52 54 |
522 | | // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60 |
523 | | // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61 |
524 | | // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63 |
525 | | // |
526 | | |
527 | | void |
528 | | fromHalfZigZag_scalar (unsigned short *src, float *dst) |
529 | 0 | { |
530 | 0 | half *srcHalf = (half *)src; |
531 | |
|
532 | 0 | dst[0] = (float)srcHalf[0]; |
533 | 0 | dst[1] = (float)srcHalf[1]; |
534 | 0 | dst[2] = (float)srcHalf[5]; |
535 | 0 | dst[3] = (float)srcHalf[6]; |
536 | 0 | dst[4] = (float)srcHalf[14]; |
537 | 0 | dst[5] = (float)srcHalf[15]; |
538 | 0 | dst[6] = (float)srcHalf[27]; |
539 | 0 | dst[7] = (float)srcHalf[28]; |
540 | 0 | dst[8] = (float)srcHalf[2]; |
541 | 0 | dst[9] = (float)srcHalf[4]; |
542 | |
|
543 | 0 | dst[10] = (float)srcHalf[7]; |
544 | 0 | dst[11] = (float)srcHalf[13]; |
545 | 0 | dst[12] = (float)srcHalf[16]; |
546 | 0 | dst[13] = (float)srcHalf[26]; |
547 | 0 | dst[14] = (float)srcHalf[29]; |
548 | 0 | dst[15] = (float)srcHalf[42]; |
549 | 0 | dst[16] = (float)srcHalf[3]; |
550 | 0 | dst[17] = (float)srcHalf[8]; |
551 | 0 | dst[18] = (float)srcHalf[12]; |
552 | 0 | dst[19] = (float)srcHalf[17]; |
553 | |
|
554 | 0 | dst[20] = (float)srcHalf[25]; |
555 | 0 | dst[21] = (float)srcHalf[30]; |
556 | 0 | dst[22] = (float)srcHalf[41]; |
557 | 0 | dst[23] = (float)srcHalf[43]; |
558 | 0 | dst[24] = (float)srcHalf[9]; |
559 | 0 | dst[25] = (float)srcHalf[11]; |
560 | 0 | dst[26] = (float)srcHalf[18]; |
561 | 0 | dst[27] = (float)srcHalf[24]; |
562 | 0 | dst[28] = (float)srcHalf[31]; |
563 | 0 | dst[29] = (float)srcHalf[40]; |
564 | |
|
565 | 0 | dst[30] = (float)srcHalf[44]; |
566 | 0 | dst[31] = (float)srcHalf[53]; |
567 | 0 | dst[32] = (float)srcHalf[10]; |
568 | 0 | dst[33] = (float)srcHalf[19]; |
569 | 0 | dst[34] = (float)srcHalf[23]; |
570 | 0 | dst[35] = (float)srcHalf[32]; |
571 | 0 | dst[36] = (float)srcHalf[39]; |
572 | 0 | dst[37] = (float)srcHalf[45]; |
573 | 0 | dst[38] = (float)srcHalf[52]; |
574 | 0 | dst[39] = (float)srcHalf[54]; |
575 | |
|
576 | 0 | dst[40] = (float)srcHalf[20]; |
577 | 0 | dst[41] = (float)srcHalf[22]; |
578 | 0 | dst[42] = (float)srcHalf[33]; |
579 | 0 | dst[43] = (float)srcHalf[38]; |
580 | 0 | dst[44] = (float)srcHalf[46]; |
581 | 0 | dst[45] = (float)srcHalf[51]; |
582 | 0 | dst[46] = (float)srcHalf[55]; |
583 | 0 | dst[47] = (float)srcHalf[60]; |
584 | 0 | dst[48] = (float)srcHalf[21]; |
585 | 0 | dst[49] = (float)srcHalf[34]; |
586 | |
|
587 | 0 | dst[50] = (float)srcHalf[37]; |
588 | 0 | dst[51] = (float)srcHalf[47]; |
589 | 0 | dst[52] = (float)srcHalf[50]; |
590 | 0 | dst[53] = (float)srcHalf[56]; |
591 | 0 | dst[54] = (float)srcHalf[59]; |
592 | 0 | dst[55] = (float)srcHalf[61]; |
593 | 0 | dst[56] = (float)srcHalf[35]; |
594 | 0 | dst[57] = (float)srcHalf[36]; |
595 | 0 | dst[58] = (float)srcHalf[48]; |
596 | 0 | dst[59] = (float)srcHalf[49]; |
597 | |
|
598 | 0 | dst[60] = (float)srcHalf[57]; |
599 | 0 | dst[61] = (float)srcHalf[58]; |
600 | 0 | dst[62] = (float)srcHalf[62]; |
601 | 0 | dst[63] = (float)srcHalf[63]; |
602 | 0 | } |
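The hard-coded indices above are just the classic 8x8 zig-zag scan written out flat. A generator sketch (not part of the header) that rebuilds the same table, which can be used to cross-check the unrolled assignments:

    void buildZigZagScanPositions (int scanPos[64])
    {
        // scanPos[r*8 + c] = position in the zig-zag scan at which the
        // natural-order element (r, c) appears; this reproduces the 'dst'
        // table in the comment above, e.g. scanPos[6] == 27.
        int idx = 0;

        for (int d = 0; d < 15; ++d)                       // 15 anti-diagonals
        {
            if (d & 1)                                     // odd: top-right -> bottom-left
            {
                for (int r = std::max (0, d - 7); r <= std::min (7, d); ++r)
                    scanPos[r * 8 + (d - r)] = idx++;
            }
            else                                           // even: bottom-left -> top-right
            {
                for (int c = std::max (0, d - 7); c <= std::min (7, d); ++c)
                    scanPos[(d - c) * 8 + c] = idx++;
            }
        }
    }

With that table, fromHalfZigZag_scalar is equivalent to dst[i] = (float) ((half *) src)[scanPos[i]] for i in [0, 63].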
603 | | |
604 | | |
605 | | // |
606 | | // If we can form the correct ordering in xmm registers, |
607 | | // we can use F16C to convert from HALF -> FLOAT. However, |
608 | | // making the correct order isn't trivial. |
609 | | // |
610 | | // We want to re-order a source 8x8 matrix from: |
611 | | // |
612 | | // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28 |
613 | | // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42 |
614 | | // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43 |
615 | | // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53 (A) |
616 | | // 32 33 34 35 36 37 38 39 --> 10 19 23 32 39 45 52 54 |
617 | | // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60 |
618 | | // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61 |
619 | | // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63 |
620 | | // |
621 | | // Which looks like a mess, right? |
622 | | // |
623 | | // Now, check out the NE/SW diagonals of (A). Along those lines, |
624 | | // we have runs of contiguous values! If we rewrite (A) a bit, we get: |
625 | | // |
626 | | // 0 |
627 | | // 1 2 |
628 | | // 5 4 3 |
629 | | // 6 7 8 9 |
630 | | // 14 13 12 11 10 |
631 | | // 15 16 17 18 19 20 |
632 | | // 27 26 25 24 23 22 21 (B) |
633 | | // 28 29 30 31 32 33 34 35 |
634 | | // 42 41 40 39 38 37 36 |
635 | | // 43 44 45 46 47 48 |
636 | | // 53 52 51 50 49 |
637 | | // 54 55 56 57 |
638 | | // 60 59 58 |
639 | | // 61 62 |
640 | | // 63 |
641 | | // |
642 | | // In this ordering, the columns are the rows of (A). If we can 'transpose' |
643 | | // (B), we'll achieve our goal. But we want this to fit nicely into |
644 | | // xmm registers and still be able to load large runs efficiently. |
645 | | // Also, notice that the odd rows are in ascending order, while |
646 | | // the even rows are in descending order. |
647 | | // |
648 | | // If we 'fold' the bottom half up into the top, we can preserve ordered |
649 | | // runs across rows, and still keep all the correct values in columns. |
650 | | // After transposing, we'll need to rotate things back into place. |
651 | | // This gives us: |
652 | | // |
653 | | // 0 | 42 41 40 39 38 37 36 |
654 | | // 1 2 | 43 44 45 46 47 48 |
655 | | // 5 4 3 | 53 52 51 50 49 |
656 | | // 6 7 8 9 | 54 55 56 57 (C) |
657 | | // 14 13 12 11 10 | 60 59 58 |
658 | | // 15 16 17 18 19 20 | 61 62 |
659 | | // 27 26 25 24 23 22 21 | 63 |
660 | | // 28 29 30 31 32 33 34 35 |
661 | | // |
662 | | // But hang on. We still have the backwards descending rows to deal with. |
663 | | // Lets reverse the even rows so that all values are in ascending order |
664 | | // |
665 | | // 36 37 38 39 40 41 42 | 0 |
666 | | // 1 2 | 43 44 45 46 47 48 |
667 | | // 49 50 51 52 53 | 3 4 5 |
668 | | // 6 7 8 9 | 54 55 56 57 (D) |
669 | | // 58 59 60 | 10 11 12 13 14 |
670 | | // 15 16 17 18 19 20 | 61 62 |
671 | | // 63 | 21 22 23 24 25 26 27 |
672 | | // 28 29 30 31 32 33 34 35 |
673 | | // |
674 | | // If we can form (D), we will then: |
675 | | // 1) Reverse the even rows |
676 | | // 2) Transpose |
677 | | // 3) Rotate the rows |
678 | | // |
679 | | // and we'll have (A). |
680 | | // |
681 | | |
682 | | void |
683 | | fromHalfZigZag_f16c (unsigned short *src, float *dst) |
684 | 0 | { |
685 | | #if defined IMF_HAVE_GCC_INLINEASM_64 |
686 | | __asm__ |
687 | | |
688 | | /* x3 <- 0 |
689 | | * x8 <- [ 0- 7] |
690 | | * x6 <- [56-63] |
691 | | * x9 <- [21-28] |
692 | | * x7 <- [28-35] |
693 | | * x3 <- [ 6- 9] (lower half) */ |
694 | | |
695 | | ("vpxor %%xmm3, %%xmm3, %%xmm3 \n" |
696 | | "vmovdqa (%0), %%xmm8 \n" |
697 | | "vmovdqa 112(%0), %%xmm6 \n" |
698 | | "vmovdqu 42(%0), %%xmm9 \n" |
699 | | "vmovdqu 56(%0), %%xmm7 \n" |
700 | | "vmovq 12(%0), %%xmm3 \n" |
701 | | |
702 | | /* Setup rows 0-2 of A in xmm0-xmm2 |
703 | | * x1 <- x8 >> 16 (1 value) |
704 | | * x2 <- x8 << 32 (2 values) |
705 | | * x0 <- alignr([35-42], x8, 2) |
706 | | * x1 <- blend(x1, [41-48]) |
707 | | * x2 <- blend(x2, [49-56]) */ |
708 | | |
709 | | "vpsrldq $2, %%xmm8, %%xmm1 \n" |
710 | | "vpslldq $4, %%xmm8, %%xmm2 \n" |
711 | | "vpalignr $2, 70(%0), %%xmm8, %%xmm0 \n" |
712 | | "vpblendw $0xfc, 82(%0), %%xmm1, %%xmm1 \n" |
713 | | "vpblendw $0x1f, 98(%0), %%xmm2, %%xmm2 \n" |
714 | | |
715 | | /* Setup rows 4-6 of A in xmm4-xmm6 |
716 | | * x4 <- x6 >> 32 (2 values) |
717 | | * x5 <- x6 << 16 (1 value) |
718 | | * x6 <- alignr(x6,x9,14) |
719 | | * x4 <- blend(x4, [ 7-14]) |
720 | | * x5 <- blend(x5, [15-22]) */ |
721 | | |
722 | | "vpsrldq $4, %%xmm6, %%xmm4 \n" |
723 | | "vpslldq $2, %%xmm6, %%xmm5 \n" |
724 | | "vpalignr $14, %%xmm6, %%xmm9, %%xmm6 \n" |
725 | | "vpblendw $0xf8, 14(%0), %%xmm4, %%xmm4 \n" |
726 | | "vpblendw $0x3f, 30(%0), %%xmm5, %%xmm5 \n" |
727 | | |
728 | | /* Load the upper half of row 3 into xmm3 |
729 | | * x3 <- [54-57] (upper half) */ |
730 | | |
731 | | "vpinsrq $1, 108(%0), %%xmm3, %%xmm3\n" |
732 | | |
733 | | /* Reverse the even rows. We're not using PSHUFB as |
734 | | * that requires loading an extra constant all the time, |
735 | | * and we're already pretty memory bound. |
736 | | */ |
737 | | |
738 | | "vpshuflw $0x1b, %%xmm0, %%xmm0 \n" |
739 | | "vpshuflw $0x1b, %%xmm2, %%xmm2 \n" |
740 | | "vpshuflw $0x1b, %%xmm4, %%xmm4 \n" |
741 | | "vpshuflw $0x1b, %%xmm6, %%xmm6 \n" |
742 | | |
743 | | "vpshufhw $0x1b, %%xmm0, %%xmm0 \n" |
744 | | "vpshufhw $0x1b, %%xmm2, %%xmm2 \n" |
745 | | "vpshufhw $0x1b, %%xmm4, %%xmm4 \n" |
746 | | "vpshufhw $0x1b, %%xmm6, %%xmm6 \n" |
747 | | |
748 | | "vpshufd $0x4e, %%xmm0, %%xmm0 \n" |
749 | | "vpshufd $0x4e, %%xmm2, %%xmm2 \n" |
750 | | "vpshufd $0x4e, %%xmm4, %%xmm4 \n" |
751 | | "vpshufd $0x4e, %%xmm6, %%xmm6 \n" |
752 | | |
753 | | /* Transpose xmm0-xmm7 into xmm8-xmm15 */ |
754 | | |
755 | | "vpunpcklwd %%xmm1, %%xmm0, %%xmm8 \n" |
756 | | "vpunpcklwd %%xmm3, %%xmm2, %%xmm9 \n" |
757 | | "vpunpcklwd %%xmm5, %%xmm4, %%xmm10 \n" |
758 | | "vpunpcklwd %%xmm7, %%xmm6, %%xmm11 \n" |
759 | | "vpunpckhwd %%xmm1, %%xmm0, %%xmm12 \n" |
760 | | "vpunpckhwd %%xmm3, %%xmm2, %%xmm13 \n" |
761 | | "vpunpckhwd %%xmm5, %%xmm4, %%xmm14 \n" |
762 | | "vpunpckhwd %%xmm7, %%xmm6, %%xmm15 \n" |
763 | | |
764 | | "vpunpckldq %%xmm9, %%xmm8, %%xmm0 \n" |
765 | | "vpunpckldq %%xmm11, %%xmm10, %%xmm1 \n" |
766 | | "vpunpckhdq %%xmm9, %%xmm8, %%xmm2 \n" |
767 | | "vpunpckhdq %%xmm11, %%xmm10, %%xmm3 \n" |
768 | | "vpunpckldq %%xmm13, %%xmm12, %%xmm4 \n" |
769 | | "vpunpckldq %%xmm15, %%xmm14, %%xmm5 \n" |
770 | | "vpunpckhdq %%xmm13, %%xmm12, %%xmm6 \n" |
771 | | "vpunpckhdq %%xmm15, %%xmm14, %%xmm7 \n" |
772 | | |
773 | | "vpunpcklqdq %%xmm1, %%xmm0, %%xmm8 \n" |
774 | | "vpunpckhqdq %%xmm1, %%xmm0, %%xmm9 \n" |
775 | | "vpunpcklqdq %%xmm3, %%xmm2, %%xmm10 \n" |
776 | | "vpunpckhqdq %%xmm3, %%xmm2, %%xmm11 \n" |
777 | | "vpunpcklqdq %%xmm4, %%xmm5, %%xmm12 \n" |
778 | | "vpunpckhqdq %%xmm5, %%xmm4, %%xmm13 \n" |
779 | | "vpunpcklqdq %%xmm7, %%xmm6, %%xmm14 \n" |
780 | | "vpunpckhqdq %%xmm7, %%xmm6, %%xmm15 \n" |
781 | | |
782 | | /* Rotate the rows to get the correct final order. |
783 | | * Rotating xmm12 isn't needed, as we can handle |
784 | | * the rotation in the PUNPCKLQDQ above. Rotating |
785 | | * xmm8 isn't needed as it's already in the right order |
786 | | */ |
787 | | |
788 | | "vpalignr $2, %%xmm9, %%xmm9, %%xmm9 \n" |
789 | | "vpalignr $4, %%xmm10, %%xmm10, %%xmm10 \n" |
790 | | "vpalignr $6, %%xmm11, %%xmm11, %%xmm11 \n" |
791 | | "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n" |
792 | | "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n" |
793 | | "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n" |
794 | | |
795 | | /* Convert from half -> float */ |
796 | | |
797 | | "vcvtph2ps %%xmm8, %%ymm8 \n" |
798 | | "vcvtph2ps %%xmm9, %%ymm9 \n" |
799 | | "vcvtph2ps %%xmm10, %%ymm10 \n" |
800 | | "vcvtph2ps %%xmm11, %%ymm11 \n" |
801 | | "vcvtph2ps %%xmm12, %%ymm12 \n" |
802 | | "vcvtph2ps %%xmm13, %%ymm13 \n" |
803 | | "vcvtph2ps %%xmm14, %%ymm14 \n" |
804 | | "vcvtph2ps %%xmm15, %%ymm15 \n" |
805 | | |
806 | | /* Move float values to dst */ |
807 | | |
808 | | "vmovaps %%ymm8, (%1) \n" |
809 | | "vmovaps %%ymm9, 32(%1) \n" |
810 | | "vmovaps %%ymm10, 64(%1) \n" |
811 | | "vmovaps %%ymm11, 96(%1) \n" |
812 | | "vmovaps %%ymm12, 128(%1) \n" |
813 | | "vmovaps %%ymm13, 160(%1) \n" |
814 | | "vmovaps %%ymm14, 192(%1) \n" |
815 | | "vmovaps %%ymm15, 224(%1) \n" |
816 | | #ifndef __AVX__ |
817 | | "vzeroupper \n" |
818 | | #endif /* __AVX__ */ |
819 | | : /* Output */ |
820 | | : /* Input */ "r"(src), "r"(dst) |
821 | | : /* Clobber */ "memory", |
822 | | #ifndef __AVX__ |
823 | | "%xmm0", "%xmm1", "%xmm2", "%xmm3", |
824 | | "%xmm4", "%xmm5", "%xmm6", "%xmm7", |
825 | | "%xmm8", "%xmm9", "%xmm10", "%xmm11", |
826 | | "%xmm12", "%xmm13", "%xmm14", "%xmm15" |
827 | | #else |
828 | | "%ymm0", "%ymm1", "%ymm2", "%ymm3", |
829 | | "%ymm4", "%ymm5", "%ymm6", "%ymm7", |
830 | | "%ymm8", "%ymm9", "%ymm10", "%ymm11", |
831 | | "%ymm12", "%ymm13", "%ymm14", "%ymm15" |
832 | | #endif /* __AVX__ */ |
833 | | ); |
834 | | |
835 | | #else |
836 | 0 | fromHalfZigZag_scalar(src, dst); |
837 | 0 | #endif /* defined IMF_HAVE_GCC_INLINEASM_64 */ |
838 | 0 | } |
839 | | |
840 | | |
841 | | // |
842 | | // Inverse 8x8 DCT, only inverting the DC. This assumes that |
843 | | // all AC frequencies are 0. |
844 | | // |
845 | | |
846 | | #ifndef IMF_HAVE_SSE2 |
847 | | |
848 | | void |
849 | | dctInverse8x8DcOnly (float *data) |
850 | | { |
851 | | float val = data[0] * 3.535536e-01f * 3.535536e-01f; |
852 | | |
853 | | for (int i = 0; i < 64; ++i) |
854 | | data[i] = val; |
855 | | } |
856 | | |
857 | | #else /* IMF_HAVE_SSE2 */ |
858 | | |
859 | | void |
860 | | dctInverse8x8DcOnly (float *data) |
861 | 0 | { |
862 | 0 | __m128 src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f); |
863 | 0 | __m128 *dst = (__m128 *)data; |
864 | |
|
865 | 0 | for (int i = 0; i < 16; ++i) |
866 | 0 | dst[i] = src; |
867 | 0 | } |
868 | | |
869 | | #endif /* IMF_HAVE_SSE2 */ |
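The factor here follows from running the full iDCT below with every AC term zero: the row pass leaves a * data[0] across row 0 (and zeros elsewhere), and the column pass multiplies by a once more, so every output is data[0] * a * a with a = 0.5 * cos(pi/4), i.e. the 3.535536e-01 literal. A tiny sketch of that constant (not part of the header; like the scalar iDCT below, it assumes cosf is available in this translation unit):

    static inline float
    dcOnlyFactor ()
    {
        const float a = .5f * cosf (3.14159f / 4.0f);   // ~3.535536e-01
        return a * a;                                   // ~0.125, the DC-only gain
    }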
870 | | |
871 | | |
872 | | // |
873 | | // Full 8x8 Inverse DCT: |
874 | | // |
875 | | // Simple inverse DCT on an 8x8 block, with scalar ops only. |
876 | | // Operates on data in-place. |
877 | | // |
878 | | // This is based on the iDCT formulation (y = frequency domain, |
879 | | // x = spatial domain) |
880 | | // |
881 | | // [x0] [ ][y0] [ ][y1] |
882 | | // [x1] = [ M1 ][y2] + [ M2 ][y3] |
883 | | // [x2] [ ][y4] [ ][y5] |
884 | | // [x3] [ ][y6] [ ][y7] |
885 | | // |
886 | | // [x7] [ ][y0] [ ][y1] |
887 | | // [x6] = [ M1 ][y2] - [ M2 ][y3] |
888 | | // [x5] [ ][y4] [ ][y5] |
889 | | // [x4] [ ][y6] [ ][y7] |
890 | | // |
891 | | // where M1: M2: |
892 | | // |
893 | | // [a c a f] [b d e g] |
894 | | // [a f -a -c] [d -g -b -e] |
895 | | // [a -f -a c] [e -b g d] |
896 | | // [a -c a -f] [g -e d -b] |
897 | | // |
898 | | // and the constants are as defined below. |
899 | | // |
900 | | // If you know how many of the lower rows are zero, that can |
901 | | // be passed in to help speed things up. If you don't know, |
902 | | // just set zeroedRows=0. |
903 | | // |
904 | | |
905 | | // |
906 | | // Default implementation |
907 | | // |
908 | | |
909 | | template <int zeroedRows> |
910 | | void |
911 | | dctInverse8x8_scalar (float *data) |
912 | 0 | { |
913 | 0 | const float a = .5f * cosf (3.14159f / 4.0f); |
914 | 0 | const float b = .5f * cosf (3.14159f / 16.0f); |
915 | 0 | const float c = .5f * cosf (3.14159f / 8.0f); |
916 | 0 | const float d = .5f * cosf (3.f*3.14159f / 16.0f); |
917 | 0 | const float e = .5f * cosf (5.f*3.14159f / 16.0f); |
918 | 0 | const float f = .5f * cosf (3.f*3.14159f / 8.0f); |
919 | 0 | const float g = .5f * cosf (7.f*3.14159f / 16.0f); |
920 | |
|
921 | 0 | float alpha[4], beta[4], theta[4], gamma[4]; |
922 | |
|
923 | 0 | float *rowPtr = NULL; |
924 | | |
925 | | // |
926 | | // First pass - row wise. |
927 | | // |
928 | | // This looks less-compact than the description above in |
929 | | // an attempt to fold together common sub-expressions. |
930 | | // |
931 | |
|
932 | 0 | for (int row = 0; row < 8 - zeroedRows; ++row) |
933 | 0 | { |
934 | 0 | rowPtr = data + row * 8; |
935 | |
|
936 | 0 | alpha[0] = c * rowPtr[2]; |
937 | 0 | alpha[1] = f * rowPtr[2]; |
938 | 0 | alpha[2] = c * rowPtr[6]; |
939 | 0 | alpha[3] = f * rowPtr[6]; |
940 | |
|
941 | 0 | beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7]; |
942 | 0 | beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7]; |
943 | 0 | beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7]; |
944 | 0 | beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7]; |
945 | |
|
946 | 0 | theta[0] = a * (rowPtr[0] + rowPtr[4]); |
947 | 0 | theta[3] = a * (rowPtr[0] - rowPtr[4]); |
948 | |
|
949 | 0 | theta[1] = alpha[0] + alpha[3]; |
950 | 0 | theta[2] = alpha[1] - alpha[2]; |
951 | | |
952 | |
|
953 | 0 | gamma[0] = theta[0] + theta[1]; |
954 | 0 | gamma[1] = theta[3] + theta[2]; |
955 | 0 | gamma[2] = theta[3] - theta[2]; |
956 | 0 | gamma[3] = theta[0] - theta[1]; |
957 | | |
958 | |
|
959 | 0 | rowPtr[0] = gamma[0] + beta[0]; |
960 | 0 | rowPtr[1] = gamma[1] + beta[1]; |
961 | 0 | rowPtr[2] = gamma[2] + beta[2]; |
962 | 0 | rowPtr[3] = gamma[3] + beta[3]; |
963 | |
|
964 | 0 | rowPtr[4] = gamma[3] - beta[3]; |
965 | 0 | rowPtr[5] = gamma[2] - beta[2]; |
966 | 0 | rowPtr[6] = gamma[1] - beta[1]; |
967 | 0 | rowPtr[7] = gamma[0] - beta[0]; |
968 | 0 | } |
969 | | |
970 | | // |
971 | | // Second pass - column wise. |
972 | | // |
973 | |
|
974 | 0 | for (int column = 0; column < 8; ++column) |
975 | 0 | { |
976 | 0 | alpha[0] = c * data[16+column]; |
977 | 0 | alpha[1] = f * data[16+column]; |
978 | 0 | alpha[2] = c * data[48+column]; |
979 | 0 | alpha[3] = f * data[48+column]; |
980 | |
|
981 | 0 | beta[0] = b * data[8+column] + d * data[24+column] + |
982 | 0 | e * data[40+column] + g * data[56+column]; |
983 | |
|
984 | 0 | beta[1] = d * data[8+column] - g * data[24+column] - |
985 | 0 | b * data[40+column] - e * data[56+column]; |
986 | |
|
987 | 0 | beta[2] = e * data[8+column] - b * data[24+column] + |
988 | 0 | g * data[40+column] + d * data[56+column]; |
989 | |
|
990 | 0 | beta[3] = g * data[8+column] - e * data[24+column] + |
991 | 0 | d * data[40+column] - b * data[56+column]; |
992 | |
|
993 | 0 | theta[0] = a * (data[column] + data[32+column]); |
994 | 0 | theta[3] = a * (data[column] - data[32+column]); |
995 | |
|
996 | 0 | theta[1] = alpha[0] + alpha[3]; |
997 | 0 | theta[2] = alpha[1] - alpha[2]; |
998 | |
|
999 | 0 | gamma[0] = theta[0] + theta[1]; |
1000 | 0 | gamma[1] = theta[3] + theta[2]; |
1001 | 0 | gamma[2] = theta[3] - theta[2]; |
1002 | 0 | gamma[3] = theta[0] - theta[1]; |
1003 | |
|
1004 | 0 | data[ column] = gamma[0] + beta[0]; |
1005 | 0 | data[ 8 + column] = gamma[1] + beta[1]; |
1006 | 0 | data[16 + column] = gamma[2] + beta[2]; |
1007 | 0 | data[24 + column] = gamma[3] + beta[3]; |
1008 | |
|
1009 | 0 | data[32 + column] = gamma[3] - beta[3]; |
1010 | 0 | data[40 + column] = gamma[2] - beta[2]; |
1011 | 0 | data[48 + column] = gamma[1] - beta[1]; |
1012 | 0 | data[56 + column] = gamma[0] - beta[0]; |
1013 | 0 | } |
1014 | 0 | } Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<0>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<1>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<2>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<3>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<4>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<5>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<6>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<7>(float*) |
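With only the DC coefficient set, the full scalar iDCT above and the DC-only fast path should agree; a small consistency sketch, not part of the header (the DC value and the 1e-4 tolerance are arbitrary). Aligned buffers are used so the SSE2 build of dctInverse8x8DcOnly can load the data safely:

    void checkDcOnlyAgainstFullIdct ()
    {
        SimdAlignedBuffer64f full, dcOnly;

        for (int i = 0; i < 64; ++i)
            full._buffer[i] = dcOnly._buffer[i] = 0.0f;

        full._buffer[0] = dcOnly._buffer[0] = 8.0f;      // DC term only

        dctInverse8x8_scalar<0> (full._buffer);          // zeroedRows=0 is always safe
        dctInverse8x8DcOnly (dcOnly._buffer);

        for (int i = 0; i < 64; ++i)
        {
            float d = full._buffer[i] - dcOnly._buffer[i];
            assert (d < 1e-4f && -d < 1e-4f);
        }
    }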
1015 | | |
1016 | | |
1017 | | // |
1018 | | // SSE2 Implementation |
1019 | | // |
1020 | | |
1021 | | template <int zeroedRows> |
1022 | | void |
1023 | | dctInverse8x8_sse2 (float *data) |
1024 | 0 | { |
1025 | 0 | #ifdef IMF_HAVE_SSE2 |
1026 | 0 | __m128 a = {3.535536e-01f,3.535536e-01f,3.535536e-01f,3.535536e-01f}; |
1027 | 0 | __m128 b = {4.903927e-01f,4.903927e-01f,4.903927e-01f,4.903927e-01f}; |
1028 | 0 | __m128 c = {4.619398e-01f,4.619398e-01f,4.619398e-01f,4.619398e-01f}; |
1029 | 0 | __m128 d = {4.157349e-01f,4.157349e-01f,4.157349e-01f,4.157349e-01f}; |
1030 | 0 | __m128 e = {2.777855e-01f,2.777855e-01f,2.777855e-01f,2.777855e-01f}; |
1031 | 0 | __m128 f = {1.913422e-01f,1.913422e-01f,1.913422e-01f,1.913422e-01f}; |
1032 | 0 | __m128 g = {9.754573e-02f,9.754573e-02f,9.754573e-02f,9.754573e-02f}; |
1033 | |
|
1034 | 0 | __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f}; |
1035 | 0 | __m128 c1 = {4.619398e-01f, 1.913422e-01f,-1.913422e-01f,-4.619398e-01f}; |
1036 | 0 | __m128 c2 = {3.535536e-01f,-3.535536e-01f,-3.535536e-01f, 3.535536e-01f}; |
1037 | 0 | __m128 c3 = {1.913422e-01f,-4.619398e-01f, 4.619398e-01f,-1.913422e-01f}; |
1038 | |
|
1039 | 0 | __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f}; |
1040 | 0 | __m128 c5 = {4.157349e-01f,-9.754573e-02f,-4.903927e-01f,-2.777855e-01f}; |
1041 | 0 | __m128 c6 = {2.777855e-01f,-4.903927e-01f, 9.754573e-02f, 4.157349e-01f}; |
1042 | 0 | __m128 c7 = {9.754573e-02f,-2.777855e-01f, 4.157349e-01f,-4.903927e-01f}; |
1043 | |
|
1044 | 0 | __m128 *srcVec = (__m128 *)data; |
1045 | 0 | __m128 x[8], evenSum, oddSum; |
1046 | 0 | __m128 in[8], alpha[4], beta[4], theta[4], gamma[4]; |
1047 | | |
1048 | | // |
1049 | | // Rows - |
1050 | | // |
1051 | | // Treat this just like matrix-vector multiplication. The |
1052 | | // trick is to note that: |
1053 | | // |
1054 | | // [M00 M01 M02 M03][v0] [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)] |
1055 | | // [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)] |
1056 | | // [M20 M21 M22 M23][v2] [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)] |
1057 | | // [M30 M31 M32 M33][v3] [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)] |
1058 | | // |
1059 | | // Then, we can fill a register with v_i and multiply by the i-th column |
1060 | | // of M, accumulating across all i-s. |
1061 | | // |
1062 | | // The kids refer to populating a register with a single value as |
1063 | | // "broadcasting", and it can be done with a shuffle instruction. It |
1064 | | // seems to be the slowest part of the whole ordeal. |
1065 | | // |
1066 | | // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and |
1067 | | // c4-7 are from M2. |
1068 | | // |
1069 | |
|
1070 | 0 | #define DCT_INVERSE_8x8_SS2_ROW_LOOP(i) \ |
1071 | | /* \ |
1072 | | * Broadcast the components of the row \ |
1073 | | */ \ |
1074 | 0 | \ |
1075 | 0 | x[0] = _mm_shuffle_ps (srcVec[2 * i], \ |
1076 | 0 | srcVec[2 * i], \ |
1077 | 0 | _MM_SHUFFLE (0, 0, 0, 0)); \ |
1078 | 0 | \ |
1079 | 0 | x[1] = _mm_shuffle_ps (srcVec[2 * i], \ |
1080 | 0 | srcVec[2 * i], \ |
1081 | 0 | _MM_SHUFFLE (1, 1, 1, 1)); \ |
1082 | 0 | \ |
1083 | 0 | x[2] = _mm_shuffle_ps (srcVec[2 * i], \ |
1084 | 0 | srcVec[2 * i], \ |
1085 | 0 | _MM_SHUFFLE (2, 2, 2, 2)); \ |
1086 | 0 | \ |
1087 | 0 | x[3] = _mm_shuffle_ps (srcVec[2 * i], \ |
1088 | 0 | srcVec[2 * i], \ |
1089 | 0 | _MM_SHUFFLE (3, 3, 3, 3)); \ |
1090 | 0 | \ |
1091 | 0 | x[4] = _mm_shuffle_ps (srcVec[2 * i + 1], \ |
1092 | 0 | srcVec[2 * i + 1], \ |
1093 | 0 | _MM_SHUFFLE (0, 0, 0, 0)); \ |
1094 | 0 | \ |
1095 | 0 | x[5] = _mm_shuffle_ps (srcVec[2 * i + 1], \ |
1096 | 0 | srcVec[2 * i + 1], \ |
1097 | 0 | _MM_SHUFFLE (1, 1, 1, 1)); \ |
1098 | 0 | \ |
1099 | 0 | x[6] = _mm_shuffle_ps (srcVec[2 * i + 1], \ |
1100 | 0 | srcVec[2 * i + 1], \ |
1101 | 0 | _MM_SHUFFLE (2, 2, 2, 2)); \ |
1102 | 0 | \ |
1103 | 0 | x[7] = _mm_shuffle_ps (srcVec[2 * i + 1], \ |
1104 | 0 | srcVec[2 * i + 1], \ |
1105 | 0 | _MM_SHUFFLE (3, 3, 3, 3)); \ |
1106 | | /* \ |
1107 | | * Multiply the components by each column of the matrix \ |
1108 | | */ \ |
1109 | 0 | \ |
1110 | 0 | x[0] = _mm_mul_ps (x[0], c0); \ |
1111 | 0 | x[2] = _mm_mul_ps (x[2], c1); \ |
1112 | 0 | x[4] = _mm_mul_ps (x[4], c2); \ |
1113 | 0 | x[6] = _mm_mul_ps (x[6], c3); \ |
1114 | 0 | \ |
1115 | 0 | x[1] = _mm_mul_ps (x[1], c4); \ |
1116 | 0 | x[3] = _mm_mul_ps (x[3], c5); \ |
1117 | 0 | x[5] = _mm_mul_ps (x[5], c6); \ |
1118 | 0 | x[7] = _mm_mul_ps (x[7], c7); \ |
1119 | 0 | \ |
1120 | | /* \ |
1121 | | * Add across \ |
1122 | | */ \ |
1123 | 0 | \ |
1124 | 0 | evenSum = _mm_setzero_ps(); \ |
1125 | 0 | evenSum = _mm_add_ps (evenSum, x[0]); \ |
1126 | 0 | evenSum = _mm_add_ps (evenSum, x[2]); \ |
1127 | 0 | evenSum = _mm_add_ps (evenSum, x[4]); \ |
1128 | 0 | evenSum = _mm_add_ps (evenSum, x[6]); \ |
1129 | 0 | \ |
1130 | 0 | oddSum = _mm_setzero_ps(); \ |
1131 | 0 | oddSum = _mm_add_ps (oddSum, x[1]); \ |
1132 | 0 | oddSum = _mm_add_ps (oddSum, x[3]); \ |
1133 | 0 | oddSum = _mm_add_ps (oddSum, x[5]); \ |
1134 | 0 | oddSum = _mm_add_ps (oddSum, x[7]); \ |
1135 | 0 | \ |
1136 | | /* \ |
1137 | | * Final Sum: \ |
1138 | | * out [0, 1, 2, 3] = evenSum + oddSum \ |
1139 | | * out [7, 6, 5, 4] = evenSum - oddSum \ |
1140 | | */ \ |
1141 | 0 | \ |
1142 | 0 | srcVec[2 * i] = _mm_add_ps (evenSum, oddSum); \ |
1143 | 0 | srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum); \ |
1144 | 0 | srcVec[2 * i + 1] = _mm_shuffle_ps (srcVec[2 * i + 1], \ |
1145 | 0 | srcVec[2 * i + 1], \ |
1146 | 0 | _MM_SHUFFLE (0, 1, 2, 3)); |
1147 | |
|
1148 | 0 | switch (zeroedRows) |
1149 | 0 | { |
1150 | 0 | case 0: |
1151 | 0 | default: |
1152 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1153 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1154 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2) |
1155 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3) |
1156 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4) |
1157 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (5) |
1158 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (6) |
1159 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (7) |
1160 | 0 | break; |
1161 | | |
1162 | 0 | case 1: |
1163 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1164 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1165 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2) |
1166 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3) |
1167 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4) |
1168 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (5) |
1169 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (6) |
1170 | 0 | break; |
1171 | | |
1172 | 0 | case 2: |
1173 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1174 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1175 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2) |
1176 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3) |
1177 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4) |
1178 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (5) |
1179 | 0 | break; |
1180 | | |
1181 | 0 | case 3: |
1182 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1183 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1184 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2) |
1185 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3) |
1186 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4) |
1187 | 0 | break; |
1188 | | |
1189 | 0 | case 4: |
1190 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1191 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1192 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2) |
1193 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3) |
1194 | 0 | break; |
1195 | | |
1196 | 0 | case 5: |
1197 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1198 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1199 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2) |
1200 | 0 | break; |
1201 | | |
1202 | 0 | case 6: |
1203 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1204 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1) |
1205 | 0 | break; |
1206 | | |
1207 | 0 | case 7: |
1208 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0) |
1209 | 0 | break; |
1210 | 0 | } |
1211 | | |
1212 | | // |
1213 | | // Columns - |
1214 | | // |
1215 | | // This is slightly more straightforward, if less readable. Here |
1216 | | // we just operate on 4 columns at a time, in two batches. |
1217 | | // |
1218 | | // The slight mess is to try and cache sub-expressions, which |
1219 | | // we ignore in the row-wise pass. |
1220 | | // |
1221 | | |
1222 | 0 | for (int col = 0; col < 2; ++col) |
1223 | 0 | { |
1224 | |
|
1225 | 0 | for (int i = 0; i < 8; ++i) |
1226 | 0 | in[i] = srcVec[2 * i + col]; |
1227 | |
|
1228 | 0 | alpha[0] = _mm_mul_ps (c, in[2]); |
1229 | 0 | alpha[1] = _mm_mul_ps (f, in[2]); |
1230 | 0 | alpha[2] = _mm_mul_ps (c, in[6]); |
1231 | 0 | alpha[3] = _mm_mul_ps (f, in[6]); |
1232 | |
|
1233 | 0 | beta[0] = _mm_add_ps (_mm_add_ps (_mm_mul_ps (in[1], b), |
1234 | 0 | _mm_mul_ps (in[3], d)), |
1235 | 0 | _mm_add_ps (_mm_mul_ps (in[5], e), |
1236 | 0 | _mm_mul_ps (in[7], g))); |
1237 | |
|
1238 | 0 | beta[1] = _mm_sub_ps (_mm_sub_ps (_mm_mul_ps (in[1], d), |
1239 | 0 | _mm_mul_ps (in[3], g)), |
1240 | 0 | _mm_add_ps (_mm_mul_ps (in[5], b), |
1241 | 0 | _mm_mul_ps (in[7], e))); |
1242 | |
|
1243 | 0 | beta[2] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], e), |
1244 | 0 | _mm_mul_ps (in[3], b)), |
1245 | 0 | _mm_add_ps (_mm_mul_ps (in[5], g), |
1246 | 0 | _mm_mul_ps (in[7], d))); |
1247 | |
|
1248 | 0 | beta[3] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], g), |
1249 | 0 | _mm_mul_ps (in[3], e)), |
1250 | 0 | _mm_sub_ps (_mm_mul_ps (in[5], d), |
1251 | 0 | _mm_mul_ps (in[7], b))); |
1252 | |
|
1253 | 0 | theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4])); |
1254 | 0 | theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4])); |
1255 | |
|
1256 | 0 | theta[1] = _mm_add_ps (alpha[0], alpha[3]); |
1257 | 0 | theta[2] = _mm_sub_ps (alpha[1], alpha[2]); |
1258 | |
|
1259 | 0 | gamma[0] = _mm_add_ps (theta[0], theta[1]); |
1260 | 0 | gamma[1] = _mm_add_ps (theta[3], theta[2]); |
1261 | 0 | gamma[2] = _mm_sub_ps (theta[3], theta[2]); |
1262 | 0 | gamma[3] = _mm_sub_ps (theta[0], theta[1]); |
1263 | |
|
1264 | 0 | srcVec[ col] = _mm_add_ps (gamma[0], beta[0]); |
1265 | 0 | srcVec[2+col] = _mm_add_ps (gamma[1], beta[1]); |
1266 | 0 | srcVec[4+col] = _mm_add_ps (gamma[2], beta[2]); |
1267 | 0 | srcVec[6+col] = _mm_add_ps (gamma[3], beta[3]); |
1268 | |
|
1269 | 0 | srcVec[ 8+col] = _mm_sub_ps (gamma[3], beta[3]); |
1270 | 0 | srcVec[10+col] = _mm_sub_ps (gamma[2], beta[2]); |
1271 | 0 | srcVec[12+col] = _mm_sub_ps (gamma[1], beta[1]); |
1272 | 0 | srcVec[14+col] = _mm_sub_ps (gamma[0], beta[0]); |
1273 | 0 | } |
1274 | |
|
1275 | | #else /* IMF_HAVE_SSE2 */ |
1276 | | |
1277 | | dctInverse8x8_scalar<zeroedRows> (data); |
1278 | | |
1279 | | #endif /* IMF_HAVE_SSE2 */ |
1280 | 0 | } Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<0>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<1>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<2>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<3>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<4>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<5>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<6>(float*) Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<7>(float*) |
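Because zeroedRows is a template parameter, a caller holding a runtime count has to map it onto the right specialization explicitly; the coverage notes above show exactly the eight instantiations <0>..<7> that such a dispatch produces. A hedged sketch of that mapping (not the actual caller in ImfDwaCompressor.cpp):

    void
    dctInverse8x8_sse2_dispatch (float *data, int zeroedRows)
    {
        switch (zeroedRows)
        {
          case 0:  dctInverse8x8_sse2<0> (data); break;
          case 1:  dctInverse8x8_sse2<1> (data); break;
          case 2:  dctInverse8x8_sse2<2> (data); break;
          case 3:  dctInverse8x8_sse2<3> (data); break;
          case 4:  dctInverse8x8_sse2<4> (data); break;
          case 5:  dctInverse8x8_sse2<5> (data); break;
          case 6:  dctInverse8x8_sse2<6> (data); break;
          default: dctInverse8x8_sse2<7> (data); break;
        }
    }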
1281 | | |
1282 | | |
1283 | | // |
1284 | | // AVX Implementation |
1285 | | // |
1286 | | |
1287 | | #define STR(A) #A |
1288 | | |
1289 | | #define IDCT_AVX_SETUP_2_ROWS(_DST0, _DST1, _TMP0, _TMP1, \ |
1290 | | _OFF00, _OFF01, _OFF10, _OFF11) \ |
1291 | | "vmovaps " STR(_OFF00) "(%0), %%xmm" STR(_TMP0) " \n" \ |
1292 | | "vmovaps " STR(_OFF01) "(%0), %%xmm" STR(_TMP1) " \n" \ |
1293 | | " \n" \ |
1294 | | "vinsertf128 $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) " \n" \ |
1295 | | "vinsertf128 $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) " \n" \ |
1296 | | " \n" \ |
1297 | | "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \ |
1298 | | "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" \ |
1299 | | " \n" \ |
1300 | | "vunpcklps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP0) " \n" \ |
1301 | | "vunpckhps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP1) " \n" \ |
1302 | | " \n" \ |
1303 | | "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \ |
1304 | | "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" |
1305 | | |
1306 | | #define IDCT_AVX_MMULT_ROWS(_SRC) \ |
1307 | | /* Broadcast the source values into y12-y15 */ \ |
1308 | | "vpermilps $0x00, " STR(_SRC) ", %%ymm12 \n" \ |
1309 | | "vpermilps $0x55, " STR(_SRC) ", %%ymm13 \n" \ |
1310 | | "vpermilps $0xaa, " STR(_SRC) ", %%ymm14 \n" \ |
1311 | | "vpermilps $0xff, " STR(_SRC) ", %%ymm15 \n" \ |
1312 | | \ |
1313 | | /* Multiply the coefs by the broadcast values */ \ |
1314 | | "vmulps %%ymm12, %%ymm8, %%ymm12 \n" \ |
1315 | | "vmulps %%ymm13, %%ymm9, %%ymm13 \n" \ |
1316 | | "vmulps %%ymm14, %%ymm10, %%ymm14 \n" \ |
1317 | | "vmulps %%ymm15, %%ymm11, %%ymm15 \n" \ |
1318 | | \ |
1319 | | /* Accumulate the result back into the source */ \ |
1320 | | "vaddps %%ymm13, %%ymm12, %%ymm12 \n" \ |
1321 | | "vaddps %%ymm15, %%ymm14, %%ymm14 \n" \ |
1322 | | "vaddps %%ymm14, %%ymm12, " STR(_SRC) "\n" |
1323 | | |
1324 | | #define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK) \ |
1325 | | "vsubps " STR(_ODD) "," STR(_EVEN) "," STR(_BACK) "\n" \ |
1326 | | "vaddps " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n" \ |
1327 | | /* Reverse the back half */ \ |
1328 | | "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n" |
1329 | | |
1330 | | /* In order to allow for fast paths when we know certain rows |
1331 | | * of the 8x8 block are zero, most of the body of the DCT is |
1332 | | * in the following macro. Statements are wrapped in a ROWn() |
1333 | | * macro, where n is the lowest row in the 8x8 block on which |
1334 | | * they depend. |
1335 | | * |
1336 | | * This should work for the cases where we have 2-8 full rows. |
1337 | | * The 1-row case is special, and we'll handle it separately. |
1338 | | */ |
1339 | | #define IDCT_AVX_BODY \ |
1340 | | /* ============================================== |
1341 | | * Row 1D DCT |
1342 | | * ---------------------------------------------- |
1343 | | */ \ |
1344 | | \ |
1345 | | /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds |
1346 | | * the row-major 8x8 block, load ymm0-3 with the even columns |
1347 | | * and ymm4-7 with the odd columns. The lower half of the ymm |
1348 | | * holds one row, while the upper half holds the next row. |
1349 | | * |
1350 | | * If our source is: |
1351 | | * a0 a1 a2 a3 a4 a5 a6 a7 |
1352 | | * b0 b1 b2 b3 b4 b5 b6 b7 |
1353 | | * |
1354 | | * We'll be forming: |
1355 | | * a0 a2 a4 a6 b0 b2 b4 b6 |
1356 | | * a1 a3 a5 a7 b1 b3 b5 b7 |
1357 | | */ \ |
1358 | | ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48) ) \ |
1359 | | ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13, 64, 80, 96, 112) ) \ |
1360 | | ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11, 128, 144, 160, 176) ) \ |
1361 | | ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7, 8, 9, 192, 208, 224, 240) ) \ |
1362 | | \ |
1363 | | /* Multiply the even columns (ymm0-3) by the matrix M1 |
1364 | | * storing the results back in ymm0-3 |
1365 | | * |
1366 | | * Assume that (%1) holds the matrix in column major order |
1367 | | */ \ |
1368 | | "vbroadcastf128 (%1), %%ymm8 \n" \ |
1369 | | "vbroadcastf128 16(%1), %%ymm9 \n" \ |
1370 | | "vbroadcastf128 32(%1), %%ymm10 \n" \ |
1371 | | "vbroadcastf128 48(%1), %%ymm11 \n" \ |
1372 | | \ |
1373 | | ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) ) \ |
1374 | | ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) ) \ |
1375 | | ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) ) \ |
1376 | | ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) ) \ |
1377 | | \ |
1378 | | /* Repeat, but with the odd columns (ymm4-7) and the |
1379 | | * matrix M2 |
1380 | | */ \ |
1381 | | "vbroadcastf128 64(%1), %%ymm8 \n" \ |
1382 | | "vbroadcastf128 80(%1), %%ymm9 \n" \ |
1383 | | "vbroadcastf128 96(%1), %%ymm10 \n" \ |
1384 | | "vbroadcastf128 112(%1), %%ymm11 \n" \ |
1385 | | \ |
1386 | | ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) ) \ |
1387 | | ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) ) \ |
1388 | | ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) ) \ |
1389 | | ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) ) \ |
1390 | | \ |
1391 | | /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the |
1392 | | * front halves of the results, and difference to get the |
1393 | | * back halves. The front halves end up in ymm0-3, the back |
1394 | | * halves end up in ymm12-15. |
1395 | | */ \ |
1396 | | ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \ |
1397 | | ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \ |
1398 | | ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \ |
1399 | | ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \ |
1400 | | \ |
1401 | | /* Reassemble the row halves into ymm0-7 */ \ |
1402 | | ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7 \n" ) \ |
1403 | | ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6 \n" ) \ |
1404 | | ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5 \n" ) \ |
1405 | | ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4 \n" ) \ |
1406 | | ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3 \n" ) \ |
1407 | | ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2 \n" ) \ |
1408 | | ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1 \n" ) \ |
1409 | | ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" ) \ |
1410 | | \ |
1411 | | \ |
1412 | | /* ============================================== |
1413 | | * Column 1D DCT |
1414 | | * ---------------------------------------------- |
1415 | | */ \ |
1416 | | \ |
1417 | | /* Rows should be in ymm0-7, and M2 columns should still be |
1418 | | * preserved in ymm8-11. M2 has 4 unique values (and +- |
1419 | | * versions of each), and all (positive) values appear in |
1420 | | * the first column (and row), which is in ymm8. |
1421 | | * |
1422 | | * For the column-wise DCT, we need to: |
1423 | | * 1) Broadcast each element of a row of M2 into 4 vectors |
1424 | | * 2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts. |
1425 | | * 3) Accumulate into ymm12-15 for the odd outputs. |
1426 | | * |
1427 | | * Instead of doing 16 broadcasts for each element in M2, |
1428 | | * do 4, filling y8-11 with: |
1429 | | * |
1430 | | * ymm8: [ b b b b | b b b b ] |
1431 | | * ymm9: [ d d d d | d d d d ] |
1432 | | * ymm10: [ e e e e | e e e e ] |
1433 | | * ymm11: [ g g g g | g g g g ] |
1434 | | * |
1435 | | * And deal with the negative values by subtracting during accum. |
1436 | | */ \ |
1437 | | "vpermilps $0xff, %%ymm8, %%ymm11 \n" \ |
1438 | | "vpermilps $0xaa, %%ymm8, %%ymm10 \n" \ |
1439 | | "vpermilps $0x55, %%ymm8, %%ymm9 \n" \ |
1440 | | "vpermilps $0x00, %%ymm8, %%ymm8 \n" \ |
1441 | | \ |
1442 | | /* This one is easy, since we have ymm12-15 open for scratch |
1443 | | * ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7 |
1444 | | */ \ |
1445 | | ROW1( "vmulps %%ymm1, %%ymm8, %%ymm12 \n" ) \ |
1446 | | ROW3( "vmulps %%ymm3, %%ymm9, %%ymm13 \n" ) \ |
1447 | | ROW5( "vmulps %%ymm5, %%ymm10, %%ymm14 \n" ) \ |
1448 | | ROW7( "vmulps %%ymm7, %%ymm11, %%ymm15 \n" ) \ |
1449 | | \ |
1450 | | ROW3( "vaddps %%ymm12, %%ymm13, %%ymm12 \n" ) \ |
1451 | | ROW7( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \ |
1452 | | ROW5( "vaddps %%ymm12, %%ymm14, %%ymm12 \n" ) \ |
1453 | | \ |
1454 | | /* Trickier, since only y13-15 are open for scratch |
1455 | | * ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7 |
1456 | | */ \ |
1457 | | ROW1( "vmulps %%ymm1, %%ymm9, %%ymm13 \n" ) \ |
1458 | | ROW3( "vmulps %%ymm3, %%ymm11, %%ymm14 \n" ) \ |
1459 | | ROW5( "vmulps %%ymm5, %%ymm8, %%ymm15 \n" ) \ |
1460 | | \ |
1461 | | ROW5( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \ |
1462 | | ROW3( "vsubps %%ymm14, %%ymm13, %%ymm13 \n" ) \ |
1463 | | \ |
1464 | | ROW7( "vmulps %%ymm7, %%ymm10, %%ymm15 \n" ) \ |
1465 | | ROW7( "vsubps %%ymm15, %%ymm13, %%ymm13 \n" ) \ |
1466 | | \ |
1467 | |                         /* Trickier still, as only y14-15 are open for scratch |
1468 | | * ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7 |
1469 | | */ \ |
1470 | | ROW1( "vmulps %%ymm1, %%ymm10, %%ymm14 \n" ) \ |
1471 | | ROW3( "vmulps %%ymm3, %%ymm8, %%ymm15 \n" ) \ |
1472 | | \ |
1473 | | ROW3( "vsubps %%ymm15, %%ymm14, %%ymm14 \n" ) \ |
1474 | | \ |
1475 | | ROW5( "vmulps %%ymm5, %%ymm11, %%ymm15 \n" ) \ |
1476 | | ROW5( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \ |
1477 | | \ |
1478 | | ROW7( "vmulps %%ymm7, %%ymm9, %%ymm15 \n" ) \ |
1479 | | ROW7( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \ |
1480 | | \ |
1481 | | \ |
1482 | | /* Easy, as we can blow away ymm1,3,5,7 for scratch |
1483 | | * ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7 |
1484 | | */ \ |
1485 | | ROW1( "vmulps %%ymm1, %%ymm11, %%ymm15 \n" ) \ |
1486 | | ROW3( "vmulps %%ymm3, %%ymm10, %%ymm3 \n" ) \ |
1487 | | ROW5( "vmulps %%ymm5, %%ymm9, %%ymm5 \n" ) \ |
1488 | | ROW7( "vmulps %%ymm7, %%ymm8, %%ymm7 \n" ) \ |
1489 | | \ |
1490 | | ROW5( "vaddps %%ymm15, %%ymm5, %%ymm15 \n" ) \ |
1491 | | ROW7( "vaddps %%ymm3, %%ymm7, %%ymm3 \n" ) \ |
1492 | | ROW3( "vsubps %%ymm3, %%ymm15, %%ymm15 \n" ) \ |
1493 | | \ |
1494 | | \ |
1495 | | /* Load coefs for M1. Because we're going to broadcast |
1496 | | * coefs, we don't need to load the actual structure from |
1497 | | * M1. Instead, just load enough that we can broadcast. |
1498 | | * There are only 6 unique values in M1, but they're in +- |
1499 | | * pairs, leaving only 3 unique coefs if we add and subtract |
1500 | | * properly. |
1501 | | * |
1502 | | * Fill ymm1 with coef[2] = [ a a c f | a a c f ] |
1503 | | * Broadcast ymm5 with [ f f f f | f f f f ] |
1504 | | * Broadcast ymm3 with [ c c c c | c c c c ] |
1505 | | * Broadcast ymm1 with [ a a a a | a a a a ] |
1506 | | */ \ |
1507 | | "vbroadcastf128 8(%1), %%ymm1 \n" \ |
1508 | | "vpermilps $0xff, %%ymm1, %%ymm5 \n" \ |
1509 | | "vpermilps $0xaa, %%ymm1, %%ymm3 \n" \ |
1510 | | "vpermilps $0x00, %%ymm1, %%ymm1 \n" \ |
1511 | | \ |
1512 | | /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following |
1513 | | * common expressions: |
1514 | | * |
1515 | | * E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) |
1516 | | * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6) |
1517 | | * |
1518 | | * E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6) |
1519 | | * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6) |
1520 | | * |
1521 | | * Afterwards, ymm8-11 will hold the even outputs. |
1522 | | */ \ |
1523 | | \ |
1524 | | /* ymm11 = (a ymm0 + a ymm4), ymm1 = (a ymm0 - a ymm4) */ \ |
1525 | | ROW0( "vmulps %%ymm1, %%ymm0, %%ymm11 \n" ) \ |
1526 | | ROW4( "vmulps %%ymm1, %%ymm4, %%ymm4 \n" ) \ |
1527 | | ROW0( "vmovaps %%ymm11, %%ymm1 \n" ) \ |
1528 | | ROW4( "vaddps %%ymm4, %%ymm11, %%ymm11 \n" ) \ |
1529 | | ROW4( "vsubps %%ymm4, %%ymm1, %%ymm1 \n" ) \ |
1530 | | \ |
1531 | | /* ymm7 = (c ymm2 + f ymm6) */ \ |
1532 | | ROW2( "vmulps %%ymm3, %%ymm2, %%ymm7 \n" ) \ |
1533 | | ROW6( "vmulps %%ymm5, %%ymm6, %%ymm9 \n" ) \ |
1534 | | ROW6( "vaddps %%ymm9, %%ymm7, %%ymm7 \n" ) \ |
1535 | | \ |
1536 | | /* E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) |
1537 | | * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6) |
1538 | | */ \ |
1539 | | ROW0( "vmovaps %%ymm11, %%ymm8 \n" ) \ |
1540 | | ROW2( "vaddps %%ymm7, %%ymm8, %%ymm8 \n" ) \ |
1541 | | ROW2( "vsubps %%ymm7, %%ymm11, %%ymm11 \n" ) \ |
1542 | | \ |
1543 | | /* ymm7 = (f ymm2 - c ymm6) */ \ |
1544 | | ROW2( "vmulps %%ymm5, %%ymm2, %%ymm7 \n" ) \ |
1545 | | ROW6( "vmulps %%ymm3, %%ymm6, %%ymm9 \n" ) \ |
1546 | | ROW6( "vsubps %%ymm9, %%ymm7, %%ymm7 \n" ) \ |
1547 | | \ |
1548 | | /* E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6) |
1549 | | * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6) |
1550 | | */ \ |
1551 | | ROW0( "vmovaps %%ymm1, %%ymm9 \n" ) \ |
1552 | | ROW0( "vmovaps %%ymm1, %%ymm10 \n" ) \ |
1553 | | ROW2( "vaddps %%ymm7, %%ymm1, %%ymm9 \n" ) \ |
1554 | | ROW2( "vsubps %%ymm7, %%ymm1, %%ymm10 \n" ) \ |
1555 | | \ |
1556 | | /* Add the even (ymm8-11) and the odds (ymm12-15), |
1557 | | * placing the results into ymm0-7 |
1558 | | */ \ |
1559 | | "vaddps %%ymm12, %%ymm8, %%ymm0 \n" \ |
1560 | | "vaddps %%ymm13, %%ymm9, %%ymm1 \n" \ |
1561 | | "vaddps %%ymm14, %%ymm10, %%ymm2 \n" \ |
1562 | | "vaddps %%ymm15, %%ymm11, %%ymm3 \n" \ |
1563 | | \ |
1564 | | "vsubps %%ymm12, %%ymm8, %%ymm7 \n" \ |
1565 | | "vsubps %%ymm13, %%ymm9, %%ymm6 \n" \ |
1566 | | "vsubps %%ymm14, %%ymm10, %%ymm5 \n" \ |
1567 | | "vsubps %%ymm15, %%ymm11, %%ymm4 \n" \ |
1568 | | \ |
1569 | | /* Copy out the results from ymm0-7 */ \ |
1570 | | "vmovaps %%ymm0, (%0) \n" \ |
1571 | | "vmovaps %%ymm1, 32(%0) \n" \ |
1572 | | "vmovaps %%ymm2, 64(%0) \n" \ |
1573 | | "vmovaps %%ymm3, 96(%0) \n" \ |
1574 | | "vmovaps %%ymm4, 128(%0) \n" \ |
1575 | | "vmovaps %%ymm5, 160(%0) \n" \ |
1576 | | "vmovaps %%ymm6, 192(%0) \n" \ |
1577 | | "vmovaps %%ymm7, 224(%0) \n" |
1578 | | |
1579 | | /* Output, input, and clobber (OIC) sections of the inline asm */ |
1580 | | #define IDCT_AVX_OIC(_IN0) \ |
1581 | | : /* Output */ \ |
1582 | | : /* Input */ "r"(_IN0), "r"(sAvxCoef) \ |
1583 | | : /* Clobber */ "memory", \ |
1584 | | "%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
1585 | | "%xmm4", "%xmm5", "%xmm6", "%xmm7", \ |
1586 | | "%xmm8", "%xmm9", "%xmm10", "%xmm11",\ |
1587 | | "%xmm12", "%xmm13", "%xmm14", "%xmm15" |
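/* Note on the constraint list above: "%0" is the data pointer and "%1" the
 * coefficient table, both passed as read-only "r" inputs; the asm stores the
 * transformed block back through "%0", hence the "memory" clobber, and the
 * vector registers are listed by their xmm aliases, which also marks the
 * corresponding ymm registers as clobbered.
 */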
1588 | | |
1589 | | /* Include vzeroupper for non-AVX builds */ |
1590 | | #ifndef __AVX__ |
1591 | | #define IDCT_AVX_ASM(_IN0) \ |
1592 | | __asm__( \ |
1593 | | IDCT_AVX_BODY \ |
1594 | | "vzeroupper \n" \ |
1595 | | IDCT_AVX_OIC(_IN0) \ |
1596 | | ); |
1597 | | #else /* __AVX__ */ |
1598 | | #define IDCT_AVX_ASM(_IN0) \ |
1599 | | __asm__( \ |
1600 | | IDCT_AVX_BODY \ |
1601 | | IDCT_AVX_OIC(_IN0) \ |
1602 | | ); |
1603 | | #endif /* __AVX__ */ |
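/* Why the extra vzeroupper: when this file is compiled without AVX enabled,
 * the compiler-generated code around the asm block uses legacy 128-bit SSE
 * encodings, so clearing the upper ymm state before returning avoids the
 * AVX/SSE transition penalty that some CPUs impose on mixed code.
 */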
1604 | | |
1605 | | template <int zeroedRows> |
1606 | | void |
1607 | | dctInverse8x8_avx (float *data) |
1608 | 0 | { |
1609 | | #if defined IMF_HAVE_GCC_INLINEASM_64 |
1610 | | |
1611 | | /* The column-major version of M1, followed by the |
1612 | | * column-major version of M2: |
1613 | | * |
1614 | | * [ a c a f ] [ b d e g ] |
1615 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] |
1616 | | * [ a -f -a c ] [ e -b g d ] |
1617 | | * [ a -c a -f ] [ g -e d -b ] |
1618 | | */ |
1619 | | const float sAvxCoef[32] __attribute__((aligned(32))) = { |
1620 | | 3.535536e-01, 3.535536e-01, 3.535536e-01, 3.535536e-01, /* a a a a */ |
1621 | | 4.619398e-01, 1.913422e-01, -1.913422e-01, -4.619398e-01, /* c f -f -c */ |
1622 | | 3.535536e-01, -3.535536e-01, -3.535536e-01, 3.535536e-01, /* a -a -a a */ |
1623 | | 1.913422e-01, -4.619398e-01, 4.619398e-01, -1.913422e-01, /* f -c c -f */ |
1624 | | |
1625 | | 4.903927e-01, 4.157349e-01, 2.777855e-01, 9.754573e-02, /* b d e g */ |
1626 | | 4.157349e-01, -9.754573e-02, -4.903927e-01, -2.777855e-01, /* d -g -b -e */ |
1627 | | 2.777855e-01, -4.903927e-01, 9.754573e-02, 4.157349e-01, /* e -b g d */ |
1628 | | 9.754573e-02, -2.777855e-01, 4.157349e-01, -4.903927e-01 /* g -e d -b */ |
1629 | | }; |
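    /* For reference, the entries above are the usual half-cosine DCT
     * factors, approximately
     *
     *     a = cos(4*pi/16)/2    b = cos(1*pi/16)/2    c = cos(2*pi/16)/2
     *     d = cos(3*pi/16)/2    e = cos(5*pi/16)/2    f = cos(6*pi/16)/2
     *     g = cos(7*pi/16)/2
     *
     * i.e. the same values as the cNHalf constants in dctForward8x8 below,
     * with a = c4 / 2.
     */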
1630 | | |
1631 | | #define ROW0(_X) _X |
1632 | | #define ROW1(_X) _X |
1633 | | #define ROW2(_X) _X |
1634 | | #define ROW3(_X) _X |
1635 | | #define ROW4(_X) _X |
1636 | | #define ROW5(_X) _X |
1637 | | #define ROW6(_X) _X |
1638 | | #define ROW7(_X) _X |
1639 | | |
1640 | | if (zeroedRows == 0) { |
1641 | | |
1642 | | IDCT_AVX_ASM(data) |
1643 | | |
1644 | | } else if (zeroedRows == 1) { |
1645 | | |
1646 | | #undef ROW7 |
1647 | | #define ROW7(_X) |
1648 | | IDCT_AVX_ASM(data) |
1649 | | |
1650 | | } else if (zeroedRows == 2) { |
1651 | | |
1652 | | #undef ROW6 |
1653 | | #define ROW6(_X) |
1654 | | IDCT_AVX_ASM(data) |
1655 | | |
1656 | | } else if (zeroedRows == 3) { |
1657 | | |
1658 | | #undef ROW5 |
1659 | | #define ROW5(_X) |
1660 | | IDCT_AVX_ASM(data) |
1661 | | |
1662 | | } else if (zeroedRows == 4) { |
1663 | | |
1664 | | #undef ROW4 |
1665 | | #define ROW4(_X) |
1666 | | IDCT_AVX_ASM(data) |
1667 | | |
1668 | | } else if (zeroedRows == 5) { |
1669 | | |
1670 | | #undef ROW3 |
1671 | | #define ROW3(_X) |
1672 | | IDCT_AVX_ASM(data) |
1673 | | |
1674 | | } else if (zeroedRows == 6) { |
1675 | | |
1676 | | #undef ROW2 |
1677 | | #define ROW2(_X) |
1678 | | IDCT_AVX_ASM(data) |
1679 | | |
1680 | | } else if (zeroedRows == 7) { |
1681 | | |
1682 | | __asm__( |
1683 | | |
1684 | | /* ============================================== |
1685 | | * Row 1D DCT |
1686 | | * ---------------------------------------------- |
1687 | | */ |
1688 | | IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48) |
1689 | | |
1690 | | "vbroadcastf128 (%1), %%ymm8 \n" |
1691 | | "vbroadcastf128 16(%1), %%ymm9 \n" |
1692 | | "vbroadcastf128 32(%1), %%ymm10 \n" |
1693 | | "vbroadcastf128 48(%1), %%ymm11 \n" |
1694 | | |
1695 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ |
1696 | | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" |
1697 | | |
1698 | | IDCT_AVX_MMULT_ROWS(%%ymm0) |
1699 | | |
1700 | | "vbroadcastf128 64(%1), %%ymm8 \n" |
1701 | | "vbroadcastf128 80(%1), %%ymm9 \n" |
1702 | | "vbroadcastf128 96(%1), %%ymm10 \n" |
1703 | | "vbroadcastf128 112(%1), %%ymm11 \n" |
1704 | | |
1705 | | IDCT_AVX_MMULT_ROWS(%%ymm4) |
1706 | | |
1707 | | IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) |
1708 | | |
1709 | | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" |
1710 | | |
1711 | | /* ============================================== |
1712 | | * Column 1D DCT |
1713 | | * ---------------------------------------------- |
1714 | | */ |
1715 | | |
1716 | |             /* DC only, so multiply by a and we're done */ |
1717 | | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" |
1718 | | |
1719 | | /* Copy out results */ |
1720 | | "vmovaps %%ymm0, (%0) \n" |
1721 | | "vmovaps %%ymm0, 32(%0) \n" |
1722 | | "vmovaps %%ymm0, 64(%0) \n" |
1723 | | "vmovaps %%ymm0, 96(%0) \n" |
1724 | | "vmovaps %%ymm0, 128(%0) \n" |
1725 | | "vmovaps %%ymm0, 160(%0) \n" |
1726 | | "vmovaps %%ymm0, 192(%0) \n" |
1727 | | "vmovaps %%ymm0, 224(%0) \n" |
1728 | | |
1729 | | #ifndef __AVX__ |
1730 | | "vzeroupper \n" |
1731 | | #endif /* __AVX__ */ |
1732 | | IDCT_AVX_OIC(data) |
1733 | | ); |
1734 | | } else { |
1735 | | assert(false); // Invalid template instance parameter |
1736 | | } |
1737 | | #else /* IMF_HAVE_GCC_INLINEASM_64 */ |
1738 | |
1739 | 0 | dctInverse8x8_scalar<zeroedRows>(data); |
1740 | |
1741 | 0 | #endif /* IMF_HAVE_GCC_INLINEASM_64 */ |
1742 | 0 | } |
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<0>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<1>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<2>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<3>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<4>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<5>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<6>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<7>(float*)
1743 | | |
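A minimal usage sketch (hypothetical, not the actual DwaCompressor call site): the
eight instantiations above can be selected at run time from the number of trailing
rows that are known to be all zero.

    // Hypothetical dispatcher; assumes `block` is a 32-byte aligned
    // 8x8 float buffer, transformed in place.
    typedef void (*DctInverse8x8Fn) (float *);

    static const DctInverse8x8Fn dctInverse8x8AvxTable[8] = {
        dctInverse8x8_avx<0>, dctInverse8x8_avx<1>, dctInverse8x8_avx<2>,
        dctInverse8x8_avx<3>, dctInverse8x8_avx<4>, dctInverse8x8_avx<5>,
        dctInverse8x8_avx<6>, dctInverse8x8_avx<7>
    };

    inline void
    dctInverse8x8AvxDispatch (float *block, int zeroedRows)
    {
        dctInverse8x8AvxTable[zeroedRows] (block);
    }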
1744 | | |
1745 | | // |
1746 | | // Full 8x8 Forward DCT: |
1747 | | // |
1748 | | // Base forward 8x8 DCT implementation. Works on the data in-place |
1749 | | // |
1750 | | //  The implementation is described in Pennebaker + Mitchell, |
1751 | | // section 4.3.2, and illustrated in figure 4-7 |
1752 | | // |
1753 | | // The basic idea is that the 1D DCT math reduces to: |
1754 | | // |
1755 | | // 2*out_0 = c_4 [(s_07 + s_34) + (s_12 + s_56)] |
1756 | | // 2*out_4 = c_4 [(s_07 + s_34) - (s_12 + s_56)] |
1757 | | // |
1758 | | // {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34)) |
1759 | | // |
1760 | | // {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56), |
1761 | | // d_34 - c_4 (d_12 + d_56)) |
1762 | | // |
1763 | | // {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56), |
1764 | | // -d_34 - c_4 (d_12 + d_56)) |
1765 | | // |
1766 | | // where: |
1767 | | // |
1768 | | // c_i = cos(i*pi/16) |
1769 | | // s_i = sin(i*pi/16) |
1770 | | // |
1771 | | // s_ij = in_i + in_j |
1772 | | // d_ij = in_i - in_j |
1773 | | // |
1774 | | // rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y} |
1775 | | // |
1776 | | // We'll run the DCT in two passes. First, run the 1D DCT on |
1777 | | // the rows, in-place. Then, run over the columns in-place, |
1778 | | // and be done with it. |
1779 | | // |
1780 | | |
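For checking the factored form above, a minimal reference sketch (not part of the
original source) of the equivalent direct O(N^2) 1D DCT, using the same
normalization as dctForward8x8 below:

    // Direct 8-point DCT-II with orthonormal scaling:
    //   out[k] = scale_k * sum_n in[n] * cos ((2n + 1) k pi / 16)
    static void
    dct1dReference (const float in[8], float out[8])
    {
        for (int k = 0; k < 8; ++k)
        {
            const float scale = (k == 0) ? sqrtf (1.f / 8.f) : sqrtf (2.f / 8.f);

            float acc = 0.f;
            for (int n = 0; n < 8; ++n)
                acc += in[n] * cosf (3.14159265f * (2 * n + 1) * k / 16.f);

            out[k] = scale * acc;
        }
    }

Applying dct1dReference to each row and then to each column of the 8x8 block
should agree with dctForward8x8 to within floating-point tolerance.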
1781 | | #ifndef IMF_HAVE_SSE2 |
1782 | | |
1783 | | // |
1784 | | // Default implementation |
1785 | | // |
1786 | | |
1787 | | void |
1788 | | dctForward8x8 (float *data) |
1789 | | { |
1790 | | float A0, A1, A2, A3, A4, A5, A6, A7; |
1791 | | float K0, K1, rot_x, rot_y; |
1792 | | |
1793 | | float *srcPtr = data; |
1794 | | float *dstPtr = data; |
1795 | | |
1796 | | const float c1 = cosf (3.14159f * 1.0f / 16.0f); |
1797 | | const float c2 = cosf (3.14159f * 2.0f / 16.0f); |
1798 | | const float c3 = cosf (3.14159f * 3.0f / 16.0f); |
1799 | | const float c4 = cosf (3.14159f * 4.0f / 16.0f); |
1800 | | const float c5 = cosf (3.14159f * 5.0f / 16.0f); |
1801 | | const float c6 = cosf (3.14159f * 6.0f / 16.0f); |
1802 | | const float c7 = cosf (3.14159f * 7.0f / 16.0f); |
1803 | | |
1804 | | const float c1Half = .5f * c1; |
1805 | | const float c2Half = .5f * c2; |
1806 | | const float c3Half = .5f * c3; |
1807 | | const float c5Half = .5f * c5; |
1808 | | const float c6Half = .5f * c6; |
1809 | | const float c7Half = .5f * c7; |
1810 | | |
1811 | | // |
1812 | | // First pass - do a 1D DCT over the rows and write the |
1813 | | // results back in place |
1814 | | // |
1815 | | |
1816 | | for (int row=0; row<8; ++row) |
1817 | | { |
1818 | | float *srcRowPtr = srcPtr + 8 * row; |
1819 | | float *dstRowPtr = dstPtr + 8 * row; |
1820 | | |
1821 | | A0 = srcRowPtr[0] + srcRowPtr[7]; |
1822 | | A1 = srcRowPtr[1] + srcRowPtr[2]; |
1823 | | A2 = srcRowPtr[1] - srcRowPtr[2]; |
1824 | | A3 = srcRowPtr[3] + srcRowPtr[4]; |
1825 | | A4 = srcRowPtr[3] - srcRowPtr[4]; |
1826 | | A5 = srcRowPtr[5] + srcRowPtr[6]; |
1827 | | A6 = srcRowPtr[5] - srcRowPtr[6]; |
1828 | | A7 = srcRowPtr[0] - srcRowPtr[7]; |
1829 | | |
1830 | | K0 = c4 * (A0 + A3); |
1831 | | K1 = c4 * (A1 + A5); |
1832 | | |
1833 | | dstRowPtr[0] = .5f * (K0 + K1); |
1834 | | dstRowPtr[4] = .5f * (K0 - K1); |
1835 | | |
1836 | | // |
1837 | | // (2*dst2, 2*dst6) = rot 6 (d12 - d56, s07 - s34) |
1838 | | // |
1839 | | |
1840 | | rot_x = A2 - A6; |
1841 | | rot_y = A0 - A3; |
1842 | | |
1843 | | dstRowPtr[2] = c6Half * rot_x + c2Half * rot_y; |
1844 | | dstRowPtr[6] = c6Half * rot_y - c2Half * rot_x; |
1845 | | |
1846 | | // |
1847 | |         // K0 and K1 stay live until dst[1], dst[7], |
1848 | |         // dst[3], and dst[5] have all been computed. |
1849 | | // |
1850 | | |
1851 | | K0 = c4 * (A1 - A5); |
1852 | | K1 = -1 * c4 * (A2 + A6); |
1853 | | |
1854 | | // |
1855 | | // Two ways to do a rotation: |
1856 | | // |
1857 | | // rot i (x, y) = |
1858 | | // X = c_i*x + s_i*y |
1859 | | // Y = -s_i*x + c_i*y |
1860 | | // |
1861 | | // OR |
1862 | | // |
1863 | | // X = c_i*(x+y) + (s_i-c_i)*y |
1864 | |         //    Y = c_i*(x+y) - (s_i+c_i)*x |
1865 | | // |
1866 | | // the first case has 4 multiplies, but fewer constants, |
1867 | | // while the 2nd case has fewer multiplies but takes more space. |
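        //
        // Written out, the 3-multiply form shares one product (illustrative
        // only; the code below uses the 4-multiply form):
        //
        //    t = c_i*(x+y);
        //    X = t + (s_i-c_i)*y;
        //    Y = t - (s_i+c_i)*x;
        //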
1868 | | |
1869 | | // |
1870 | | // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 ) |
1871 | | // |
1872 | | |
1873 | | rot_x = A7 - K0; |
1874 | | rot_y = A4 + K1; |
1875 | | |
1876 | | dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y; |
1877 | | dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y; |
1878 | | |
1879 | | // |
1880 | | // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 ) |
1881 | | // |
1882 | | |
1883 | | rot_x = A7 + K0; |
1884 | | rot_y = K1 - A4; |
1885 | | |
1886 | | // |
1887 | |         // A4 and A7 are no longer needed; all A's are now inactive. |
1888 | | // |
1889 | | |
1890 | | dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y; |
1891 | | dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y; |
1892 | | } |
1893 | | |
1894 | | // |
1895 | | // Second pass - do the same, but on the columns |
1896 | | // |
1897 | | |
1898 | | for (int column = 0; column < 8; ++column) |
1899 | | { |
1900 | | |
1901 | | A0 = srcPtr[ column] + srcPtr[56 + column]; |
1902 | | A7 = srcPtr[ column] - srcPtr[56 + column]; |
1903 | | |
1904 | | A1 = srcPtr[ 8 + column] + srcPtr[16 + column]; |
1905 | | A2 = srcPtr[ 8 + column] - srcPtr[16 + column]; |
1906 | | |
1907 | | A3 = srcPtr[24 + column] + srcPtr[32 + column]; |
1908 | | A4 = srcPtr[24 + column] - srcPtr[32 + column]; |
1909 | | |
1910 | | A5 = srcPtr[40 + column] + srcPtr[48 + column]; |
1911 | | A6 = srcPtr[40 + column] - srcPtr[48 + column]; |
1912 | | |
1913 | | K0 = c4 * (A0 + A3); |
1914 | | K1 = c4 * (A1 + A5); |
1915 | | |
1916 | | dstPtr[ column] = .5f * (K0 + K1); |
1917 | | dstPtr[32+column] = .5f * (K0 - K1); |
1918 | | |
1919 | | // |
1920 | | // (2*dst2, 2*dst6) = rot 6 ( d12 - d56, s07 - s34 ) |
1921 | | // |
1922 | | |
1923 | | rot_x = A2 - A6; |
1924 | | rot_y = A0 - A3; |
1925 | | |
1926 | | dstPtr[16+column] = .5f * (c6 * rot_x + c2 * rot_y); |
1927 | | dstPtr[48+column] = .5f * (c6 * rot_y - c2 * rot_x); |
1928 | | |
1929 | | // |
1930 | |         // K0 and K1 stay live until dst[1], dst[7], |
1931 | |         // dst[3], and dst[5] have all been computed. |
1932 | | // |
1933 | | |
1934 | | K0 = c4 * (A1 - A5); |
1935 | | K1 = -1 * c4 * (A2 + A6); |
1936 | | |
1937 | | // |
1938 | | // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 ) |
1939 | | // |
1940 | | |
1941 | | rot_x = A7 - K0; |
1942 | | rot_y = A4 + K1; |
1943 | | |
1944 | | dstPtr[24+column] = .5f * (c3 * rot_x - c5 * rot_y); |
1945 | | dstPtr[40+column] = .5f * (c5 * rot_x + c3 * rot_y); |
1946 | | |
1947 | | // |
1948 | | // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 ) |
1949 | | // |
1950 | | |
1951 | | rot_x = A7 + K0; |
1952 | | rot_y = K1 - A4; |
1953 | | |
1954 | | dstPtr[ 8+column] = .5f * (c1 * rot_x - c7 * rot_y); |
1955 | | dstPtr[56+column] = .5f * (c7 * rot_x + c1 * rot_y); |
1956 | | } |
1957 | | } |
1958 | | |
1959 | | #else /* IMF_HAVE_SSE2 */ |
1960 | | |
1961 | | // |
1962 | | // SSE2 implementation |
1963 | | // |
1964 | | // Here, we're always doing a column-wise operation |
1965 | | // plus transposes. It might be faster to handle the |
1966 | | // row-wise and column-wise passes differently. |
1967 | | // |
1968 | | |
1969 | | void |
1970 | | dctForward8x8 (float *data) |
1971 | 0 | { |
1972 | 0 | __m128 *srcVec = (__m128 *)data; |
1973 | 0 | __m128 a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec; |
1974 | 0 | __m128 k0Vec, k1Vec, rotXVec, rotYVec; |
1975 | 0 | __m128 transTmp[4], transTmp2[4]; |
1976 | |
1977 | 0 | __m128 c4Vec = { .70710678f, .70710678f, .70710678f, .70710678f}; |
1978 | 0 | __m128 c4NegVec = {-.70710678f, -.70710678f, -.70710678f, -.70710678f}; |
1979 | |
1980 | 0 | __m128 c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f}; |
1981 | 0 | __m128 c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f}; |
1982 | 0 | __m128 c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f}; |
1983 | 0 | __m128 c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f}; |
1984 | 0 | __m128 c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f}; |
1985 | 0 | __m128 c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f}; |
1986 | |
1987 | 0 | __m128 halfVec = {.5f, .5f, .5f, .5f}; |
1988 | |
1989 | 0 | for (int iter = 0; iter < 2; ++iter) |
1990 | 0 | { |
1991 | | // |
1992 | | // Operate on 4 columns at a time. The |
1993 | | // offsets into our row-major array are: |
1994 | | // 0: 0 1 |
1995 | | // 1: 2 3 |
1996 | | // 2: 4 5 |
1997 | | // 3: 6 7 |
1998 | | // 4: 8 9 |
1999 | | // 5: 10 11 |
2000 | | // 6: 12 13 |
2001 | | // 7: 14 15 |
2002 | | // |
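        //
        // In other words, srcVec[2*row + pass] holds row `row`,
        // columns 4*pass .. 4*pass + 3 (pass 0 = the left half of the
        // block, pass 1 = the right half).
        //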
2003 | |
2004 | 0 | for (int pass=0; pass<2; ++pass) |
2005 | 0 | { |
2006 | 0 | a0Vec = _mm_add_ps (srcVec[ 0 + pass], srcVec[14 + pass]); |
2007 | 0 | a1Vec = _mm_add_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]); |
2008 | 0 | a3Vec = _mm_add_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]); |
2009 | 0 | a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]); |
2010 | | |
2011 | 0 | a7Vec = _mm_sub_ps (srcVec[ 0 + pass], srcVec[14 + pass]); |
2012 | 0 | a2Vec = _mm_sub_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]); |
2013 | 0 | a4Vec = _mm_sub_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]); |
2014 | 0 | a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]); |
2015 | | |
2016 | | // |
2017 | | // First stage; Compute out_0 and out_4 |
2018 | | // |
2019 | |
2020 | 0 | k0Vec = _mm_add_ps (a0Vec, a3Vec); |
2021 | 0 | k1Vec = _mm_add_ps (a1Vec, a5Vec); |
2022 | |
2023 | 0 | k0Vec = _mm_mul_ps (c4Vec, k0Vec); |
2024 | 0 | k1Vec = _mm_mul_ps (c4Vec, k1Vec); |
2025 | |
2026 | 0 | srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec); |
2027 | 0 | srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec); |
2028 | |
2029 | 0 | srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec ); |
2030 | 0 | srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec ); |
2031 | | |
2032 | | |
2033 | | // |
2034 | | // Second stage; Compute out_2 and out_6 |
2035 | | // |
2036 | | |
2037 | 0 | k0Vec = _mm_sub_ps (a2Vec, a6Vec); |
2038 | 0 | k1Vec = _mm_sub_ps (a0Vec, a3Vec); |
2039 | |
2040 | 0 | srcVec[ 4 + pass] = _mm_add_ps (_mm_mul_ps (c6HalfVec, k0Vec), |
2041 | 0 | _mm_mul_ps (c2HalfVec, k1Vec)); |
2042 | |
2043 | 0 | srcVec[12 + pass] = _mm_sub_ps (_mm_mul_ps (c6HalfVec, k1Vec), |
2044 | 0 | _mm_mul_ps (c2HalfVec, k0Vec)); |
2045 | | |
2046 | | // |
2047 | | // Precompute K0 and K1 for the remaining stages |
2048 | | // |
2049 | |
2050 | 0 | k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec); |
2051 | 0 | k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec); |
2052 | | |
2053 | | // |
2054 | | // Third Stage, compute out_3 and out_5 |
2055 | | // |
2056 | |
2057 | 0 | rotXVec = _mm_sub_ps (a7Vec, k0Vec); |
2058 | 0 | rotYVec = _mm_add_ps (a4Vec, k1Vec); |
2059 | |
2060 | 0 | srcVec[ 6 + pass] = _mm_sub_ps (_mm_mul_ps (c3HalfVec, rotXVec), |
2061 | 0 | _mm_mul_ps (c5HalfVec, rotYVec)); |
2062 | |
2063 | 0 | srcVec[10 + pass] = _mm_add_ps (_mm_mul_ps (c5HalfVec, rotXVec), |
2064 | 0 | _mm_mul_ps (c3HalfVec, rotYVec)); |
2065 | | |
2066 | | // |
2067 | | // Fourth Stage, compute out_1 and out_7 |
2068 | | // |
2069 | |
2070 | 0 | rotXVec = _mm_add_ps (a7Vec, k0Vec); |
2071 | 0 | rotYVec = _mm_sub_ps (k1Vec, a4Vec); |
2072 | |
2073 | 0 | srcVec[ 2 + pass] = _mm_sub_ps (_mm_mul_ps (c1HalfVec, rotXVec), |
2074 | 0 | _mm_mul_ps (c7HalfVec, rotYVec)); |
2075 | |
2076 | 0 | srcVec[14 + pass] = _mm_add_ps (_mm_mul_ps (c7HalfVec, rotXVec), |
2077 | 0 | _mm_mul_ps (c1HalfVec, rotYVec)); |
2078 | 0 | } |
2079 | | |
2080 | | // |
2081 | | // Transpose the matrix, in 4x4 blocks. So, if we have our |
2082 | |         // 8x8 matrix divided into 4x4 blocks: |
2083 | | // |
2084 | | // M0 | M1 M0t | M2t |
2085 | | // ----+--- --> -----+------ |
2086 | | // M2 | M3 M1t | M3t |
2087 | | // |
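        //
        // Each 4x4 sub-block is transposed with the standard two-round
        // _mm_shuffle_ps pattern (masks 0x44/0xEE, then 0x88/0xDD;
        // essentially what _MM_TRANSPOSE4_PS expands to), and the
        // off-diagonal blocks M1 and M2 additionally swap positions.
        //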
2088 | | |
2089 | | // |
2090 | | // M0t, done in place, the first half. |
2091 | | // |
2092 | |
2093 | 0 | transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44); |
2094 | 0 | transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44); |
2095 | 0 | transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE); |
2096 | 0 | transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE); |
2097 | | |
2098 | | // |
2099 | | // M3t, also done in place, the first half. |
2100 | | // |
2101 | |
2102 | 0 | transTmp2[0] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0x44); |
2103 | 0 | transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44); |
2104 | 0 | transTmp2[2] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0xEE); |
2105 | 0 | transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE); |
2106 | | |
2107 | | // |
2108 | | // M0t, the second half. |
2109 | | // |
2110 | |
2111 | 0 | srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88); |
2112 | 0 | srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88); |
2113 | 0 | srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD); |
2114 | 0 | srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD); |
2115 | | |
2116 | | // |
2117 | | // M3t, the second half. |
2118 | | // |
2119 | |
|
2120 | 0 | srcVec[ 9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88); |
2121 | 0 | srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88); |
2122 | 0 | srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD); |
2123 | 0 | srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD); |
2124 | | |
2125 | | // |
2126 | | // M1 and M2 need to be done at the same time, because we're |
2127 | | // swapping. |
2128 | | // |
2129 | | // First, the first half of M1t |
2130 | | // |
2131 | |
2132 | 0 | transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44); |
2133 | 0 | transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44); |
2134 | 0 | transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE); |
2135 | 0 | transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE); |
2136 | | |
2137 | | // |
2138 | | // And the first half of M2t |
2139 | | // |
2140 | |
2141 | 0 | transTmp2[0] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0x44); |
2142 | 0 | transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44); |
2143 | 0 | transTmp2[2] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0xEE); |
2144 | 0 | transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE); |
2145 | | |
2146 | | // |
2147 | | // Second half of M1t |
2148 | | // |
2149 | |
2150 | 0 | srcVec[ 8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88); |
2151 | 0 | srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88); |
2152 | 0 | srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD); |
2153 | 0 | srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD); |
2154 | | |
2155 | | // |
2156 | |         // Second half of M2t |
2157 | | // |
2158 | |
2159 | 0 | srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88); |
2160 | 0 | srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88); |
2161 | 0 | srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD); |
2162 | 0 |             srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD); |
2163 | 0 | } |
2164 | 0 | } |
2165 | | |
2166 | | #endif /* IMF_HAVE_SSE2 */ |
2167 | | |
2168 | | } // anonymous namespace |
2169 | | |
2170 | | OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT |
2171 | | |
2172 | | #endif |