/src/openexr/src/lib/OpenEXR/ImfDwaCompressorSimd.h
Line | Count | Source |
1 | | // |
2 | | // SPDX-License-Identifier: BSD-3-Clause |
3 | | // Copyright (c) Contributors to the OpenEXR Project. |
4 | | // |
5 | | |
6 | | #ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED |
7 | | #define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED |
8 | | |
9 | | // |
10 | | // Various SSE accelerated functions, used by Imf::DwaCompressor. |
11 | | // These have been broken out into a separate .h file, as the fast
12 | | // paths are done with template specialization. |
13 | | // |
14 | | // Unless otherwise noted, all pointers are assumed to be 32-byte |
15 | | // aligned. Unaligned pointers may risk seg-faulting. |
16 | | // |
17 | | |
18 | | #include "ImfNamespace.h" |
19 | | #include "ImfSimd.h" |
20 | | #include "ImfSystemSpecific.h" |
21 | | #include "OpenEXRConfig.h" |
22 | | #include "OpenEXRConfigInternal.h" |
23 | | |
24 | | #include <assert.h> |
25 | | #include <half.h> |
26 | | |
27 | | #include <algorithm> |
28 | | |
29 | | OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER |
30 | | |
31 | 158k | #define _SSE_ALIGNMENT 32 |
32 | 316k | #define _SSE_ALIGNMENT_MASK 0x0F |
33 | | #define _AVX_ALIGNMENT_MASK 0x1F |
34 | | |
35 | | // |
36 | | // A simple 64-element array, aligned properly for SIMD access. |
37 | | // |
38 | | |
39 | | template <class T> class SimdAlignedBuffer64 |
40 | | { |
41 | | public: |
42 | 8.82k | SimdAlignedBuffer64 () : _buffer (0), _handle (0) { alloc (); }
43 | | |
44 | | SimdAlignedBuffer64 (const SimdAlignedBuffer64& rhs) |
45 | | : _buffer (0), _handle (0) |
46 | | { |
47 | | alloc (); |
48 | | memcpy (_buffer, rhs._buffer, 64 * sizeof (T)); |
49 | | } |
50 | | |
51 | | SimdAlignedBuffer64& operator= (const SimdAlignedBuffer64& rhs) |
52 | | { |
53 | | memcpy (_buffer, rhs._buffer, 64 * sizeof (T)); |
54 | | return *this; |
55 | | } |
56 | | |
57 | | #if __cplusplus >= 201103L |
58 | | SimdAlignedBuffer64 (SimdAlignedBuffer64&& rhs) noexcept |
59 | | : _buffer (rhs._buffer), _handle (rhs._handle) |
60 | 0 | { |
61 | 0 | rhs._handle = nullptr; |
62 | 0 | rhs._buffer = nullptr; |
63 | 0 | } |
64 | | |
65 | | SimdAlignedBuffer64& operator= (SimdAlignedBuffer64&& rhs) noexcept |
66 | | { |
67 | | std::swap (_handle, rhs._handle); |
68 | | std::swap (_buffer, rhs._buffer); |
69 | | return *this; |
70 | | } |
71 | | #endif |
72 | | ~SimdAlignedBuffer64 () |
73 | 8.82k | { |
74 | 8.82k | if (_handle) EXRFreeAligned (_handle); |
75 | 8.82k | _handle = 0; |
76 | 8.82k | _buffer = 0; |
77 | 8.82k | }
78 | | |
79 | | void alloc () |
80 | 8.82k | { |
81 | | // |
82 | | // Try EXRAllocAligned first - but it might fallback to |
83 | | // unaligned allocs. If so, overalloc. |
84 | | // |
85 | | |
86 | 8.82k | _handle = (char*) EXRAllocAligned (64 * sizeof (T), _SSE_ALIGNMENT); |
87 | | |
88 | 8.82k | if (((size_t) _handle & (_SSE_ALIGNMENT - 1)) == 0) |
89 | 8.82k | { |
90 | 8.82k | _buffer = (T*) _handle; |
91 | 8.82k | return; |
92 | 8.82k | } |
93 | | |
94 | 0 | EXRFreeAligned (_handle); |
95 | 0 | _handle = (char*) EXRAllocAligned ( |
96 | 0 | 64 * sizeof (T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT); |
97 | |
|
98 | 0 | char* aligned = _handle; |
99 | |
|
100 | 0 | while ((size_t) aligned & (_SSE_ALIGNMENT - 1)) |
101 | 0 | aligned++; |
102 | |
|
103 | 0 | _buffer = (T*) aligned; |
104 | 0 | }
105 | | |
106 | | T* _buffer; |
107 | | |
108 | | private: |
109 | | char* _handle; |
110 | | }; |
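
The byte-stepping loop in alloc () above advances the fallback pointer one byte at a time until it reaches a 32-byte boundary. A minimal equivalent (a stand-alone sketch, not part of the header; alignUp is a hypothetical helper) is the usual round-up-and-mask idiom, which relies on _SSE_ALIGNMENT being a power of two:

    #include <cstdint>

    // Round the raw allocation up to the next _SSE_ALIGNMENT boundary in one
    // step instead of incrementing byte by byte.
    char*
    alignUp (char* handle)
    {
        uintptr_t p = (uintptr_t) handle;
        p = (p + (_SSE_ALIGNMENT - 1)) & ~(uintptr_t) (_SSE_ALIGNMENT - 1);
        return (char*) p;
    }
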
111 | | |
112 | | typedef SimdAlignedBuffer64<float> SimdAlignedBuffer64f; |
113 | | typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us; |
114 | | |
115 | | namespace |
116 | | { |
117 | | |
118 | | // |
119 | | // Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B' |
120 | | // |
121 | | |
122 | | void |
123 | | csc709Inverse (float& comp0, float& comp1, float& comp2) |
124 | 262 | { |
125 | 262 | float src[3]; |
126 | | |
127 | 262 | src[0] = comp0; |
128 | 262 | src[1] = comp1; |
129 | 262 | src[2] = comp2; |
130 | | |
131 | 262 | comp0 = src[0] + 1.5747f * src[2]; |
132 | 262 | comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2]; |
133 | 262 | comp2 = src[0] + 1.8556f * src[1]; |
134 | 262 | } |
135 | | |
136 | | #ifndef IMF_HAVE_SSE2 |
137 | | |
138 | | // |
139 | | // Scalar color space conversion, based on 709 primary chromaticities.
140 | | // No scaling or offsets, just the matrix |
141 | | // |
142 | | |
143 | | void |
144 | | csc709Inverse64 (float* comp0, float* comp1, float* comp2) |
145 | | { |
146 | | for (int i = 0; i < 64; ++i) |
147 | | csc709Inverse (comp0[i], comp1[i], comp2[i]); |
148 | | } |
149 | | |
150 | | #else /* IMF_HAVE_SSE2 */ |
151 | | |
152 | | // |
153 | | // SSE2 color space conversion |
154 | | // |
155 | | |
156 | | void |
157 | | csc709Inverse64 (float* comp0, float* comp1, float* comp2) |
158 | 17.5k | { |
159 | 17.5k | __m128 c0 = {1.5747f, 1.5747f, 1.5747f, 1.5747f}; |
160 | 17.5k | __m128 c1 = {1.8556f, 1.8556f, 1.8556f, 1.8556f}; |
161 | 17.5k | __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f}; |
162 | 17.5k | __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f}; |
163 | | |
164 | 17.5k | __m128* r = (__m128*) comp0; |
165 | 17.5k | __m128* g = (__m128*) comp1; |
166 | 17.5k | __m128* b = (__m128*) comp2; |
167 | 17.5k | __m128 src[3]; |
168 | | |
169 | 17.5k | # define CSC_INVERSE_709_SSE2_LOOP(i) \ |
170 | 281k | src[0] = r[i]; \ |
171 | 281k | src[1] = g[i]; \ |
172 | 281k | src[2] = b[i]; \ |
173 | 281k | \ |
174 | 281k | r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0)); \ |
175 | 281k | \ |
176 | 281k | g[i] = _mm_mul_ps (g[i], c2); \ |
177 | 281k | src[2] = _mm_mul_ps (src[2], c3); \ |
178 | 281k | g[i] = _mm_add_ps (g[i], src[0]); \ |
179 | 281k | g[i] = _mm_add_ps (g[i], src[2]); \ |
180 | 281k | \ |
181 | 281k | b[i] = _mm_mul_ps (c1, src[1]); \ |
182 | 281k | b[i] = _mm_add_ps (b[i], src[0]); |
183 | | |
184 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (0) |
185 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (1) |
186 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (2) |
187 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (3) |
188 | | |
189 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (4) |
190 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (5) |
191 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (6) |
192 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (7) |
193 | | |
194 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (8) |
195 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (9) |
196 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (10) |
197 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (11) |
198 | | |
199 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (12) |
200 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (13) |
201 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (14) |
202 | 17.5k | CSC_INVERSE_709_SSE2_LOOP (15) |
203 | 17.5k | } |
204 | | |
205 | | #endif /* IMF_HAVE_SSE2 */ |
206 | | |
207 | | // |
208 | | // Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr |
209 | | // |
210 | | // Simple FPU color space conversion. Based on the 709 |
211 | | // primary chromaticities, with no scaling or offsets.
212 | | // |
213 | | |
214 | | void |
215 | | csc709Forward64 (float* comp0, float* comp1, float* comp2) |
216 | 0 | { |
217 | 0 | float src[3]; |
218 | |
|
219 | 0 | for (int i = 0; i < 64; ++i) |
220 | 0 | { |
221 | 0 | src[0] = comp0[i]; |
222 | 0 | src[1] = comp1[i]; |
223 | 0 | src[2] = comp2[i]; |
224 | |
|
225 | 0 | comp0[i] = 0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2]; |
226 | 0 | comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2]; |
227 | 0 | comp2[i] = 0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2]; |
228 | 0 | } |
229 | 0 | } |
230 | | |
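
As a quick sanity check (a stand-alone sketch, not part of this header), composing the forward matrix from csc709Forward64 with the inverse coefficients used by csc709Inverse should give something very close to the identity; the small residuals come from the rounding of the published BT.709 coefficients:

    #include <cstdio>

    int
    main ()
    {
        // Forward (R'G'B' -> Y'CbCr) and inverse (Y'CbCr -> R'G'B') matrices,
        // copied from csc709Forward64 / csc709Inverse above.
        const double fwd[3][3] = {
            { 0.2126,  0.7152,  0.0722},
            {-0.1146, -0.3854,  0.5000},
            { 0.5000, -0.4542, -0.0458}};
        const double inv[3][3] = {
            {1.0,  0.0000,  1.5747},
            {1.0, -0.1873, -0.4682},
            {1.0,  1.8556,  0.0000}};

        for (int r = 0; r < 3; ++r)
        {
            for (int c = 0; c < 3; ++c)
            {
                double sum = 0;
                for (int k = 0; k < 3; ++k)
                    sum += inv[r][k] * fwd[k][c];
                printf ("%8.4f ", sum); // expect ~1 on the diagonal, ~0 elsewhere
            }
            printf ("\n");
        }
        return 0;
    }
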
231 | | // |
232 | | // Byte interleaving of 2 byte arrays: |
233 | | // src0 = AAAA |
234 | | // src1 = BBBB |
235 | | // dst = ABABABAB |
236 | | // |
237 | | // numBytes is the size of each of the source buffers |
238 | | // |
239 | | |
240 | | #ifndef IMF_HAVE_SSE2 |
241 | | |
242 | | // |
243 | | // Scalar default implementation |
244 | | // |
245 | | |
246 | | void |
247 | | interleaveByte2 (char* dst, char* src0, char* src1, int numBytes) |
248 | | { |
249 | | for (int x = 0; x < numBytes; ++x) |
250 | | { |
251 | | dst[2 * x] = src0[x]; |
252 | | dst[2 * x + 1] = src1[x]; |
253 | | } |
254 | | } |
255 | | |
256 | | #else /* IMF_HAVE_SSE2 */ |
257 | | |
258 | | // |
259 | | // SSE2 byte interleaving |
260 | | // |
261 | | |
262 | | void |
263 | | interleaveByte2 (char* dst, char* src0, char* src1, int numBytes) |
264 | 17.7k | { |
265 | 17.7k | int dstAlignment = (size_t) dst % 16; |
266 | 17.7k | int src0Alignment = (size_t) src0 % 16; |
267 | 17.7k | int src1Alignment = (size_t) src1 % 16; |
268 | | |
269 | 17.7k | __m128i* dst_epi8 = (__m128i*) dst; |
270 | 17.7k | __m128i* src0_epi8 = (__m128i*) src0; |
271 | 17.7k | __m128i* src1_epi8 = (__m128i*) src1; |
272 | 17.7k | int sseWidth = numBytes / 16; |
273 | | |
274 | 17.7k | if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment)) |
275 | 3.49k | { |
276 | 3.49k | __m128i tmp0, tmp1; |
277 | | |
278 | | // |
279 | | // Aligned loads and stores |
280 | | // |
281 | | |
282 | 75.1k | for (int x = 0; x < sseWidth; ++x) |
283 | 71.6k | { |
284 | 71.6k | tmp0 = src0_epi8[x]; |
285 | 71.6k | tmp1 = src1_epi8[x]; |
286 | | |
287 | 71.6k | _mm_stream_si128 (&dst_epi8[2 * x], _mm_unpacklo_epi8 (tmp0, tmp1)); |
288 | | |
289 | 71.6k | _mm_stream_si128 ( |
290 | 71.6k | &dst_epi8[2 * x + 1], _mm_unpackhi_epi8 (tmp0, tmp1)); |
291 | 71.6k | } |
292 | | |
293 | | // |
294 | | // Then run the leftovers one at a time
295 | | // |
296 | | |
297 | 13.5k | for (int x = 16 * sseWidth; x < numBytes; ++x) |
298 | 10.0k | { |
299 | 10.0k | dst[2 * x] = src0[x]; |
300 | 10.0k | dst[2 * x + 1] = src1[x]; |
301 | 10.0k | } |
302 | 3.49k | } |
303 | 14.2k | else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8)) |
304 | 1.50k | { |
305 | | // |
306 | | // Aligned stores, but catch up a few values so we can |
307 | | // use aligned loads |
308 | | // |
309 | | |
310 | 10.9k | for (int x = 0; x < std::min (numBytes, 8); ++x) |
311 | 9.49k | { |
312 | 9.49k | dst[2 * x] = src0[x]; |
313 | 9.49k | dst[2 * x + 1] = src1[x]; |
314 | 9.49k | } |
315 | | |
316 | 1.50k | if (numBytes > 8) |
317 | 734 | { |
318 | 734 | dst_epi8 = (__m128i*) &dst[16]; |
319 | 734 | src0_epi8 = (__m128i*) &src0[8]; |
320 | 734 | src1_epi8 = (__m128i*) &src1[8]; |
321 | 734 | sseWidth = (numBytes - 8) / 16; |
322 | | |
323 | 42.4k | for (int x = 0; x < sseWidth; ++x) |
324 | 41.7k | { |
325 | 41.7k | _mm_stream_si128 ( |
326 | 41.7k | &dst_epi8[2 * x], |
327 | 41.7k | _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x])); |
328 | | |
329 | 41.7k | _mm_stream_si128 ( |
330 | 41.7k | &dst_epi8[2 * x + 1], |
331 | 41.7k | _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x])); |
332 | 41.7k | } |
333 | | |
334 | | // |
335 | | // Then run the leftovers one at a time
336 | | // |
337 | | |
338 | 4.26k | for (int x = 16 * sseWidth + 8; x < numBytes; ++x) |
339 | 3.53k | { |
340 | 3.53k | dst[2 * x] = src0[x]; |
341 | 3.53k | dst[2 * x + 1] = src1[x]; |
342 | 3.53k | } |
343 | 734 | } |
344 | 1.50k | } |
345 | 12.7k | else |
346 | 12.7k | { |
347 | | // |
348 | | // Unaligned everything |
349 | | // |
350 | | |
351 | 344k | for (int x = 0; x < sseWidth; ++x) |
352 | 332k | { |
353 | 332k | __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]); |
354 | 332k | __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]); |
355 | | |
356 | 332k | _mm_storeu_si128 ( |
357 | 332k | &dst_epi8[2 * x], |
358 | 332k | _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8)); |
359 | | |
360 | 332k | _mm_storeu_si128 ( |
361 | 332k | &dst_epi8[2 * x + 1], |
362 | 332k | _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8)); |
363 | 332k | } |
364 | | |
365 | | // |
366 | | // Then run the leftovers one at a time
367 | | // |
368 | | |
369 | 93.3k | for (int x = 16 * sseWidth; x < numBytes; ++x) |
370 | 80.5k | { |
371 | 80.5k | dst[2 * x] = src0[x]; |
372 | 80.5k | dst[2 * x + 1] = src1[x]; |
373 | 80.5k | } |
374 | 12.7k | } |
375 | 17.7k | } |
376 | | |
377 | | #endif /* IMF_HAVE_SSE2 */ |
378 | | |
379 | | // |
380 | | // Float -> half float conversion |
381 | | // |
382 | | // To enable F16C based conversion, we can't rely on compile-time |
383 | | // detection, hence the multiple defined versions. Pick one based |
384 | | // on runtime cpuid detection. |
385 | | // |
386 | | |
387 | | // |
388 | | // Default boring conversion |
389 | | // |
390 | | |
391 | | void |
392 | | convertFloatToHalf64_scalar (unsigned short* dst, float* src) |
393 | 0 | { |
394 | 0 | for (int i = 0; i < 64; ++i) |
395 | 0 | dst[i] = ((half) src[i]).bits (); |
396 | 0 | } |
397 | | |
398 | | #ifdef IMF_HAVE_NEON_AARCH64 |
399 | | |
400 | | void |
401 | | convertFloatToHalf64_neon (unsigned short* dst, float* src) |
402 | | { |
403 | | for (int i = 0; i < 64; i += 8) |
404 | | { |
405 | | float32x4x2_t vec_fp32 = vld1q_f32_x2 (src + i); |
406 | | vst1q_u16 ( |
407 | | dst + i, |
408 | | vcombine_u16 ( |
409 | | vreinterpret_u16_f16 (vcvt_f16_f32 (vec_fp32.val[0])), |
410 | | vreinterpret_u16_f16 (vcvt_f16_f32 (vec_fp32.val[1])))); |
411 | | } |
412 | | } |
413 | | #endif |
414 | | |
415 | | // |
416 | | // F16C conversion - Assumes aligned src and dst |
417 | | // |
418 | | |
419 | | void |
420 | | convertFloatToHalf64_f16c (unsigned short* dst, float* src) |
421 | 227k | { |
422 | | // |
423 | | // Ordinarily, I'd avoid using inline asm and prefer intrinsics. |
424 | | // However, in order to get the intrinsics, we need to tell |
425 | | // the compiler to generate VEX instructions. |
426 | | // |
427 | | // (On the GCC side, -mf16c goes ahead and activates -mavx,
428 | | // resulting in VEX code. Without -mf16c, no intrinsics..) |
429 | | // |
430 | | // Now, it's quite likely that we'll find ourselves in situations |
431 | | // where we want to build *without* VEX, in order to maintain |
432 | | // maximum compatibility. But to get there with intrinsics, |
433 | | // we'd need to break out code into a separate file. Bleh. |
434 | | // I'll take the asm. |
435 | | // |
436 | | |
437 | 227k | #if defined IMF_HAVE_GCC_INLINEASM_X86 |
438 | 227k | __asm__ ("vmovaps (%0), %%ymm0 \n" |
439 | 227k | "vmovaps 0x20(%0), %%ymm1 \n" |
440 | 227k | "vmovaps 0x40(%0), %%ymm2 \n" |
441 | 227k | "vmovaps 0x60(%0), %%ymm3 \n" |
442 | 227k | "vcvtps2ph $0, %%ymm0, %%xmm0 \n" |
443 | 227k | "vcvtps2ph $0, %%ymm1, %%xmm1 \n" |
444 | 227k | "vcvtps2ph $0, %%ymm2, %%xmm2 \n" |
445 | 227k | "vcvtps2ph $0, %%ymm3, %%xmm3 \n" |
446 | 227k | "vmovdqa %%xmm0, 0x00(%1) \n" |
447 | 227k | "vmovdqa %%xmm1, 0x10(%1) \n" |
448 | 227k | "vmovdqa %%xmm2, 0x20(%1) \n" |
449 | 227k | "vmovdqa %%xmm3, 0x30(%1) \n" |
450 | 227k | "vmovaps 0x80(%0), %%ymm0 \n" |
451 | 227k | "vmovaps 0xa0(%0), %%ymm1 \n" |
452 | 227k | "vmovaps 0xc0(%0), %%ymm2 \n" |
453 | 227k | "vmovaps 0xe0(%0), %%ymm3 \n" |
454 | 227k | "vcvtps2ph $0, %%ymm0, %%xmm0 \n" |
455 | 227k | "vcvtps2ph $0, %%ymm1, %%xmm1 \n" |
456 | 227k | "vcvtps2ph $0, %%ymm2, %%xmm2 \n" |
457 | 227k | "vcvtps2ph $0, %%ymm3, %%xmm3 \n" |
458 | 227k | "vmovdqa %%xmm0, 0x40(%1) \n" |
459 | 227k | "vmovdqa %%xmm1, 0x50(%1) \n" |
460 | 227k | "vmovdqa %%xmm2, 0x60(%1) \n" |
461 | 227k | "vmovdqa %%xmm3, 0x70(%1) \n" |
462 | 227k | # ifndef __AVX__ |
463 | 227k | "vzeroupper \n" |
464 | 227k | # endif /* __AVX__ */ |
465 | 227k | : /* Output */ |
466 | 227k | : /* Input */ "r"(src), "r"(dst) |
467 | 227k | # ifndef __AVX__ |
468 | 227k | : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory" |
469 | | # else |
470 | | : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory" |
471 | | # endif /* __AVX__ */ |
472 | 227k | ); |
473 | | #else |
474 | | convertFloatToHalf64_scalar (dst, src); |
475 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86 */ |
476 | 227k | } |
477 | | |
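
The selection between these variants happens elsewhere in the library, which caches a function pointer after probing the CPU once (and picks the NEON variant on AArch64 builds). A simplified, x86-only sketch of that kind of dispatch (chooseFloatToHalf and FloatToHalfFunc are hypothetical names; a production check would also verify OS ymm-state support via XGETBV):

    #include <cpuid.h>

    typedef void (*FloatToHalfFunc) (unsigned short* dst, float* src);

    // Probe CPUID leaf 1: ECX bit 28 is AVX, bit 29 is F16C. Fall back to
    // the scalar conversion when either is missing.
    FloatToHalfFunc
    chooseFloatToHalf ()
    {
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) &&
            (ecx & (1u << 28)) && (ecx & (1u << 29)))
        {
            return convertFloatToHalf64_f16c;
        }

        return convertFloatToHalf64_scalar;
    }
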
478 | | // |
479 | | // Convert an 8x8 block of HALF from zig-zag order to |
480 | | // FLOAT in normal order. The order we want is: |
481 | | // |
482 | | // src dst |
483 | | // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28 |
484 | | // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42 |
485 | | // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43 |
486 | | // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53 |
487 | | // 32 33 34 35 36 37 38 39 10 19 23 32 39 45 52 54 |
488 | | // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60 |
489 | | // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61 |
490 | | // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63 |
491 | | // |
492 | | |
493 | | void |
494 | | fromHalfZigZag_scalar (unsigned short* src, float* dst) |
495 | 0 | { |
496 | 0 | half* srcHalf = (half*) src; |
497 | |
|
498 | 0 | dst[0] = (float) srcHalf[0]; |
499 | 0 | dst[1] = (float) srcHalf[1]; |
500 | 0 | dst[2] = (float) srcHalf[5]; |
501 | 0 | dst[3] = (float) srcHalf[6]; |
502 | 0 | dst[4] = (float) srcHalf[14]; |
503 | 0 | dst[5] = (float) srcHalf[15]; |
504 | 0 | dst[6] = (float) srcHalf[27]; |
505 | 0 | dst[7] = (float) srcHalf[28]; |
506 | 0 | dst[8] = (float) srcHalf[2]; |
507 | 0 | dst[9] = (float) srcHalf[4]; |
508 | |
|
509 | 0 | dst[10] = (float) srcHalf[7]; |
510 | 0 | dst[11] = (float) srcHalf[13]; |
511 | 0 | dst[12] = (float) srcHalf[16]; |
512 | 0 | dst[13] = (float) srcHalf[26]; |
513 | 0 | dst[14] = (float) srcHalf[29]; |
514 | 0 | dst[15] = (float) srcHalf[42]; |
515 | 0 | dst[16] = (float) srcHalf[3]; |
516 | 0 | dst[17] = (float) srcHalf[8]; |
517 | 0 | dst[18] = (float) srcHalf[12]; |
518 | 0 | dst[19] = (float) srcHalf[17]; |
519 | |
|
520 | 0 | dst[20] = (float) srcHalf[25]; |
521 | 0 | dst[21] = (float) srcHalf[30]; |
522 | 0 | dst[22] = (float) srcHalf[41]; |
523 | 0 | dst[23] = (float) srcHalf[43]; |
524 | 0 | dst[24] = (float) srcHalf[9]; |
525 | 0 | dst[25] = (float) srcHalf[11]; |
526 | 0 | dst[26] = (float) srcHalf[18]; |
527 | 0 | dst[27] = (float) srcHalf[24]; |
528 | 0 | dst[28] = (float) srcHalf[31]; |
529 | 0 | dst[29] = (float) srcHalf[40]; |
530 | |
|
531 | 0 | dst[30] = (float) srcHalf[44]; |
532 | 0 | dst[31] = (float) srcHalf[53]; |
533 | 0 | dst[32] = (float) srcHalf[10]; |
534 | 0 | dst[33] = (float) srcHalf[19]; |
535 | 0 | dst[34] = (float) srcHalf[23]; |
536 | 0 | dst[35] = (float) srcHalf[32]; |
537 | 0 | dst[36] = (float) srcHalf[39]; |
538 | 0 | dst[37] = (float) srcHalf[45]; |
539 | 0 | dst[38] = (float) srcHalf[52]; |
540 | 0 | dst[39] = (float) srcHalf[54]; |
541 | |
|
542 | 0 | dst[40] = (float) srcHalf[20]; |
543 | 0 | dst[41] = (float) srcHalf[22]; |
544 | 0 | dst[42] = (float) srcHalf[33]; |
545 | 0 | dst[43] = (float) srcHalf[38]; |
546 | 0 | dst[44] = (float) srcHalf[46]; |
547 | 0 | dst[45] = (float) srcHalf[51]; |
548 | 0 | dst[46] = (float) srcHalf[55]; |
549 | 0 | dst[47] = (float) srcHalf[60]; |
550 | 0 | dst[48] = (float) srcHalf[21]; |
551 | 0 | dst[49] = (float) srcHalf[34]; |
552 | |
|
553 | 0 | dst[50] = (float) srcHalf[37]; |
554 | 0 | dst[51] = (float) srcHalf[47]; |
555 | 0 | dst[52] = (float) srcHalf[50]; |
556 | 0 | dst[53] = (float) srcHalf[56]; |
557 | 0 | dst[54] = (float) srcHalf[59]; |
558 | 0 | dst[55] = (float) srcHalf[61]; |
559 | 0 | dst[56] = (float) srcHalf[35]; |
560 | 0 | dst[57] = (float) srcHalf[36]; |
561 | 0 | dst[58] = (float) srcHalf[48]; |
562 | 0 | dst[59] = (float) srcHalf[49]; |
563 | |
|
564 | 0 | dst[60] = (float) srcHalf[57]; |
565 | 0 | dst[61] = (float) srcHalf[58]; |
566 | 0 | dst[62] = (float) srcHalf[62]; |
567 | 0 | dst[63] = (float) srcHalf[63]; |
568 | 0 | } |
569 | | |
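
The index table above is the standard 8x8 zig-zag scan written out by hand. A small stand-alone sketch (illustration only, not part of the library; checkZigZagTable is a hypothetical helper) that regenerates it from the diagonal traversal and spot-checks it against the values used in fromHalfZigZag_scalar:

    #include <cassert>

    void
    checkZigZagTable ()
    {
        // zigzag[r * 8 + c] = position of natural element (r, c) in the
        // zig-zag stream, i.e. the same indices hard-coded above.
        int zigzag[64];
        int scan = 0;

        for (int s = 0; s < 15; ++s)           // anti-diagonals, r + c == s
        {
            for (int i = 0; i <= s; ++i)
            {
                // Even diagonals run bottom-left -> top-right, odd diagonals
                // the other way, exactly like the classic JPEG ordering.
                int r = (s % 2 == 0) ? s - i : i;
                int c = s - r;

                if (r < 8 && c < 8) zigzag[r * 8 + c] = scan++;
            }
        }

        // Spot checks against the hand-written table used above.
        assert (zigzag[0] == 0 && zigzag[2] == 5 && zigzag[7] == 28);
        assert (zigzag[8] == 2 && zigzag[35] == 32 && zigzag[63] == 63);
    }
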
570 | | // |
571 | | // If we can form the correct ordering in xmm registers, |
572 | | // we can use F16C to convert from HALF -> FLOAT. However, |
573 | | // making the correct order isn't trivial. |
574 | | // |
575 | | // We want to re-order a source 8x8 matrix from: |
576 | | // |
577 | | // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28 |
578 | | // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42 |
579 | | // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43 |
580 | | // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53 (A) |
581 | | // 32 33 34 35 36 37 38 39 --> 10 19 23 32 39 45 52 54 |
582 | | // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60 |
583 | | // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61 |
584 | | // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63 |
585 | | // |
586 | | // Which looks like a mess, right? |
587 | | // |
588 | | // Now, check out the NE/SW diagonals of (A). Along those lines, |
589 | | // we have runs of contiguous values! If we rewrite (A) a bit, we get: |
590 | | // |
591 | | // 0 |
592 | | // 1 2 |
593 | | // 5 4 3 |
594 | | // 6 7 8 9 |
595 | | // 14 13 12 11 10 |
596 | | // 15 16 17 18 19 20 |
597 | | // 27 26 25 24 23 22 21 (B) |
598 | | // 28 29 30 31 32 33 34 35 |
599 | | // 42 41 40 39 38 37 36 |
600 | | // 43 44 45 46 47 48 |
601 | | // 53 52 51 50 49 |
602 | | // 54 55 56 57 |
603 | | // 60 59 58 |
604 | | // 61 62 |
605 | | // 63 |
606 | | // |
607 | | // In this ordering, the columns are the rows of (A). If we can 'transpose'
608 | | // (B), we'll achieve our goal. But we want this to fit nicely into |
609 | | // xmm registers and still be able to load large runs efficiently. |
610 | | // Also, notice that the odd rows are in ascending order, while |
611 | | // the even rows are in descending order. |
612 | | // |
613 | | // If we 'fold' the bottom half up into the top, we can preserve ordered |
614 | | // runs across rows, and still keep all the correct values in columns. |
615 | | // After transposing, we'll need to rotate things back into place. |
616 | | // This gives us: |
617 | | // |
618 | | // 0 | 42 41 40 39 38 37 36 |
619 | | // 1 2 | 43 44 45 46 47 48 |
620 | | // 5 4 3 | 53 52 51 50 49 |
621 | | // 6 7 8 9 | 54 55 56 57 (C) |
622 | | // 14 13 12 11 10 | 60 59 58 |
623 | | // 15 16 17 18 19 20 | 61 62 |
624 | | // 27 26 25 24 23 22 21 | 63
625 | | // 28 29 30 31 32 33 34 35 |
626 | | // |
627 | | // But hang on. We still have the backwards descending rows to deal with. |
628 | | // Let's reverse the even rows so that all values are in ascending order
629 | | // |
630 | | // 36 37 38 39 40 41 42 | 0 |
631 | | // 1 2 | 43 44 45 46 47 48 |
632 | | // 49 50 51 52 53 | 3 4 5 |
633 | | // 6 7 8 9 | 54 55 56 57 (D) |
634 | | // 58 59 60 | 10 11 12 13 14 |
635 | | // 15 16 17 18 19 20 | 61 62 |
636 | | // 63 | 21 22 23 24 25 26 27
637 | | // 28 29 30 31 32 33 34 35 |
638 | | // |
639 | | // If we can form (D), we will then: |
640 | | // 1) Reverse the even rows |
641 | | // 2) Transpose |
642 | | // 3) Rotate the rows |
643 | | // |
644 | | // and we'll have (A). |
645 | | // |
646 | | |
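
To make the three steps concrete, here is a small stand-alone sketch (illustration only; checkZigZagShuffle is a hypothetical helper) that starts from layout (D), reverses the even rows, transposes, rotates row i left by i positions, and checks that the result is the natural-order table (A):

    #include <algorithm>
    #include <cassert>

    void
    checkZigZagShuffle ()
    {
        // Index layout (D) from the comment above.
        int d[8][8] = {
            {36, 37, 38, 39, 40, 41, 42,  0},
            { 1,  2, 43, 44, 45, 46, 47, 48},
            {49, 50, 51, 52, 53,  3,  4,  5},
            { 6,  7,  8,  9, 54, 55, 56, 57},
            {58, 59, 60, 10, 11, 12, 13, 14},
            {15, 16, 17, 18, 19, 20, 61, 62},
            {63, 21, 22, 23, 24, 25, 26, 27},
            {28, 29, 30, 31, 32, 33, 34, 35}};

        // Expected result (A): the zig-zag position of each natural element,
        // the same table fromHalfZigZag_scalar reads from.
        int a[8][8] = {
            { 0,  1,  5,  6, 14, 15, 27, 28},
            { 2,  4,  7, 13, 16, 26, 29, 42},
            { 3,  8, 12, 17, 25, 30, 41, 43},
            { 9, 11, 18, 24, 31, 40, 44, 53},
            {10, 19, 23, 32, 39, 45, 52, 54},
            {20, 22, 33, 38, 46, 51, 55, 60},
            {21, 34, 37, 47, 50, 56, 59, 61},
            {35, 36, 48, 49, 57, 58, 62, 63}};

        int tmp[8][8];

        // 1) Reverse the even rows (this recovers layout (C)).
        for (int r = 0; r < 8; r += 2)
            for (int c = 0; c < 4; ++c)
                std::swap (d[r][c], d[r][7 - c]);

        // 2) Transpose.
        for (int r = 0; r < 8; ++r)
            for (int c = 0; c < 8; ++c)
                tmp[c][r] = d[r][c];

        // 3) Rotate row r left by r positions and compare against (A).
        for (int r = 0; r < 8; ++r)
            for (int c = 0; c < 8; ++c)
                assert (tmp[r][(c + r) % 8] == a[r][c]);
    }
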
647 | | void |
648 | | fromHalfZigZag_f16c (unsigned short* src, float* dst) |
649 | 226k | { |
650 | 226k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 |
651 | 226k | __asm__ |
652 | | |
653 | | /* x3 <- 0 |
654 | | * x8 <- [ 0- 7] |
655 | | * x6 <- [56-63] |
656 | | * x9 <- [21-28] |
657 | | * x7 <- [28-35] |
658 | | * x3 <- [ 6- 9] (lower half) */ |
659 | | |
660 | 226k | ("vpxor %%xmm3, %%xmm3, %%xmm3 \n" |
661 | 226k | "vmovdqa (%0), %%xmm8 \n" |
662 | 226k | "vmovdqa 112(%0), %%xmm6 \n" |
663 | 226k | "vmovdqu 42(%0), %%xmm9 \n" |
664 | 226k | "vmovdqu 56(%0), %%xmm7 \n" |
665 | 226k | "vmovq 12(%0), %%xmm3 \n" |
666 | | |
667 | | /* Setup rows 0-2 of A in xmm0-xmm2 |
668 | | * x1 <- x8 >> 16 (1 value) |
669 | | * x2 <- x8 << 32 (2 values) |
670 | | * x0 <- alignr([35-42], x8, 2) |
671 | | * x1 <- blend(x1, [41-48]) |
672 | | * x2 <- blend(x2, [49-56]) */ |
673 | | |
674 | 226k | "vpsrldq $2, %%xmm8, %%xmm1 \n" |
675 | 226k | "vpslldq $4, %%xmm8, %%xmm2 \n" |
676 | 226k | "vpalignr $2, 70(%0), %%xmm8, %%xmm0 \n" |
677 | 226k | "vpblendw $0xfc, 82(%0), %%xmm1, %%xmm1 \n" |
678 | 226k | "vpblendw $0x1f, 98(%0), %%xmm2, %%xmm2 \n" |
679 | | |
680 | | /* Setup rows 4-6 of A in xmm4-xmm6 |
681 | | * x4 <- x6 >> 32 (2 values) |
682 | | * x5 <- x6 << 16 (1 value) |
683 | | * x6 <- alignr(x6,x9,14) |
684 | | * x4 <- blend(x4, [ 7-14]) |
685 | | * x5 <- blend(x5, [15-22]) */ |
686 | | |
687 | 226k | "vpsrldq $4, %%xmm6, %%xmm4 \n" |
688 | 226k | "vpslldq $2, %%xmm6, %%xmm5 \n" |
689 | 226k | "vpalignr $14, %%xmm6, %%xmm9, %%xmm6 \n" |
690 | 226k | "vpblendw $0xf8, 14(%0), %%xmm4, %%xmm4 \n" |
691 | 226k | "vpblendw $0x3f, 30(%0), %%xmm5, %%xmm5 \n" |
692 | | |
693 | | /* Load the upper half of row 3 into xmm3 |
694 | | * x3 <- [54-57] (upper half) */ |
695 | | |
696 | 226k | "vpinsrq $1, 108(%0), %%xmm3, %%xmm3\n" |
697 | | |
698 | | /* Reverse the even rows. We're not using PSHUFB as |
699 | | * that requires loading an extra constant all the time, |
700 | | * and we're already pretty memory bound. |
701 | | */ |
702 | | |
703 | 226k | "vpshuflw $0x1b, %%xmm0, %%xmm0 \n" |
704 | 226k | "vpshuflw $0x1b, %%xmm2, %%xmm2 \n" |
705 | 226k | "vpshuflw $0x1b, %%xmm4, %%xmm4 \n" |
706 | 226k | "vpshuflw $0x1b, %%xmm6, %%xmm6 \n" |
707 | | |
708 | 226k | "vpshufhw $0x1b, %%xmm0, %%xmm0 \n" |
709 | 226k | "vpshufhw $0x1b, %%xmm2, %%xmm2 \n" |
710 | 226k | "vpshufhw $0x1b, %%xmm4, %%xmm4 \n" |
711 | 226k | "vpshufhw $0x1b, %%xmm6, %%xmm6 \n" |
712 | | |
713 | 226k | "vpshufd $0x4e, %%xmm0, %%xmm0 \n" |
714 | 226k | "vpshufd $0x4e, %%xmm2, %%xmm2 \n" |
715 | 226k | "vpshufd $0x4e, %%xmm4, %%xmm4 \n" |
716 | 226k | "vpshufd $0x4e, %%xmm6, %%xmm6 \n" |
717 | | |
718 | | /* Transpose xmm0-xmm7 into xmm8-xmm15 */ |
719 | | |
720 | 226k | "vpunpcklwd %%xmm1, %%xmm0, %%xmm8 \n" |
721 | 226k | "vpunpcklwd %%xmm3, %%xmm2, %%xmm9 \n" |
722 | 226k | "vpunpcklwd %%xmm5, %%xmm4, %%xmm10 \n" |
723 | 226k | "vpunpcklwd %%xmm7, %%xmm6, %%xmm11 \n" |
724 | 226k | "vpunpckhwd %%xmm1, %%xmm0, %%xmm12 \n" |
725 | 226k | "vpunpckhwd %%xmm3, %%xmm2, %%xmm13 \n" |
726 | 226k | "vpunpckhwd %%xmm5, %%xmm4, %%xmm14 \n" |
727 | 226k | "vpunpckhwd %%xmm7, %%xmm6, %%xmm15 \n" |
728 | | |
729 | 226k | "vpunpckldq %%xmm9, %%xmm8, %%xmm0 \n" |
730 | 226k | "vpunpckldq %%xmm11, %%xmm10, %%xmm1 \n" |
731 | 226k | "vpunpckhdq %%xmm9, %%xmm8, %%xmm2 \n" |
732 | 226k | "vpunpckhdq %%xmm11, %%xmm10, %%xmm3 \n" |
733 | 226k | "vpunpckldq %%xmm13, %%xmm12, %%xmm4 \n" |
734 | 226k | "vpunpckldq %%xmm15, %%xmm14, %%xmm5 \n" |
735 | 226k | "vpunpckhdq %%xmm13, %%xmm12, %%xmm6 \n" |
736 | 226k | "vpunpckhdq %%xmm15, %%xmm14, %%xmm7 \n" |
737 | | |
738 | 226k | "vpunpcklqdq %%xmm1, %%xmm0, %%xmm8 \n" |
739 | 226k | "vpunpckhqdq %%xmm1, %%xmm0, %%xmm9 \n" |
740 | 226k | "vpunpcklqdq %%xmm3, %%xmm2, %%xmm10 \n" |
741 | 226k | "vpunpckhqdq %%xmm3, %%xmm2, %%xmm11 \n" |
742 | 226k | "vpunpcklqdq %%xmm4, %%xmm5, %%xmm12 \n" |
743 | 226k | "vpunpckhqdq %%xmm5, %%xmm4, %%xmm13 \n" |
744 | 226k | "vpunpcklqdq %%xmm7, %%xmm6, %%xmm14 \n" |
745 | 226k | "vpunpckhqdq %%xmm7, %%xmm6, %%xmm15 \n" |
746 | | |
747 | | /* Rotate the rows to get the correct final order. |
748 | | * Rotating xmm12 isn't needed, as we can handle |
749 | | * the rotation in the PUNPCKLQDQ above. Rotating |
750 | | * xmm8 isn't needed as it's already in the right order |
751 | | */ |
752 | | |
753 | 226k | "vpalignr $2, %%xmm9, %%xmm9, %%xmm9 \n" |
754 | 226k | "vpalignr $4, %%xmm10, %%xmm10, %%xmm10 \n" |
755 | 226k | "vpalignr $6, %%xmm11, %%xmm11, %%xmm11 \n" |
756 | 226k | "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n" |
757 | 226k | "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n" |
758 | 226k | "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n" |
759 | | |
760 | | /* Convert from half -> float */ |
761 | | |
762 | 226k | "vcvtph2ps %%xmm8, %%ymm8 \n" |
763 | 226k | "vcvtph2ps %%xmm9, %%ymm9 \n" |
764 | 226k | "vcvtph2ps %%xmm10, %%ymm10 \n" |
765 | 226k | "vcvtph2ps %%xmm11, %%ymm11 \n" |
766 | 226k | "vcvtph2ps %%xmm12, %%ymm12 \n" |
767 | 226k | "vcvtph2ps %%xmm13, %%ymm13 \n" |
768 | 226k | "vcvtph2ps %%xmm14, %%ymm14 \n" |
769 | 226k | "vcvtph2ps %%xmm15, %%ymm15 \n" |
770 | | |
771 | | /* Move float values to dst */ |
772 | | |
773 | 226k | "vmovaps %%ymm8, (%1) \n" |
774 | 226k | "vmovaps %%ymm9, 32(%1) \n" |
775 | 226k | "vmovaps %%ymm10, 64(%1) \n" |
776 | 226k | "vmovaps %%ymm11, 96(%1) \n" |
777 | 226k | "vmovaps %%ymm12, 128(%1) \n" |
778 | 226k | "vmovaps %%ymm13, 160(%1) \n" |
779 | 226k | "vmovaps %%ymm14, 192(%1) \n" |
780 | 226k | "vmovaps %%ymm15, 224(%1) \n" |
781 | 226k | # ifndef __AVX__ |
782 | 226k | "vzeroupper \n" |
783 | 226k | # endif /* __AVX__ */ |
784 | 226k | : /* Output */ |
785 | 226k | : /* Input */ "r"(src), "r"(dst) |
786 | 226k | : /* Clobber */ "memory", |
787 | 226k | # ifndef __AVX__ |
788 | 226k | "%xmm0", |
789 | 226k | "%xmm1", |
790 | 226k | "%xmm2", |
791 | 226k | "%xmm3", |
792 | 226k | "%xmm4", |
793 | 226k | "%xmm5", |
794 | 226k | "%xmm6", |
795 | 226k | "%xmm7", |
796 | 226k | "%xmm8", |
797 | 226k | "%xmm9", |
798 | 226k | "%xmm10", |
799 | 226k | "%xmm11", |
800 | 226k | "%xmm12", |
801 | 226k | "%xmm13", |
802 | 226k | "%xmm14", |
803 | 226k | "%xmm15" |
804 | | # else |
805 | | "%ymm0", |
806 | | "%ymm1", |
807 | | "%ymm2", |
808 | | "%ymm3", |
809 | | "%ymm4", |
810 | | "%ymm5", |
811 | | "%ymm6", |
812 | | "%ymm7", |
813 | | "%ymm8", |
814 | | "%ymm9", |
815 | | "%ymm10", |
816 | | "%ymm11", |
817 | | "%ymm12", |
818 | | "%ymm13", |
819 | | "%ymm14", |
820 | | "%ymm15" |
821 | | # endif /* __AVX__ */ |
822 | 226k | ); |
823 | | |
824 | | #else |
825 | | fromHalfZigZag_scalar (src, dst); |
826 | | #endif /* defined IMF_HAVE_GCC_INLINEASM_X86_64 */ |
827 | 226k | } |
828 | | |
829 | | #ifdef IMF_HAVE_NEON_AARCH64 |
830 | | |
831 | | void |
832 | | fromHalfZigZag_neon (unsigned short* __restrict__ src, float* __restrict__ dst) |
833 | | { |
834 | | uint8x16_t res_tbl[4] = { |
835 | | {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42}, |
836 | | {3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53}, |
837 | | {10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60}, |
838 | | {21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63}}; |
839 | | |
840 | | uint8x16x4_t vec_input_l, vec_input_h; |
841 | | |
842 | | for (int i = 0; i < 4; i++) |
843 | | { |
844 | | uint8x16x2_t vec_in_u8 = vld2q_u8 ((unsigned char*) (src + 16 * i)); |
845 | | vec_input_l.val[i] = vec_in_u8.val[0]; |
846 | | vec_input_h.val[i] = vec_in_u8.val[1]; |
847 | | } |
848 | | |
849 | | # pragma unroll(4) |
850 | | for (int i = 0; i < 4; i++) |
851 | | { |
852 | | uint8x16_t res_vec_l, res_vec_h; |
853 | | res_vec_l = vqtbl4q_u8 (vec_input_l, res_tbl[i]); |
854 | | res_vec_h = vqtbl4q_u8 (vec_input_h, res_tbl[i]); |
855 | | float16x8_t res_vec_l_f16 = |
856 | | vreinterpretq_f16_u8 (vzip1q_u8 (res_vec_l, res_vec_h)); |
857 | | float16x8_t res_vec_h_f16 = |
858 | | vreinterpretq_f16_u8 (vzip2q_u8 (res_vec_l, res_vec_h)); |
859 | | vst1q_f32 (dst + i * 16, vcvt_f32_f16 (vget_low_f16 (res_vec_l_f16))); |
860 | | vst1q_f32 (dst + i * 16 + 4, vcvt_high_f32_f16 (res_vec_l_f16)); |
861 | | vst1q_f32 ( |
862 | | dst + i * 16 + 8, vcvt_f32_f16 (vget_low_f16 (res_vec_h_f16))); |
863 | | vst1q_f32 (dst + i * 16 + 12, vcvt_high_f32_f16 (res_vec_h_f16)); |
864 | | } |
865 | | } |
866 | | |
867 | | #endif // IMF_HAVE_NEON_AARCH64 |
868 | | |
869 | | // |
870 | | // Inverse 8x8 DCT, only inverting the DC. This assumes that |
871 | | // all AC frequencies are 0. |
872 | | // |
873 | | |
874 | | #ifndef IMF_HAVE_SSE2 |
875 | | |
876 | | void |
877 | | dctInverse8x8DcOnly (float* data) |
878 | | { |
879 | | float val = data[0] * 3.535536e-01f * 3.535536e-01f; |
880 | | |
881 | | for (int i = 0; i < 64; ++i) |
882 | | data[i] = val; |
883 | | } |
884 | | |
885 | | #else /* IMF_HAVE_SSE2 */ |
886 | | |
887 | | void |
888 | | dctInverse8x8DcOnly (float* data) |
889 | 23.0k | { |
890 | 23.0k | __m128 src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f); |
891 | 23.0k | __m128* dst = (__m128*) data; |
892 | | |
893 | 391k | for (int i = 0; i < 16; ++i) |
894 | 368k | dst[i] = src; |
895 | 23.0k | } |
896 | | |
897 | | #endif /* IMF_HAVE_SSE2 */ |
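
A short note on the constant: 3.535536e-01 is a = 0.5 * cos(pi/4) = 1/(2*sqrt(2)), so a * a = 1/8. With every AC coefficient zero, both 1-D passes of the separable inverse DCT reduce to multiplying the DC term by a, so each of the 64 outputs is simply DC/8, which is why the whole block can be filled with a single precomputed value.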
898 | | |
899 | | // |
900 | | // Full 8x8 Inverse DCT: |
901 | | // |
902 | | // Simple inverse DCT on an 8x8 block, with scalar ops only. |
903 | | // Operates on data in-place. |
904 | | // |
905 | | // This is based on the iDCT formulation (y = frequency domain,
906 | | // x = spatial domain) |
907 | | // |
908 | | // [x0] [ ][y0] [ ][y1] |
909 | | // [x1] = [ M1 ][y2] + [ M2 ][y3] |
910 | | // [x2] [ ][y4] [ ][y5] |
911 | | // [x3] [ ][y6] [ ][y7] |
912 | | // |
913 | | // [x7] [ ][y0] [ ][y1] |
914 | | // [x6] = [ M1 ][y2] - [ M2 ][y3] |
915 | | // [x5] [ ][y4] [ ][y5] |
916 | | // [x4] [ ][y6] [ ][y7] |
917 | | // |
918 | | // where M1: M2: |
919 | | // |
920 | | // [a c a f] [b d e g] |
921 | | // [a f -a -c] [d -g -b -e] |
922 | | // [a -f -a c] [e -b g d] |
923 | | // [a -c a -f] [g -e d -b] |
924 | | // |
925 | | // and the constants are as defined below.. |
926 | | // |
927 | | // If you know how many of the lower rows are zero, that can |
928 | | // be passed in to help speed things up. If you don't know, |
929 | | // just set zeroedRows=0. |
930 | | // |
931 | | |
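
For reference, the literals hard-coded in the SSE2 and AVX paths below are these same constants written out. A tiny stand-alone sketch (illustration only) that reproduces them, using the same truncated value of pi as the scalar path:

    #include <cmath>
    #include <cstdio>

    int
    main ()
    {
        // 0.5 * cos (k * pi/16); the printed values match (approximately)
        // the literals used by dctInverse8x8_sse2 and the AVX body.
        const double p = 3.14159;

        printf ("a = %e\n", .5 * cos (p / 4.0));        // ~3.535536e-01
        printf ("b = %e\n", .5 * cos (p / 16.0));       // ~4.903927e-01
        printf ("c = %e\n", .5 * cos (p / 8.0));        // ~4.619398e-01
        printf ("d = %e\n", .5 * cos (3.0 * p / 16.0)); // ~4.157349e-01
        printf ("e = %e\n", .5 * cos (5.0 * p / 16.0)); // ~2.777855e-01
        printf ("f = %e\n", .5 * cos (3.0 * p / 8.0));  // ~1.913422e-01
        printf ("g = %e\n", .5 * cos (7.0 * p / 16.0)); // ~9.754573e-02
        return 0;
    }
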
932 | | // |
933 | | // Default implementation |
934 | | // |
935 | | |
936 | | template <int zeroedRows> |
937 | | void |
938 | | dctInverse8x8_scalar (float* data) |
939 | 0 | { |
940 | 0 | const float a = .5f * cosf (3.14159f / 4.0f); |
941 | 0 | const float b = .5f * cosf (3.14159f / 16.0f); |
942 | 0 | const float c = .5f * cosf (3.14159f / 8.0f); |
943 | 0 | const float d = .5f * cosf (3.f * 3.14159f / 16.0f); |
944 | 0 | const float e = .5f * cosf (5.f * 3.14159f / 16.0f); |
945 | 0 | const float f = .5f * cosf (3.f * 3.14159f / 8.0f); |
946 | 0 | const float g = .5f * cosf (7.f * 3.14159f / 16.0f); |
947 | |
|
948 | 0 | float alpha[4], beta[4], theta[4], gamma[4]; |
949 | |
|
950 | 0 | float* rowPtr = NULL; |
951 | | |
952 | | // |
953 | | // First pass - row wise. |
954 | | // |
955 | | // This looks less-compact than the description above in |
956 | | // an attempt to fold together common sub-expressions. |
957 | | // |
958 | |
|
959 | 0 | for (int row = 0; row < 8 - zeroedRows; ++row) |
960 | 0 | { |
961 | 0 | rowPtr = data + row * 8; |
962 | |
|
963 | 0 | alpha[0] = c * rowPtr[2]; |
964 | 0 | alpha[1] = f * rowPtr[2]; |
965 | 0 | alpha[2] = c * rowPtr[6]; |
966 | 0 | alpha[3] = f * rowPtr[6]; |
967 | |
|
968 | 0 | beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7]; |
969 | 0 | beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7]; |
970 | 0 | beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7]; |
971 | 0 | beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7]; |
972 | |
|
973 | 0 | theta[0] = a * (rowPtr[0] + rowPtr[4]); |
974 | 0 | theta[3] = a * (rowPtr[0] - rowPtr[4]); |
975 | |
|
976 | 0 | theta[1] = alpha[0] + alpha[3]; |
977 | 0 | theta[2] = alpha[1] - alpha[2]; |
978 | |
|
979 | 0 | gamma[0] = theta[0] + theta[1]; |
980 | 0 | gamma[1] = theta[3] + theta[2]; |
981 | 0 | gamma[2] = theta[3] - theta[2]; |
982 | 0 | gamma[3] = theta[0] - theta[1]; |
983 | |
|
984 | 0 | rowPtr[0] = gamma[0] + beta[0]; |
985 | 0 | rowPtr[1] = gamma[1] + beta[1]; |
986 | 0 | rowPtr[2] = gamma[2] + beta[2]; |
987 | 0 | rowPtr[3] = gamma[3] + beta[3]; |
988 | |
|
989 | 0 | rowPtr[4] = gamma[3] - beta[3]; |
990 | 0 | rowPtr[5] = gamma[2] - beta[2]; |
991 | 0 | rowPtr[6] = gamma[1] - beta[1]; |
992 | 0 | rowPtr[7] = gamma[0] - beta[0]; |
993 | 0 | } |
994 | | |
995 | | // |
996 | | // Second pass - column wise. |
997 | | // |
998 | |
|
999 | 0 | for (int column = 0; column < 8; ++column) |
1000 | 0 | { |
1001 | 0 | alpha[0] = c * data[16 + column]; |
1002 | 0 | alpha[1] = f * data[16 + column]; |
1003 | 0 | alpha[2] = c * data[48 + column]; |
1004 | 0 | alpha[3] = f * data[48 + column]; |
1005 | |
|
1006 | 0 | beta[0] = b * data[8 + column] + d * data[24 + column] + |
1007 | 0 | e * data[40 + column] + g * data[56 + column]; |
1008 | |
|
1009 | 0 | beta[1] = d * data[8 + column] - g * data[24 + column] - |
1010 | 0 | b * data[40 + column] - e * data[56 + column]; |
1011 | |
|
1012 | 0 | beta[2] = e * data[8 + column] - b * data[24 + column] + |
1013 | 0 | g * data[40 + column] + d * data[56 + column]; |
1014 | |
|
1015 | 0 | beta[3] = g * data[8 + column] - e * data[24 + column] + |
1016 | 0 | d * data[40 + column] - b * data[56 + column]; |
1017 | |
|
1018 | 0 | theta[0] = a * (data[column] + data[32 + column]); |
1019 | 0 | theta[3] = a * (data[column] - data[32 + column]); |
1020 | |
|
1021 | 0 | theta[1] = alpha[0] + alpha[3]; |
1022 | 0 | theta[2] = alpha[1] - alpha[2]; |
1023 | |
|
1024 | 0 | gamma[0] = theta[0] + theta[1]; |
1025 | 0 | gamma[1] = theta[3] + theta[2]; |
1026 | 0 | gamma[2] = theta[3] - theta[2]; |
1027 | 0 | gamma[3] = theta[0] - theta[1]; |
1028 | |
|
1029 | 0 | data[column] = gamma[0] + beta[0]; |
1030 | 0 | data[8 + column] = gamma[1] + beta[1]; |
1031 | 0 | data[16 + column] = gamma[2] + beta[2]; |
1032 | 0 | data[24 + column] = gamma[3] + beta[3]; |
1033 | |
|
1034 | 0 | data[32 + column] = gamma[3] - beta[3]; |
1035 | 0 | data[40 + column] = gamma[2] - beta[2]; |
1036 | 0 | data[48 + column] = gamma[1] - beta[1]; |
1037 | 0 | data[56 + column] = gamma[0] - beta[0]; |
1038 | 0 | } |
1039 | 0 | }
1040 | | |
1041 | | // |
1042 | | // SSE2 Implementation |
1043 | | // |
1044 | | |
1045 | | template <int zeroedRows> |
1046 | | void |
1047 | | dctInverse8x8_sse2 (float* data) |
1048 | 0 | { |
1049 | 0 | #ifdef IMF_HAVE_SSE2 |
1050 | 0 | __m128 a = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f}; |
1051 | 0 | __m128 b = {4.903927e-01f, 4.903927e-01f, 4.903927e-01f, 4.903927e-01f}; |
1052 | 0 | __m128 c = {4.619398e-01f, 4.619398e-01f, 4.619398e-01f, 4.619398e-01f}; |
1053 | 0 | __m128 d = {4.157349e-01f, 4.157349e-01f, 4.157349e-01f, 4.157349e-01f}; |
1054 | 0 | __m128 e = {2.777855e-01f, 2.777855e-01f, 2.777855e-01f, 2.777855e-01f}; |
1055 | 0 | __m128 f = {1.913422e-01f, 1.913422e-01f, 1.913422e-01f, 1.913422e-01f}; |
1056 | 0 | __m128 g = {9.754573e-02f, 9.754573e-02f, 9.754573e-02f, 9.754573e-02f}; |
1057 | |
|
1058 | 0 | __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f}; |
1059 | 0 | __m128 c1 = {4.619398e-01f, 1.913422e-01f, -1.913422e-01f, -4.619398e-01f}; |
1060 | 0 | __m128 c2 = {3.535536e-01f, -3.535536e-01f, -3.535536e-01f, 3.535536e-01f}; |
1061 | 0 | __m128 c3 = {1.913422e-01f, -4.619398e-01f, 4.619398e-01f, -1.913422e-01f}; |
1062 | |
|
1063 | 0 | __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f}; |
1064 | 0 | __m128 c5 = {4.157349e-01f, -9.754573e-02f, -4.903927e-01f, -2.777855e-01f}; |
1065 | 0 | __m128 c6 = {2.777855e-01f, -4.903927e-01f, 9.754573e-02f, 4.157349e-01f}; |
1066 | 0 | __m128 c7 = {9.754573e-02f, -2.777855e-01f, 4.157349e-01f, -4.903927e-01f}; |
1067 | |
|
1068 | 0 | __m128* srcVec = (__m128*) data; |
1069 | 0 | __m128 x[8], evenSum, oddSum; |
1070 | 0 | __m128 in[8], alpha[4], beta[4], theta[4], gamma[4]; |
1071 | | |
1072 | | // |
1073 | | // Rows - |
1074 | | // |
1075 | | // Treat this just like matrix-vector multiplication. The |
1076 | | // trick is to note that: |
1077 | | // |
1078 | | // [M00 M01 M02 M03][v0] [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)] |
1079 | | // [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)] |
1080 | | // [M20 M21 M22 M23][v2] [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)] |
1081 | | // [M30 M31 M32 M33][v3] [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)] |
1082 | | // |
1083 | | // Then, we can fill a register with v_i and multiply by the i-th column |
1084 | | // of M, accumulating across all i-s. |
1085 | | // |
1086 | | // The kids refer to the populating of a register with a single value |
1087 | | // "broadcasting", and it can be done with a shuffle instruction. It |
1088 | | // seems to be the slowest part of the whole ordeal. |
1089 | | // |
1090 | | // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and |
1091 | | // c4-7 are from M2. |
1092 | | // |
1093 | |
|
1094 | 0 | # define DCT_INVERSE_8x8_SS2_ROW_LOOP(i) \ |
1095 | | /* \ |
1096 | | * Broadcast the components of the row \ |
1097 | | */ \ |
1098 | 0 | \ |
1099 | 0 | x[0] = _mm_shuffle_ps ( \ |
1100 | 0 | srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (0, 0, 0, 0)); \ |
1101 | 0 | \ |
1102 | 0 | x[1] = _mm_shuffle_ps ( \ |
1103 | 0 | srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (1, 1, 1, 1)); \ |
1104 | 0 | \ |
1105 | 0 | x[2] = _mm_shuffle_ps ( \ |
1106 | 0 | srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (2, 2, 2, 2)); \ |
1107 | 0 | \ |
1108 | 0 | x[3] = _mm_shuffle_ps ( \ |
1109 | 0 | srcVec[2 * i], srcVec[2 * i], _MM_SHUFFLE (3, 3, 3, 3)); \ |
1110 | 0 | \ |
1111 | 0 | x[4] = _mm_shuffle_ps ( \ |
1112 | 0 | srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (0, 0, 0, 0)); \ |
1113 | 0 | \ |
1114 | 0 | x[5] = _mm_shuffle_ps ( \ |
1115 | 0 | srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (1, 1, 1, 1)); \ |
1116 | 0 | \ |
1117 | 0 | x[6] = _mm_shuffle_ps ( \ |
1118 | 0 | srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (2, 2, 2, 2)); \ |
1119 | 0 | \ |
1120 | 0 | x[7] = _mm_shuffle_ps ( \ |
1121 | 0 | srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (3, 3, 3, 3)); \ |
1122 | | /* \ |
1123 | | * Multiply the components by each column of the matrix \ |
1124 | | */ \ |
1125 | 0 | \ |
1126 | 0 | x[0] = _mm_mul_ps (x[0], c0); \ |
1127 | 0 | x[2] = _mm_mul_ps (x[2], c1); \ |
1128 | 0 | x[4] = _mm_mul_ps (x[4], c2); \ |
1129 | 0 | x[6] = _mm_mul_ps (x[6], c3); \ |
1130 | 0 | \ |
1131 | 0 | x[1] = _mm_mul_ps (x[1], c4); \ |
1132 | 0 | x[3] = _mm_mul_ps (x[3], c5); \ |
1133 | 0 | x[5] = _mm_mul_ps (x[5], c6); \ |
1134 | 0 | x[7] = _mm_mul_ps (x[7], c7); \ |
1135 | 0 | \ |
1136 | | /* \ |
1137 | | * Add across \ |
1138 | | */ \ |
1139 | 0 | \ |
1140 | 0 | evenSum = _mm_setzero_ps (); \ |
1141 | 0 | evenSum = _mm_add_ps (evenSum, x[0]); \ |
1142 | 0 | evenSum = _mm_add_ps (evenSum, x[2]); \ |
1143 | 0 | evenSum = _mm_add_ps (evenSum, x[4]); \ |
1144 | 0 | evenSum = _mm_add_ps (evenSum, x[6]); \ |
1145 | 0 | \ |
1146 | 0 | oddSum = _mm_setzero_ps (); \ |
1147 | 0 | oddSum = _mm_add_ps (oddSum, x[1]); \ |
1148 | 0 | oddSum = _mm_add_ps (oddSum, x[3]); \ |
1149 | 0 | oddSum = _mm_add_ps (oddSum, x[5]); \ |
1150 | 0 | oddSum = _mm_add_ps (oddSum, x[7]); \ |
1151 | 0 | \ |
1152 | | /* \ |
1153 | | * Final Sum: \ |
1154 | | * out [0, 1, 2, 3] = evenSum + oddSum \ |
1155 | | * out [7, 6, 5, 4] = evenSum - oddSum \ |
1156 | | */ \ |
1157 | 0 | \ |
1158 | 0 | srcVec[2 * i] = _mm_add_ps (evenSum, oddSum); \ |
1159 | 0 | srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum); \ |
1160 | 0 | srcVec[2 * i + 1] = _mm_shuffle_ps ( \ |
1161 | 0 | srcVec[2 * i + 1], srcVec[2 * i + 1], _MM_SHUFFLE (0, 1, 2, 3)) |
1162 | |
|
1163 | 0 | switch (zeroedRows) |
1164 | 0 | { |
1165 | 0 | case 0: |
1166 | 0 | default: |
1167 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1168 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1169 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2); |
1170 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3); |
1171 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4); |
1172 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (5); |
1173 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (6); |
1174 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (7); |
1175 | 0 | break; |
1176 | | |
1177 | 0 | case 1: |
1178 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1179 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1180 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2); |
1181 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3); |
1182 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4); |
1183 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (5); |
1184 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (6); |
1185 | 0 | break; |
1186 | | |
1187 | 0 | case 2: |
1188 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1189 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1190 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2); |
1191 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3); |
1192 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4); |
1193 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (5); |
1194 | 0 | break; |
1195 | | |
1196 | 0 | case 3: |
1197 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1198 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1199 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2); |
1200 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3); |
1201 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (4); |
1202 | 0 | break; |
1203 | | |
1204 | 0 | case 4: |
1205 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1206 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1207 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2); |
1208 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (3); |
1209 | 0 | break; |
1210 | | |
1211 | 0 | case 5: |
1212 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1213 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1214 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (2); |
1215 | 0 | break; |
1216 | | |
1217 | 0 | case 6: |
1218 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (0); |
1219 | 0 | DCT_INVERSE_8x8_SS2_ROW_LOOP (1); |
1220 | 0 | break; |
1221 | | |
1222 | 0 | case 7: DCT_INVERSE_8x8_SS2_ROW_LOOP (0); break; |
1223 | 0 | } |
1224 | | |
1225 | 0 | # undef DCT_INVERSE_8x8_SS2_ROW_LOOP |
1226 | | // |
1227 | | // Columns - |
1228 | | // |
1229 | | // This is slightly more straightforward, if less readable. Here |
1230 | | // we just operate on 4 columns at a time, in two batches. |
1231 | | // |
1232 | | // The slight mess is to try and cache sub-expressions, which |
1233 | | // we ignore in the row-wise pass. |
1234 | | // |
1235 | | |
1236 | 0 | for (int col = 0; col < 2; ++col) |
1237 | 0 | { |
1238 | |
|
1239 | 0 | for (int i = 0; i < 8; ++i) |
1240 | 0 | in[i] = srcVec[2 * i + col]; |
1241 | |
|
1242 | 0 | alpha[0] = _mm_mul_ps (c, in[2]); |
1243 | 0 | alpha[1] = _mm_mul_ps (f, in[2]); |
1244 | 0 | alpha[2] = _mm_mul_ps (c, in[6]); |
1245 | 0 | alpha[3] = _mm_mul_ps (f, in[6]); |
1246 | |
|
1247 | 0 | beta[0] = _mm_add_ps ( |
1248 | 0 | _mm_add_ps (_mm_mul_ps (in[1], b), _mm_mul_ps (in[3], d)), |
1249 | 0 | _mm_add_ps (_mm_mul_ps (in[5], e), _mm_mul_ps (in[7], g))); |
1250 | |
|
1251 | 0 | beta[1] = _mm_sub_ps ( |
1252 | 0 | _mm_sub_ps (_mm_mul_ps (in[1], d), _mm_mul_ps (in[3], g)), |
1253 | 0 | _mm_add_ps (_mm_mul_ps (in[5], b), _mm_mul_ps (in[7], e))); |
1254 | |
|
1255 | 0 | beta[2] = _mm_add_ps ( |
1256 | 0 | _mm_sub_ps (_mm_mul_ps (in[1], e), _mm_mul_ps (in[3], b)), |
1257 | 0 | _mm_add_ps (_mm_mul_ps (in[5], g), _mm_mul_ps (in[7], d))); |
1258 | |
|
1259 | 0 | beta[3] = _mm_add_ps ( |
1260 | 0 | _mm_sub_ps (_mm_mul_ps (in[1], g), _mm_mul_ps (in[3], e)), |
1261 | 0 | _mm_sub_ps (_mm_mul_ps (in[5], d), _mm_mul_ps (in[7], b))); |
1262 | |
|
1263 | 0 | theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4])); |
1264 | 0 | theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4])); |
1265 | |
|
1266 | 0 | theta[1] = _mm_add_ps (alpha[0], alpha[3]); |
1267 | 0 | theta[2] = _mm_sub_ps (alpha[1], alpha[2]); |
1268 | |
|
1269 | 0 | gamma[0] = _mm_add_ps (theta[0], theta[1]); |
1270 | 0 | gamma[1] = _mm_add_ps (theta[3], theta[2]); |
1271 | 0 | gamma[2] = _mm_sub_ps (theta[3], theta[2]); |
1272 | 0 | gamma[3] = _mm_sub_ps (theta[0], theta[1]); |
1273 | |
|
1274 | 0 | srcVec[col] = _mm_add_ps (gamma[0], beta[0]); |
1275 | 0 | srcVec[2 + col] = _mm_add_ps (gamma[1], beta[1]); |
1276 | 0 | srcVec[4 + col] = _mm_add_ps (gamma[2], beta[2]); |
1277 | 0 | srcVec[6 + col] = _mm_add_ps (gamma[3], beta[3]); |
1278 | |
|
1279 | 0 | srcVec[8 + col] = _mm_sub_ps (gamma[3], beta[3]); |
1280 | 0 | srcVec[10 + col] = _mm_sub_ps (gamma[2], beta[2]); |
1281 | 0 | srcVec[12 + col] = _mm_sub_ps (gamma[1], beta[1]); |
1282 | 0 | srcVec[14 + col] = _mm_sub_ps (gamma[0], beta[0]); |
1283 | 0 | } |
1284 | |
|
1285 | | #else /* IMF_HAVE_SSE2 */ |
1286 | | |
1287 | | dctInverse8x8_scalar<zeroedRows> (data); |
1288 | | |
1289 | | #endif /* IMF_HAVE_SSE2 */ |
1290 | 0 | }
1291 | | |
1292 | | // |
1293 | | // AVX Implementation |
1294 | | // |
1295 | | |
1296 | | // clang-format off |
1297 | | |
1298 | | #define STR(A) #A |
1299 | | |
1300 | | #define IDCT_AVX_SETUP_2_ROWS(_DST0, _DST1, _TMP0, _TMP1, \ |
1301 | | _OFF00, _OFF01, _OFF10, _OFF11) \ |
1302 | | "vmovaps " STR(_OFF00) "(%0), %%xmm" STR(_TMP0) " \n" \ |
1303 | | "vmovaps " STR(_OFF01) "(%0), %%xmm" STR(_TMP1) " \n" \ |
1304 | | " \n" \ |
1305 | | "vinsertf128 $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) " \n" \ |
1306 | | "vinsertf128 $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) " \n" \ |
1307 | | " \n" \ |
1308 | | "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \ |
1309 | | "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" \ |
1310 | | " \n" \ |
1311 | | "vunpcklps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP0) " \n" \ |
1312 | | "vunpckhps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP1) " \n" \ |
1313 | | " \n" \ |
1314 | | "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \ |
1315 | | "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" |
1316 | | |
1317 | | #define IDCT_AVX_MMULT_ROWS(_SRC) \ |
1318 | | /* Broadcast the source values into y12-y15 */ \ |
1319 | | "vpermilps $0x00, " STR(_SRC) ", %%ymm12 \n" \ |
1320 | | "vpermilps $0x55, " STR(_SRC) ", %%ymm13 \n" \ |
1321 | | "vpermilps $0xaa, " STR(_SRC) ", %%ymm14 \n" \ |
1322 | | "vpermilps $0xff, " STR(_SRC) ", %%ymm15 \n" \ |
1323 | | \ |
1324 | | /* Multiply the coefs by the broadcasted values */ \
1325 | | "vmulps %%ymm12, %%ymm8, %%ymm12 \n" \ |
1326 | | "vmulps %%ymm13, %%ymm9, %%ymm13 \n" \ |
1327 | | "vmulps %%ymm14, %%ymm10, %%ymm14 \n" \ |
1328 | | "vmulps %%ymm15, %%ymm11, %%ymm15 \n" \ |
1329 | | \ |
1330 | | /* Accumulate the result back into the source */ \ |
1331 | | "vaddps %%ymm13, %%ymm12, %%ymm12 \n" \ |
1332 | | "vaddps %%ymm15, %%ymm14, %%ymm14 \n" \ |
1333 | | "vaddps %%ymm14, %%ymm12, " STR(_SRC) "\n" |
1334 | | |
1335 | | #define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK) \ |
1336 | | "vsubps " STR(_ODD) "," STR(_EVEN) "," STR(_BACK) "\n" \ |
1337 | | "vaddps " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n" \ |
1338 | | /* Reverse the back half */ \ |
1339 | | "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n" |
1340 | | |
1341 | | /* In order to allow for fast paths when we know certain rows
1342 | | * of the 8x8 block are zero, most of the body of the DCT is |
1343 | | * in the following macro. Statements are wrapped in a ROWn() |
1344 | | * macro, where n is the lowest row in the 8x8 block on which
1345 | | * they depend. |
1346 | | * |
1347 | | * This should work for the cases where we have 2-8 full rows. |
1348 | | * The 1-row case is special, and we'll handle it separately.
1349 | | */ |
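
The ROWn() wrappers themselves are defined where IDCT_AVX_BODY is expanded, further down in this header. Conceptually, for a caller that knows rows 4-7 are zero they would behave like the following sketch, so every statement that only feeds the missing rows drops out of the emitted asm string (illustration only; the real definitions may differ):

    // Hypothetical expansion for zeroedRows == 4: rows 0-3 are present,
    // rows 4-7 are known to be zero.
    #define ROW0(_X) _X
    #define ROW1(_X) _X
    #define ROW2(_X) _X
    #define ROW3(_X) _X
    #define ROW4(_X) /* row known to be zero - emit nothing */
    #define ROW5(_X)
    #define ROW6(_X)
    #define ROW7(_X)
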
1350 | | #define IDCT_AVX_BODY \ |
1351 | | /* ============================================== |
1352 | | * Row 1D DCT |
1353 | | * ---------------------------------------------- |
1354 | | */ \ |
1355 | | \ |
1356 | | /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds |
1357 | | * the row-major 8x8 block, load ymm0-3 with the even columns |
1358 | | * and ymm4-7 with the odd columns. The lower half of the ymm |
1359 | | * holds one row, while the upper half holds the next row. |
1360 | | * |
1361 | | * If our source is: |
1362 | | * a0 a1 a2 a3 a4 a5 a6 a7 |
1363 | | * b0 b1 b2 b3 b4 b5 b6 b7 |
1364 | | * |
1365 | | * We'll be forming: |
1366 | | * a0 a2 a4 a6 b0 b2 b4 b6 |
1367 | | * a1 a3 a5 a7 b1 b3 b5 b7 |
1368 | | */ \ |
1369 | | ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48) ) \ |
1370 | | ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13, 64, 80, 96, 112) ) \ |
1371 | | ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11, 128, 144, 160, 176) ) \ |
1372 | | ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7, 8, 9, 192, 208, 224, 240) ) \ |
1373 | | \ |
1374 | | /* Multiply the even columns (ymm0-3) by the matrix M1
1375 | | * storing the results back in ymm0-3 |
1376 | | * |
1377 | | * Assume that (%1) holds the matrix in column major order |
1378 | | */ \ |
1379 | | "vbroadcastf128 (%1), %%ymm8 \n" \ |
1380 | | "vbroadcastf128 16(%1), %%ymm9 \n" \ |
1381 | | "vbroadcastf128 32(%1), %%ymm10 \n" \ |
1382 | | "vbroadcastf128 48(%1), %%ymm11 \n" \ |
1383 | | \ |
1384 | | ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) ) \ |
1385 | | ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) ) \ |
1386 | | ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) ) \ |
1387 | | ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) ) \ |
1388 | | \ |
1389 | | /* Repeat, but with the odd columns (ymm4-7) and the |
1390 | | * matrix M2 |
1391 | | */ \ |
1392 | | "vbroadcastf128 64(%1), %%ymm8 \n" \ |
1393 | | "vbroadcastf128 80(%1), %%ymm9 \n" \ |
1394 | | "vbroadcastf128 96(%1), %%ymm10 \n" \ |
1395 | | "vbroadcastf128 112(%1), %%ymm11 \n" \ |
1396 | | \ |
1397 | | ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) ) \ |
1398 | | ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) ) \ |
1399 | | ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) ) \ |
1400 | | ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) ) \ |
1401 | | \ |
1402 | | /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the |
1403 | | * front halves of the results, and the difference to get the \ |
1404 | | * back halves. The front halves end up in ymm0-3, the back \ |
1405 | | * halves end up in ymm12-15. |
1406 | | */ \ |
1407 | | ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \ |
1408 | | ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \ |
1409 | | ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \ |
1410 | | ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \ |
1411 | | \ |
1412 | | /* Reassemble the row halves into ymm0-7 */ \ |
1413 | | ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7 \n" ) \ |
1414 | | ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6 \n" ) \ |
1415 | | ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5 \n" ) \ |
1416 | | ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4 \n" ) \ |
1417 | | ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3 \n" ) \ |
1418 | | ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2 \n" ) \ |
1419 | | ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1 \n" ) \ |
1420 | | ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" ) \ |
1421 | | \ |
1422 | | \ |
1423 | | /* ============================================== |
1424 | | * Column 1D DCT |
1425 | | * ---------------------------------------------- |
1426 | | */ \ |
1427 | | \ |
1428 | | /* Rows should be in ymm0-7, and M2 columns should still be |
1429 | | * preserved in ymm8-11. M2 has 4 unique values (and +- |
1430 | | * versions of each), and all (positive) values appear in |
1431 | | * the first column (and row), which is in ymm8. |
1432 | | * |
1433 | | * For the column-wise DCT, we need to: |
1434 | | * 1) Broadcast each element of a row of M2 into 4 vectors \ |
1435 | | * 2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts. \ |
1436 | | * 3) Accumulate into ymm12-15 for the odd outputs. |
1437 | | * |
1438 | | * Instead of doing 16 broadcasts for each element in M2, |
1439 | | * do 4, filling y8-11 with: |
1440 | | * |
1441 | | * ymm8: [ b b b b | b b b b ] |
1442 | | * ymm9: [ d d d d | d d d d ] |
1443 | | * ymm10: [ e e e e | e e e e ] |
1444 | | * ymm11: [ g g g g | g g g g ] |
1445 | | * |
1446 | | * And deal with the negative values by subtracting during accum. |
1447 | | */ \ |
1448 | | "vpermilps $0xff, %%ymm8, %%ymm11 \n" \ |
1449 | | "vpermilps $0xaa, %%ymm8, %%ymm10 \n" \ |
1450 | | "vpermilps $0x55, %%ymm8, %%ymm9 \n" \ |
1451 | | "vpermilps $0x00, %%ymm8, %%ymm8 \n" \ |
1452 | | \ |
1453 | | /* This one is easy, since we have ymm12-15 open for scratch |
1454 | | * ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7 |
1455 | | */ \ |
1456 | | ROW1( "vmulps %%ymm1, %%ymm8, %%ymm12 \n" ) \ |
1457 | | ROW3( "vmulps %%ymm3, %%ymm9, %%ymm13 \n" ) \ |
1458 | | ROW5( "vmulps %%ymm5, %%ymm10, %%ymm14 \n" ) \ |
1459 | | ROW7( "vmulps %%ymm7, %%ymm11, %%ymm15 \n" ) \ |
1460 | | \ |
1461 | | ROW3( "vaddps %%ymm12, %%ymm13, %%ymm12 \n" ) \ |
1462 | | ROW7( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \ |
1463 | | ROW5( "vaddps %%ymm12, %%ymm14, %%ymm12 \n" ) \ |
1464 | | \ |
1465 | | /* Trickier, since only ymm13-15 are open for scratch \ |
1466 | | * ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7 |
1467 | | */ \ |
1468 | | ROW1( "vmulps %%ymm1, %%ymm9, %%ymm13 \n" ) \ |
1469 | | ROW3( "vmulps %%ymm3, %%ymm11, %%ymm14 \n" ) \ |
1470 | | ROW5( "vmulps %%ymm5, %%ymm8, %%ymm15 \n" ) \ |
1471 | | \ |
1472 | | ROW5( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \ |
1473 | | ROW3( "vsubps %%ymm14, %%ymm13, %%ymm13 \n" ) \ |
1474 | | \ |
1475 | | ROW7( "vmulps %%ymm7, %%ymm10, %%ymm15 \n" ) \ |
1476 | | ROW7( "vsubps %%ymm15, %%ymm13, %%ymm13 \n" ) \ |
1477 | | \ |
1478 | | /* Trickier still, as only ymm14-15 are open for scratch \ |
1479 | | * ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7 |
1480 | | */ \ |
1481 | | ROW1( "vmulps %%ymm1, %%ymm10, %%ymm14 \n" ) \ |
1482 | | ROW3( "vmulps %%ymm3, %%ymm8, %%ymm15 \n" ) \ |
1483 | | \ |
1484 | | ROW3( "vsubps %%ymm15, %%ymm14, %%ymm14 \n" ) \ |
1485 | | \ |
1486 | | ROW5( "vmulps %%ymm5, %%ymm11, %%ymm15 \n" ) \ |
1487 | | ROW5( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \ |
1488 | | \ |
1489 | | ROW7( "vmulps %%ymm7, %%ymm9, %%ymm15 \n" ) \ |
1490 | | ROW7( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \ |
1491 | | \ |
1492 | | \ |
1493 | | /* Easy, as we can blow away ymm1,3,5,7 for scratch |
1494 | | * ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7 |
1495 | | */ \ |
1496 | | ROW1( "vmulps %%ymm1, %%ymm11, %%ymm15 \n" ) \ |
1497 | | ROW3( "vmulps %%ymm3, %%ymm10, %%ymm3 \n" ) \ |
1498 | | ROW5( "vmulps %%ymm5, %%ymm9, %%ymm5 \n" ) \ |
1499 | | ROW7( "vmulps %%ymm7, %%ymm8, %%ymm7 \n" ) \ |
1500 | | \ |
1501 | | ROW5( "vaddps %%ymm15, %%ymm5, %%ymm15 \n" ) \ |
1502 | | ROW7( "vaddps %%ymm3, %%ymm7, %%ymm3 \n" ) \ |
1503 | | ROW3( "vsubps %%ymm3, %%ymm15, %%ymm15 \n" ) \ |
1504 | | \ |
1505 | | \ |
1506 | | /* Load coefs for M1. Because we're going to broadcast |
1507 | | * coefs, we don't need to load the actual structure from |
1508 | | * M1. Instead, just load enough that we can broadcast. |
1509 | | * There are only 6 unique values in M1, but they're in +- |
1510 | | * pairs, leaving only 3 unique coefs if we add and subtract |
1511 | | * properly. |
1512 | | * |
1513 | | * Fill ymm1 with coef[2] = [ a a c f | a a c f ] |
1514 | | * Broadcast ymm5 with [ f f f f | f f f f ] |
1515 | | * Broadcast ymm3 with [ c c c c | c c c c ] |
1516 | | * Broadcast ymm1 with [ a a a a | a a a a ] |
1517 | | */ \ |
1518 | | "vbroadcastf128 8(%1), %%ymm1 \n" \ |
1519 | | "vpermilps $0xff, %%ymm1, %%ymm5 \n" \ |
1520 | | "vpermilps $0xaa, %%ymm1, %%ymm3 \n" \ |
1521 | | "vpermilps $0x00, %%ymm1, %%ymm1 \n" \ |
1522 | | \ |
1523 | | /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following |
1524 | | * common expressions: |
1525 | | * |
1526 | | * E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) |
1527 | | * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6) |
1528 | | * |
1529 | | * E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6) |
1530 | | * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6) |
1531 | | * |
1532 | | * Afterwards, ymm8-11 will hold the even outputs. |
1533 | | */ \ |
1534 | | \ |
1535 | | /* ymm11 = (a ymm0 + a ymm4), ymm1 = (a ymm0 - a ymm4) */ \ |
1536 | | ROW0( "vmulps %%ymm1, %%ymm0, %%ymm11 \n" ) \ |
1537 | | ROW4( "vmulps %%ymm1, %%ymm4, %%ymm4 \n" ) \ |
1538 | | ROW0( "vmovaps %%ymm11, %%ymm1 \n" ) \ |
1539 | | ROW4( "vaddps %%ymm4, %%ymm11, %%ymm11 \n" ) \ |
1540 | | ROW4( "vsubps %%ymm4, %%ymm1, %%ymm1 \n" ) \ |
1541 | | \ |
1542 | | /* ymm7 = (c ymm2 + f ymm6) */ \ |
1543 | | ROW2( "vmulps %%ymm3, %%ymm2, %%ymm7 \n" ) \ |
1544 | | ROW6( "vmulps %%ymm5, %%ymm6, %%ymm9 \n" ) \ |
1545 | | ROW6( "vaddps %%ymm9, %%ymm7, %%ymm7 \n" ) \ |
1546 | | \ |
1547 | | /* E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) |
1548 | | * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6) |
1549 | | */ \ |
1550 | | ROW0( "vmovaps %%ymm11, %%ymm8 \n" ) \ |
1551 | | ROW2( "vaddps %%ymm7, %%ymm8, %%ymm8 \n" ) \ |
1552 | | ROW2( "vsubps %%ymm7, %%ymm11, %%ymm11 \n" ) \ |
1553 | | \ |
1554 | | /* ymm7 = (f ymm2 - c ymm6) */ \ |
1555 | | ROW2( "vmulps %%ymm5, %%ymm2, %%ymm7 \n" ) \ |
1556 | | ROW6( "vmulps %%ymm3, %%ymm6, %%ymm9 \n" ) \ |
1557 | | ROW6( "vsubps %%ymm9, %%ymm7, %%ymm7 \n" ) \ |
1558 | | \ |
1559 | | /* E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6) |
1560 | | * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6) |
1561 | | */ \ |
1562 | | ROW0( "vmovaps %%ymm1, %%ymm9 \n" ) \ |
1563 | | ROW0( "vmovaps %%ymm1, %%ymm10 \n" ) \ |
1564 | | ROW2( "vaddps %%ymm7, %%ymm1, %%ymm9 \n" ) \ |
1565 | | ROW2( "vsubps %%ymm7, %%ymm1, %%ymm10 \n" ) \ |
1566 | | \ |
1567 | | /* Add the even (ymm8-11) and the odds (ymm12-15), |
1568 | | * placing the results into ymm0-7 |
1569 | | */ \ |
1570 | | "vaddps %%ymm12, %%ymm8, %%ymm0 \n" \ |
1571 | | "vaddps %%ymm13, %%ymm9, %%ymm1 \n" \ |
1572 | | "vaddps %%ymm14, %%ymm10, %%ymm2 \n" \ |
1573 | | "vaddps %%ymm15, %%ymm11, %%ymm3 \n" \ |
1574 | | \ |
1575 | | "vsubps %%ymm12, %%ymm8, %%ymm7 \n" \ |
1576 | | "vsubps %%ymm13, %%ymm9, %%ymm6 \n" \ |
1577 | | "vsubps %%ymm14, %%ymm10, %%ymm5 \n" \ |
1578 | | "vsubps %%ymm15, %%ymm11, %%ymm4 \n" \ |
1579 | | \ |
1580 | | /* Copy out the results from ymm0-7 */ \ |
1581 | | "vmovaps %%ymm0, (%0) \n" \ |
1582 | | "vmovaps %%ymm1, 32(%0) \n" \ |
1583 | | "vmovaps %%ymm2, 64(%0) \n" \ |
1584 | | "vmovaps %%ymm3, 96(%0) \n" \ |
1585 | | "vmovaps %%ymm4, 128(%0) \n" \ |
1586 | | "vmovaps %%ymm5, 160(%0) \n" \ |
1587 | | "vmovaps %%ymm6, 192(%0) \n" \ |
1588 | | "vmovaps %%ymm7, 224(%0) \n" |
1589 | | |
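For reference, the whole of IDCT_AVX_BODY amounts to a separable orthonormal 8x8 inverse DCT done in place: a 1D inverse DCT over each row, then over each column. A minimal scalar sketch of that math (an assumption for illustration only; the library's actual fallback is dctInverse8x8_scalar<>, used below when inline assembly is unavailable):

    #include <math.h>

    static void
    idct1d (const float* in, float* out, int stride)
    {
        for (int n = 0; n < 8; ++n)
        {
            float acc = 0.f;
            for (int k = 0; k < 8; ++k)
            {
                // 0.3535536 is the same 'a' = cos(4*pi/16)/2 used in sAvxCoef below
                const float w = (k == 0) ? 0.3535536f : 0.5f;
                acc += w * in[k * stride] *
                       cosf ((2 * n + 1) * k * 3.14159f / 16.0f);
            }
            out[n * stride] = acc;
        }
    }

    static void
    idct8x8_reference (float* block)
    {
        float tmp[64];
        for (int r = 0; r < 8; ++r) idct1d (block + 8 * r, tmp + 8 * r, 1); // rows
        for (int c = 0; c < 8; ++c) idct1d (tmp + c, block + c, 8);         // columns
    }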
1590 | | /* Output, input, and clobber (OIC) sections of the inline asm */ |
1591 | | #define IDCT_AVX_OIC(_IN0) \ |
1592 | 226k | : /* Output */ \ |
1593 | 226k | : /* Input */ "r"(_IN0), "r"(sAvxCoef) \ |
1594 | 226k | : /* Clobber */ "memory", \ |
1595 | 226k | "%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
1596 | 226k | "%xmm4", "%xmm5", "%xmm6", "%xmm7", \ |
1597 | 226k | "%xmm8", "%xmm9", "%xmm10", "%xmm11",\ |
1598 | 226k | "%xmm12", "%xmm13", "%xmm14", "%xmm15" |
1599 | | |
1600 | | /* Include vzeroupper for non-AVX builds, to avoid AVX/SSE transition penalties in the surrounding SSE code */ |
1601 | | #ifndef __AVX__ |
1602 | | #define IDCT_AVX_ASM(_IN0) \ |
1603 | 213k | __asm__( \ |
1604 | 213k | IDCT_AVX_BODY \ |
1605 | 213k | "vzeroupper \n" \ |
1606 | 213k | IDCT_AVX_OIC(_IN0) \ |
1607 | 213k | ); |
1608 | | #else /* __AVX__ */ |
1609 | | #define IDCT_AVX_ASM(_IN0) \ |
1610 | | __asm__( \ |
1611 | | IDCT_AVX_BODY \ |
1612 | | IDCT_AVX_OIC(_IN0) \ |
1613 | | ); |
1614 | | #endif /* __AVX__ */ |
1615 | | |
1616 | | // clang-format on |
1617 | | |
1618 | | template <int zeroedRows> |
1619 | | void |
1620 | | dctInverse8x8_avx (float* data) |
1621 | 226k | { |
1622 | 226k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 |
1623 | | |
1624 | | /* The column-major version of M1, followed by the |
1625 | | * column-major version of M2: |
1626 | | * |
1627 | | * [ a c a f ] [ b d e g ] |
1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] |
1629 | | * [ a -f -a c ] [ e -b g d ] |
1630 | | * [ a -c a -f ] [ g -e d -b ] |
1631 | | */ |
1632 | 226k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { |
1633 | 226k | 3.535536e-01, 3.535536e-01, |
1634 | 226k | 3.535536e-01, 3.535536e-01, /* a a a a */ |
1635 | 226k | 4.619398e-01, 1.913422e-01, |
1636 | 226k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ |
1637 | 226k | 3.535536e-01, -3.535536e-01, |
1638 | 226k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ |
1639 | 226k | 1.913422e-01, -4.619398e-01, |
1640 | 226k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ |
1641 | | |
1642 | 226k | 4.903927e-01, 4.157349e-01, |
1643 | 226k | 2.777855e-01, 9.754573e-02, /* b d e g */ |
1644 | 226k | 4.157349e-01, -9.754573e-02, |
1645 | 226k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ |
1646 | 226k | 2.777855e-01, -4.903927e-01, |
1647 | 226k | 9.754573e-02, 4.157349e-01, /* e -b g d */ |
1648 | 226k | 9.754573e-02, -2.777855e-01, |
1649 | 226k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ |
1650 | 226k | }; |
1651 | | |
1652 | 226k | # define ROW0(_X) _X |
1653 | 226k | # define ROW1(_X) _X |
1654 | 226k | # define ROW2(_X) _X |
1655 | 226k | # define ROW3(_X) _X |
1656 | 226k | # define ROW4(_X) _X |
1657 | 226k | # define ROW5(_X) _X |
1658 | 226k | # define ROW6(_X) _X |
1659 | 226k | # define ROW7(_X) _X |
1660 | | |
1661 | 226k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } |
1662 | 24.2k | else if (zeroedRows == 1) |
1663 | 1.71k | { |
1664 | | |
1665 | 1.71k | # undef ROW7 |
1666 | 1.71k | # define ROW7(_X) |
1667 | 1.71k | IDCT_AVX_ASM (data) |
1668 | 1.71k | } |
1669 | 22.5k | else if (zeroedRows == 2) |
1670 | 127 | { |
1671 | | |
1672 | 127 | # undef ROW6 |
1673 | 127 | # define ROW6(_X) |
1674 | 127 | IDCT_AVX_ASM (data) |
1675 | 127 | } |
1676 | 22.3k | else if (zeroedRows == 3) |
1677 | 1.61k | { |
1678 | | |
1679 | 1.61k | # undef ROW5 |
1680 | 1.61k | # define ROW5(_X) |
1681 | 1.61k | IDCT_AVX_ASM (data) |
1682 | 1.61k | } |
1683 | 20.7k | else if (zeroedRows == 4) |
1684 | 713 | { |
1685 | | |
1686 | 713 | # undef ROW4 |
1687 | 713 | # define ROW4(_X) |
1688 | 713 | IDCT_AVX_ASM (data) |
1689 | 713 | } |
1690 | 20.0k | else if (zeroedRows == 5) |
1691 | 6.10k | { |
1692 | | |
1693 | 6.10k | # undef ROW3 |
1694 | 6.10k | # define ROW3(_X) |
1695 | 6.10k | IDCT_AVX_ASM (data) |
1696 | 6.10k | } |
1697 | 13.9k | else if (zeroedRows == 6) |
1698 | 1.02k | { |
1699 | | |
1700 | 1.02k | # undef ROW2 |
1701 | 1.02k | # define ROW2(_X) |
1702 | 1.02k | IDCT_AVX_ASM (data) |
1703 | 1.02k | } |
1704 | 12.9k | else if (zeroedRows == 7) |
1705 | 12.9k | { |
1706 | | // autoformatting wants to add a space between the doubled %% |
1707 | | // clang-format off |
1708 | | |
1709 | 12.9k | __asm__( |
1710 | | |
1711 | | /* ============================================== |
1712 | | * Row 1D DCT |
1713 | | * ---------------------------------------------- |
1714 | | */ |
1715 | 12.9k | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) |
1716 | | |
1717 | 12.9k | "vbroadcastf128 (%1), %%ymm8 \n" |
1718 | 12.9k | "vbroadcastf128 16(%1), %%ymm9 \n" |
1719 | 12.9k | "vbroadcastf128 32(%1), %%ymm10 \n" |
1720 | 12.9k | "vbroadcastf128 48(%1), %%ymm11 \n" |
1721 | | |
1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ |
1723 | 12.9k | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" |
1724 | | |
1725 | 12.9k | IDCT_AVX_MMULT_ROWS (%% ymm0) |
1726 | | |
1727 | 12.9k | "vbroadcastf128 64(%1), %%ymm8 \n" |
1728 | 12.9k | "vbroadcastf128 80(%1), %%ymm9 \n" |
1729 | 12.9k | "vbroadcastf128 96(%1), %%ymm10 \n" |
1730 | 12.9k | "vbroadcastf128 112(%1), %%ymm11 \n" |
1731 | | |
1732 | 12.9k | IDCT_AVX_MMULT_ROWS (%% ymm4) |
1733 | | |
1734 | 12.9k | IDCT_AVX_EO_TO_ROW_HALVES ( |
1735 | 12.9k | %% ymm0, %% ymm4, %% ymm0, %% ymm12) |
1736 | | |
1737 | 12.9k | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" |
1738 | | |
1739 | | /* ============================================== |
1740 | | * Column 1D DCT |
1741 | | * ---------------------------------------------- |
1742 | | */ |
1743 | | |
1744 | | /* DC only, so multiply by a and we're done */ |
1745 | 12.9k | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" |
1746 | | |
1747 | | /* Copy out results */ |
1748 | 12.9k | "vmovaps %%ymm0, (%0) \n" |
1749 | 12.9k | "vmovaps %%ymm0, 32(%0) \n" |
1750 | 12.9k | "vmovaps %%ymm0, 64(%0) \n" |
1751 | 12.9k | "vmovaps %%ymm0, 96(%0) \n" |
1752 | 12.9k | "vmovaps %%ymm0, 128(%0) \n" |
1753 | 12.9k | "vmovaps %%ymm0, 160(%0) \n" |
1754 | 12.9k | "vmovaps %%ymm0, 192(%0) \n" |
1755 | 12.9k | "vmovaps %%ymm0, 224(%0) \n" |
1756 | | |
1757 | 12.9k | # ifndef __AVX__ |
1758 | 12.9k | "vzeroupper \n" |
1759 | 12.9k | # endif /* __AVX__ */ |
1760 | 12.9k | IDCT_AVX_OIC (data)); |
1761 | | // clang-format on |
1762 | 12.9k | } |
1763 | 0 | else |
1764 | 0 | { |
1765 | 0 | assert (false); // Invalid template instance parameter |
1766 | 0 | } |
1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ |
1768 | | |
1769 | | dctInverse8x8_scalar<zeroedRows> (data); |
1770 | | |
1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ |
1772 | 226k | } ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<0>(float*) Line | Count | Source | 1621 | 202k | { | 1622 | 202k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 202k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 202k | 3.535536e-01, 3.535536e-01, | 1634 | 202k | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 202k | 4.619398e-01, 1.913422e-01, | 1636 | 202k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 202k | 3.535536e-01, -3.535536e-01, | 1638 | 202k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 202k | 1.913422e-01, -4.619398e-01, | 1640 | 202k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 202k | 4.903927e-01, 4.157349e-01, | 1643 | 202k | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 202k | 4.157349e-01, -9.754573e-02, | 1645 | 202k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 202k | 2.777855e-01, -4.903927e-01, | 1647 | 202k | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 202k | 9.754573e-02, -2.777855e-01, | 1649 | 202k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 202k | }; | 1651 | | | 1652 | 202k | # define ROW0(_X) _X | 1653 | 202k | # define ROW1(_X) _X | 1654 | 202k | # define ROW2(_X) _X | 1655 | 202k | # define ROW3(_X) _X | 1656 | 202k | # define ROW4(_X) _X | 1657 | 202k | # define ROW5(_X) _X | 1658 | 202k | # define ROW6(_X) _X | 1659 | 202k | # define ROW7(_X) _X | 1660 | | | 1661 | 202k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 0 | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 0 | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 0 | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 0 | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 0 | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 0 | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 202k | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<1>(float*) Line | Count | Source | 1621 | 1.71k | { | 1622 | 1.71k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 1.71k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 1.71k | 3.535536e-01, 3.535536e-01, | 1634 | 1.71k | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 1.71k | 4.619398e-01, 1.913422e-01, | 1636 | 1.71k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 1.71k | 3.535536e-01, -3.535536e-01, | 1638 | 1.71k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 1.71k | 1.913422e-01, -4.619398e-01, | 1640 | 1.71k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 1.71k | 4.903927e-01, 4.157349e-01, | 1643 | 1.71k | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 1.71k | 4.157349e-01, -9.754573e-02, | 1645 | 1.71k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 1.71k | 2.777855e-01, -4.903927e-01, | 1647 | 1.71k | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 1.71k | 9.754573e-02, -2.777855e-01, | 1649 | 1.71k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 1.71k | }; | 1651 | | | 1652 | 1.71k | # define ROW0(_X) _X | 1653 | 1.71k | # define ROW1(_X) _X | 1654 | 1.71k | # define ROW2(_X) _X | 1655 | 1.71k | # define ROW3(_X) _X | 1656 | 1.71k | # define ROW4(_X) _X | 1657 | 1.71k | # define ROW5(_X) _X | 1658 | 1.71k | # define ROW6(_X) _X | 1659 | 1.71k | # define ROW7(_X) _X | 1660 | | | 1661 | 1.71k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 1.71k | else if (zeroedRows == 1) | 1663 | 1.71k | { | 1664 | | | 1665 | 1.71k | # undef ROW7 | 1666 | 1.71k | # define ROW7(_X) | 1667 | 1.71k | IDCT_AVX_ASM (data) | 1668 | 1.71k | } | 1669 | 0 | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 0 | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 0 | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 0 | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 0 | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 1.71k | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<2>(float*) Line | Count | Source | 1621 | 127 | { | 1622 | 127 | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 127 | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 127 | 3.535536e-01, 3.535536e-01, | 1634 | 127 | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 127 | 4.619398e-01, 1.913422e-01, | 1636 | 127 | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 127 | 3.535536e-01, -3.535536e-01, | 1638 | 127 | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 127 | 1.913422e-01, -4.619398e-01, | 1640 | 127 | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 127 | 4.903927e-01, 4.157349e-01, | 1643 | 127 | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 127 | 4.157349e-01, -9.754573e-02, | 1645 | 127 | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 127 | 2.777855e-01, -4.903927e-01, | 1647 | 127 | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 127 | 9.754573e-02, -2.777855e-01, | 1649 | 127 | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 127 | }; | 1651 | | | 1652 | 127 | # define ROW0(_X) _X | 1653 | 127 | # define ROW1(_X) _X | 1654 | 127 | # define ROW2(_X) _X | 1655 | 127 | # define ROW3(_X) _X | 1656 | 127 | # define ROW4(_X) _X | 1657 | 127 | # define ROW5(_X) _X | 1658 | 127 | # define ROW6(_X) _X | 1659 | 127 | # define ROW7(_X) _X | 1660 | | | 1661 | 127 | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 127 | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 127 | else if (zeroedRows == 2) | 1670 | 127 | { | 1671 | | | 1672 | 127 | # undef ROW6 | 1673 | 127 | # define ROW6(_X) | 1674 | 127 | IDCT_AVX_ASM (data) | 1675 | 127 | } | 1676 | 0 | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 0 | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 0 | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 0 | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 127 | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<3>(float*) Line | Count | Source | 1621 | 1.61k | { | 1622 | 1.61k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 1.61k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 1.61k | 3.535536e-01, 3.535536e-01, | 1634 | 1.61k | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 1.61k | 4.619398e-01, 1.913422e-01, | 1636 | 1.61k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 1.61k | 3.535536e-01, -3.535536e-01, | 1638 | 1.61k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 1.61k | 1.913422e-01, -4.619398e-01, | 1640 | 1.61k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 1.61k | 4.903927e-01, 4.157349e-01, | 1643 | 1.61k | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 1.61k | 4.157349e-01, -9.754573e-02, | 1645 | 1.61k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 1.61k | 2.777855e-01, -4.903927e-01, | 1647 | 1.61k | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 1.61k | 9.754573e-02, -2.777855e-01, | 1649 | 1.61k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 1.61k | }; | 1651 | | | 1652 | 1.61k | # define ROW0(_X) _X | 1653 | 1.61k | # define ROW1(_X) _X | 1654 | 1.61k | # define ROW2(_X) _X | 1655 | 1.61k | # define ROW3(_X) _X | 1656 | 1.61k | # define ROW4(_X) _X | 1657 | 1.61k | # define ROW5(_X) _X | 1658 | 1.61k | # define ROW6(_X) _X | 1659 | 1.61k | # define ROW7(_X) _X | 1660 | | | 1661 | 1.61k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 1.61k | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 1.61k | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 1.61k | else if (zeroedRows == 3) | 1677 | 1.61k | { | 1678 | | | 1679 | 1.61k | # undef ROW5 | 1680 | 1.61k | # define ROW5(_X) | 1681 | 1.61k | IDCT_AVX_ASM (data) | 1682 | 1.61k | } | 1683 | 0 | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 0 | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 0 | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 1.61k | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<4>(float*) Line | Count | Source | 1621 | 713 | { | 1622 | 713 | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 713 | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 713 | 3.535536e-01, 3.535536e-01, | 1634 | 713 | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 713 | 4.619398e-01, 1.913422e-01, | 1636 | 713 | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 713 | 3.535536e-01, -3.535536e-01, | 1638 | 713 | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 713 | 1.913422e-01, -4.619398e-01, | 1640 | 713 | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 713 | 4.903927e-01, 4.157349e-01, | 1643 | 713 | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 713 | 4.157349e-01, -9.754573e-02, | 1645 | 713 | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 713 | 2.777855e-01, -4.903927e-01, | 1647 | 713 | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 713 | 9.754573e-02, -2.777855e-01, | 1649 | 713 | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 713 | }; | 1651 | | | 1652 | 713 | # define ROW0(_X) _X | 1653 | 713 | # define ROW1(_X) _X | 1654 | 713 | # define ROW2(_X) _X | 1655 | 713 | # define ROW3(_X) _X | 1656 | 713 | # define ROW4(_X) _X | 1657 | 713 | # define ROW5(_X) _X | 1658 | 713 | # define ROW6(_X) _X | 1659 | 713 | # define ROW7(_X) _X | 1660 | | | 1661 | 713 | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 713 | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 713 | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 713 | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 713 | else if (zeroedRows == 4) | 1684 | 713 | { | 1685 | | | 1686 | 713 | # undef ROW4 | 1687 | 713 | # define ROW4(_X) | 1688 | 713 | IDCT_AVX_ASM (data) | 1689 | 713 | } | 1690 | 0 | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 0 | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 713 | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<5>(float*) Line | Count | Source | 1621 | 6.10k | { | 1622 | 6.10k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 6.10k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 6.10k | 3.535536e-01, 3.535536e-01, | 1634 | 6.10k | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 6.10k | 4.619398e-01, 1.913422e-01, | 1636 | 6.10k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 6.10k | 3.535536e-01, -3.535536e-01, | 1638 | 6.10k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 6.10k | 1.913422e-01, -4.619398e-01, | 1640 | 6.10k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 6.10k | 4.903927e-01, 4.157349e-01, | 1643 | 6.10k | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 6.10k | 4.157349e-01, -9.754573e-02, | 1645 | 6.10k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 6.10k | 2.777855e-01, -4.903927e-01, | 1647 | 6.10k | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 6.10k | 9.754573e-02, -2.777855e-01, | 1649 | 6.10k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 6.10k | }; | 1651 | | | 1652 | 6.10k | # define ROW0(_X) _X | 1653 | 6.10k | # define ROW1(_X) _X | 1654 | 6.10k | # define ROW2(_X) _X | 1655 | 6.10k | # define ROW3(_X) _X | 1656 | 6.10k | # define ROW4(_X) _X | 1657 | 6.10k | # define ROW5(_X) _X | 1658 | 6.10k | # define ROW6(_X) _X | 1659 | 6.10k | # define ROW7(_X) _X | 1660 | | | 1661 | 6.10k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 6.10k | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 6.10k | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 6.10k | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 6.10k | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 6.10k | else if (zeroedRows == 5) | 1691 | 6.10k | { | 1692 | | | 1693 | 6.10k | # undef ROW3 | 1694 | 6.10k | # define ROW3(_X) | 1695 | 6.10k | IDCT_AVX_ASM (data) | 1696 | 6.10k | } | 1697 | 0 | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 6.10k | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<6>(float*) Line | Count | Source | 1621 | 1.02k | { | 1622 | 1.02k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 1.02k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 1.02k | 3.535536e-01, 3.535536e-01, | 1634 | 1.02k | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 1.02k | 4.619398e-01, 1.913422e-01, | 1636 | 1.02k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 1.02k | 3.535536e-01, -3.535536e-01, | 1638 | 1.02k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 1.02k | 1.913422e-01, -4.619398e-01, | 1640 | 1.02k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 1.02k | 4.903927e-01, 4.157349e-01, | 1643 | 1.02k | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 1.02k | 4.157349e-01, -9.754573e-02, | 1645 | 1.02k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 1.02k | 2.777855e-01, -4.903927e-01, | 1647 | 1.02k | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 1.02k | 9.754573e-02, -2.777855e-01, | 1649 | 1.02k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 1.02k | }; | 1651 | | | 1652 | 1.02k | # define ROW0(_X) _X | 1653 | 1.02k | # define ROW1(_X) _X | 1654 | 1.02k | # define ROW2(_X) _X | 1655 | 1.02k | # define ROW3(_X) _X | 1656 | 1.02k | # define ROW4(_X) _X | 1657 | 1.02k | # define ROW5(_X) _X | 1658 | 1.02k | # define ROW6(_X) _X | 1659 | 1.02k | # define ROW7(_X) _X | 1660 | | | 1661 | 1.02k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 1.02k | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 1.02k | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 1.02k | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 1.02k | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 1.02k | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 1.02k | else if (zeroedRows == 6) | 1698 | 1.02k | { | 1699 | | | 1700 | 1.02k | # undef ROW2 | 1701 | 1.02k | # define ROW2(_X) | 1702 | 1.02k | IDCT_AVX_ASM (data) | 1703 | 1.02k | } | 1704 | 0 | else if (zeroedRows == 7) | 1705 | 0 | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | |
| 1709 | 0 | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 0 | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | |
| 1717 | 0 | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 0 | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 0 | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 0 | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 0 | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | |
| 1725 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | |
| 1727 | 0 | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 0 | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 0 | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 0 | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | |
| 1732 | 0 | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | |
| 1734 | 0 | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 0 | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | |
| 1737 | 0 | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 0 | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 0 | "vmovaps %%ymm0, (%0) \n" | 1749 | 0 | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 0 | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 0 | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 0 | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 0 | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 0 | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 0 | "vmovaps %%ymm0, 224(%0) \n" | 1756 | |
| 1757 | 0 | # ifndef __AVX__ | 1758 | 0 | "vzeroupper \n" | 1759 | 0 | # endif /* __AVX__ */ | 1760 | 0 | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 0 | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 1.02k | } |
ImfDwaCompressor.cpp:void Imf_3_3::(anonymous namespace)::dctInverse8x8_avx<7>(float*) Line | Count | Source | 1621 | 12.9k | { | 1622 | 12.9k | #if defined IMF_HAVE_GCC_INLINEASM_X86_64 | 1623 | | | 1624 | | /* The column-major version of M1, followed by the | 1625 | | * column-major version of M2: | 1626 | | * | 1627 | | * [ a c a f ] [ b d e g ] | 1628 | | * M1 = [ a f -a -c ] M2 = [ d -g -b -e ] | 1629 | | * [ a -f -a c ] [ e -b g d ] | 1630 | | * [ a -c a -f ] [ g -e d -b ] | 1631 | | */ | 1632 | 12.9k | const float sAvxCoef[32] __attribute__ ((aligned (32))) = { | 1633 | 12.9k | 3.535536e-01, 3.535536e-01, | 1634 | 12.9k | 3.535536e-01, 3.535536e-01, /* a a a a */ | 1635 | 12.9k | 4.619398e-01, 1.913422e-01, | 1636 | 12.9k | -1.913422e-01, -4.619398e-01, /* c f -f -c */ | 1637 | 12.9k | 3.535536e-01, -3.535536e-01, | 1638 | 12.9k | -3.535536e-01, 3.535536e-01, /* a -a -a a */ | 1639 | 12.9k | 1.913422e-01, -4.619398e-01, | 1640 | 12.9k | 4.619398e-01, -1.913422e-01, /* f -c c -f */ | 1641 | | | 1642 | 12.9k | 4.903927e-01, 4.157349e-01, | 1643 | 12.9k | 2.777855e-01, 9.754573e-02, /* b d e g */ | 1644 | 12.9k | 4.157349e-01, -9.754573e-02, | 1645 | 12.9k | -4.903927e-01, -2.777855e-01, /* d -g -b -e */ | 1646 | 12.9k | 2.777855e-01, -4.903927e-01, | 1647 | 12.9k | 9.754573e-02, 4.157349e-01, /* e -b g d */ | 1648 | 12.9k | 9.754573e-02, -2.777855e-01, | 1649 | 12.9k | 4.157349e-01, -4.903927e-01 /* g -e d -b */ | 1650 | 12.9k | }; | 1651 | | | 1652 | 12.9k | # define ROW0(_X) _X | 1653 | 12.9k | # define ROW1(_X) _X | 1654 | 12.9k | # define ROW2(_X) _X | 1655 | 12.9k | # define ROW3(_X) _X | 1656 | 12.9k | # define ROW4(_X) _X | 1657 | 12.9k | # define ROW5(_X) _X | 1658 | 12.9k | # define ROW6(_X) _X | 1659 | 12.9k | # define ROW7(_X) _X | 1660 | | | 1661 | 12.9k | if (zeroedRows == 0) { IDCT_AVX_ASM (data) } | 1662 | 12.9k | else if (zeroedRows == 1) | 1663 | 0 | { | 1664 | |
| 1665 | 0 | # undef ROW7 | 1666 | 0 | # define ROW7(_X) | 1667 | 0 | IDCT_AVX_ASM (data) | 1668 | 0 | } | 1669 | 12.9k | else if (zeroedRows == 2) | 1670 | 0 | { | 1671 | |
| 1672 | 0 | # undef ROW6 | 1673 | 0 | # define ROW6(_X) | 1674 | 0 | IDCT_AVX_ASM (data) | 1675 | 0 | } | 1676 | 12.9k | else if (zeroedRows == 3) | 1677 | 0 | { | 1678 | |
| 1679 | 0 | # undef ROW5 | 1680 | 0 | # define ROW5(_X) | 1681 | 0 | IDCT_AVX_ASM (data) | 1682 | 0 | } | 1683 | 12.9k | else if (zeroedRows == 4) | 1684 | 0 | { | 1685 | |
| 1686 | 0 | # undef ROW4 | 1687 | 0 | # define ROW4(_X) | 1688 | 0 | IDCT_AVX_ASM (data) | 1689 | 0 | } | 1690 | 12.9k | else if (zeroedRows == 5) | 1691 | 0 | { | 1692 | |
| 1693 | 0 | # undef ROW3 | 1694 | 0 | # define ROW3(_X) | 1695 | 0 | IDCT_AVX_ASM (data) | 1696 | 0 | } | 1697 | 12.9k | else if (zeroedRows == 6) | 1698 | 0 | { | 1699 | |
| 1700 | 0 | # undef ROW2 | 1701 | 0 | # define ROW2(_X) | 1702 | 0 | IDCT_AVX_ASM (data) | 1703 | 0 | } | 1704 | 12.9k | else if (zeroedRows == 7) | 1705 | 12.9k | { | 1706 | | // autoformatting wants to add a space between the doubled %% | 1707 | | // clang-format off | 1708 | | | 1709 | 12.9k | __asm__( | 1710 | | | 1711 | | /* ============================================== | 1712 | | * Row 1D DCT | 1713 | | * ---------------------------------------------- | 1714 | | */ | 1715 | 12.9k | IDCT_AVX_SETUP_2_ROWS (0, 4, 14, 15, 0, 16, 32, 48) | 1716 | | | 1717 | 12.9k | "vbroadcastf128 (%1), %%ymm8 \n" | 1718 | 12.9k | "vbroadcastf128 16(%1), %%ymm9 \n" | 1719 | 12.9k | "vbroadcastf128 32(%1), %%ymm10 \n" | 1720 | 12.9k | "vbroadcastf128 48(%1), %%ymm11 \n" | 1721 | | | 1722 | | /* Stash a vector of [a a a a | a a a a] away in ymm2 */ | 1723 | 12.9k | "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n" | 1724 | | | 1725 | 12.9k | IDCT_AVX_MMULT_ROWS (%% ymm0) | 1726 | | | 1727 | 12.9k | "vbroadcastf128 64(%1), %%ymm8 \n" | 1728 | 12.9k | "vbroadcastf128 80(%1), %%ymm9 \n" | 1729 | 12.9k | "vbroadcastf128 96(%1), %%ymm10 \n" | 1730 | 12.9k | "vbroadcastf128 112(%1), %%ymm11 \n" | 1731 | | | 1732 | 12.9k | IDCT_AVX_MMULT_ROWS (%% ymm4) | 1733 | | | 1734 | 12.9k | IDCT_AVX_EO_TO_ROW_HALVES ( | 1735 | 12.9k | %% ymm0, %% ymm4, %% ymm0, %% ymm12) | 1736 | | | 1737 | 12.9k | "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" | 1738 | | | 1739 | | /* ============================================== | 1740 | | * Column 1D DCT | 1741 | | * ---------------------------------------------- | 1742 | | */ | 1743 | | | 1744 | | /* DC only, so multiple by a and we're done */ | 1745 | 12.9k | "vmulps %%ymm2, %%ymm0, %%ymm0 \n" | 1746 | | | 1747 | | /* Copy out results */ | 1748 | 12.9k | "vmovaps %%ymm0, (%0) \n" | 1749 | 12.9k | "vmovaps %%ymm0, 32(%0) \n" | 1750 | 12.9k | "vmovaps %%ymm0, 64(%0) \n" | 1751 | 12.9k | "vmovaps %%ymm0, 96(%0) \n" | 1752 | 12.9k | "vmovaps %%ymm0, 128(%0) \n" | 1753 | 12.9k | "vmovaps %%ymm0, 160(%0) \n" | 1754 | 12.9k | "vmovaps %%ymm0, 192(%0) \n" | 1755 | 12.9k | "vmovaps %%ymm0, 224(%0) \n" | 1756 | | | 1757 | 12.9k | # ifndef __AVX__ | 1758 | 12.9k | "vzeroupper \n" | 1759 | 12.9k | # endif /* __AVX__ */ | 1760 | 12.9k | IDCT_AVX_OIC (data)); | 1761 | | // clang-format on | 1762 | 12.9k | } | 1763 | 0 | else | 1764 | 0 | { | 1765 | 0 | assert (false); // Invalid template instance parameter | 1766 | 0 | } | 1767 | | #else /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1768 | | | 1769 | | dctInverse8x8_scalar<zeroedRows> (data); | 1770 | | | 1771 | | #endif /* IMF_HAVE_GCC_INLINEASM_X86_64 */ | 1772 | 12.9k | } |
1773 | | |
1774 | | #undef IDCT_AVX_SETUP_2_ROWS |
1775 | | #undef IDCT_AVX_MMULT_ROWS |
1776 | | #undef IDCT_AVX_EO_TO_ROW_HALVES |
1777 | | #undef IDCT_AVX_BODY |
1778 | | #undef IDCT_AVX_OIC |
1779 | | |
1780 | | // |
1781 | | // Full 8x8 Forward DCT: |
1782 | | // |
1783 | | // Base forward 8x8 DCT implementation. Works on the data in-place |
1784 | | // |
1785 | | // The implementation follows Pennebaker + Mitchell, |
1786 | | // section 4.3.2, and is illustrated in figure 4-7. |
1787 | | // |
1788 | | // The basic idea is that the 1D DCT math reduces to: |
1789 | | // |
1790 | | // 2*out_0 = c_4 [(s_07 + s_34) + (s_12 + s_56)] |
1791 | | // 2*out_4 = c_4 [(s_07 + s_34) - (s_12 + s_56)] |
1792 | | // |
1793 | | // {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34)) |
1794 | | // |
1795 | | // {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56), |
1796 | | // d_34 - c_4 (d_12 + d_56)) |
1797 | | // |
1798 | | // {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56), |
1799 | | // -d_34 - c_4 (d_12 + d_56)) |
1800 | | // |
1801 | | // where: |
1802 | | // |
1803 | | // c_i = cos(i*pi/16) |
1804 | | // s_i = sin(i*pi/16) |
1805 | | // |
1806 | | // s_ij = in_i + in_j |
1807 | | // d_ij = in_i - in_j |
1808 | | // |
1809 | | // rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y} |
1810 | | // |
1811 | | // We'll run the DCT in two passes. First, run the 1D DCT on |
1812 | | // the rows, in-place. Then, run over the columns in-place, |
1813 | | // and be done with it. |
1814 | | // |
1815 | | |
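The constants used by both implementations below follow directly from the c_i = cos(i*pi/16) definition above. As a rough standalone sketch (plain C++, not part of this header), the following prints each c_i together with the 0.5*c_i form that appears below as c1Half..c7Half and as the cNHalfVec literals in the SSE2 path:

    #include <cmath>
    #include <cstdio>

    int main ()
    {
        const double pi = 3.14159265358979323846;
        for (int i = 1; i <= 7; ++i)
        {
            // c_i = cos(i*pi/16); the implementations below bake in 0.5*c_i
            // so that each 1D pass carries the required factor of 1/2.
            double ci = std::cos (pi * i / 16.0);
            std::printf ("c%d = %.9f   0.5*c%d = %.9f\n", i, ci, i, 0.5 * ci);
        }
        // c4 = cos(pi/4) = 1/sqrt(2) ~= 0.70710678, the value hard-coded
        // below as c4 / c4Vec.
        return 0;
    }

For example, 0.5*c1 ~= 0.49039264 and 0.5*c6 ~= 0.19134172, matching the literals used in the SSE2 implementation.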
1816 | | #ifndef IMF_HAVE_SSE2 |
1817 | | |
1818 | | // |
1819 | | // Default implementation |
1820 | | // |
1821 | | |
1822 | | void |
1823 | | dctForward8x8 (float* data) |
1824 | | { |
1825 | | float A0, A1, A2, A3, A4, A5, A6, A7; |
1826 | | float K0, K1, rot_x, rot_y; |
1827 | | |
1828 | | float* srcPtr = data; |
1829 | | float* dstPtr = data; |
1830 | | |
1831 | | const float c1 = cosf (3.14159f * 1.0f / 16.0f); |
1832 | | const float c2 = cosf (3.14159f * 2.0f / 16.0f); |
1833 | | const float c3 = cosf (3.14159f * 3.0f / 16.0f); |
1834 | | const float c4 = cosf (3.14159f * 4.0f / 16.0f); |
1835 | | const float c5 = cosf (3.14159f * 5.0f / 16.0f); |
1836 | | const float c6 = cosf (3.14159f * 6.0f / 16.0f); |
1837 | | const float c7 = cosf (3.14159f * 7.0f / 16.0f); |
1838 | | |
1839 | | const float c1Half = .5f * c1; |
1840 | | const float c2Half = .5f * c2; |
1841 | | const float c3Half = .5f * c3; |
1842 | | const float c5Half = .5f * c5; |
1843 | | const float c6Half = .5f * c6; |
1844 | | const float c7Half = .5f * c7; |
1845 | | |
1846 | | // |
1847 | | // First pass - do a 1D DCT over the rows and write the |
1848 | | // results back in place |
1849 | | // |
1850 | | |
1851 | | for (int row = 0; row < 8; ++row) |
1852 | | { |
1853 | | float* srcRowPtr = srcPtr + 8 * row; |
1854 | | float* dstRowPtr = dstPtr + 8 * row; |
1855 | | |
1856 | | A0 = srcRowPtr[0] + srcRowPtr[7]; |
1857 | | A1 = srcRowPtr[1] + srcRowPtr[2]; |
1858 | | A2 = srcRowPtr[1] - srcRowPtr[2]; |
1859 | | A3 = srcRowPtr[3] + srcRowPtr[4]; |
1860 | | A4 = srcRowPtr[3] - srcRowPtr[4]; |
1861 | | A5 = srcRowPtr[5] + srcRowPtr[6]; |
1862 | | A6 = srcRowPtr[5] - srcRowPtr[6]; |
1863 | | A7 = srcRowPtr[0] - srcRowPtr[7]; |
1864 | | |
1865 | | K0 = c4 * (A0 + A3); |
1866 | | K1 = c4 * (A1 + A5); |
1867 | | |
1868 | | dstRowPtr[0] = .5f * (K0 + K1); |
1869 | | dstRowPtr[4] = .5f * (K0 - K1); |
1870 | | |
1871 | | // |
1872 | | // (2*dst2, 2*dst6) = rot 6 (d12 - d56, s07 - s34) |
1873 | | // |
1874 | | |
1875 | | rot_x = A2 - A6; |
1876 | | rot_y = A0 - A3; |
1877 | | |
1878 | | dstRowPtr[2] = c6Half * rot_x + c2Half * rot_y; |
1879 | | dstRowPtr[6] = c6Half * rot_y - c2Half * rot_x; |
1880 | | |
1881 | | // |
1882 | | // K0 and K1 stay live until dst[1], dst[7], |
1883 | | // dst[3], and dst[5] have all been computed. |
1884 | | // |
1885 | | |
1886 | | K0 = c4 * (A1 - A5); |
1887 | | K1 = -1 * c4 * (A2 + A6); |
1888 | | |
1889 | | // |
1890 | | // Two ways to do a rotation: |
1891 | | // |
1892 | | // rot i (x, y) = |
1893 | | // X = c_i*x + s_i*y |
1894 | | // Y = -s_i*x + c_i*y |
1895 | | // |
1896 | | // OR |
1897 | | // |
1898 | | // X = c_i*(x+y) + (s_i-c_i)*y |
1899 | | // Y = c_i*(x+y) - (s_i+c_i)*x |
1900 | | // |
1901 | | // the first case has 4 multiplies, but fewer constants, |
1902 | | // while the 2nd case has fewer multiplies but takes more space. |
1903 | | |
1904 | | // |
1905 | | // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 ) |
1906 | | // |
1907 | | |
1908 | | rot_x = A7 - K0; |
1909 | | rot_y = A4 + K1; |
1910 | | |
1911 | | dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y; |
1912 | | dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y; |
1913 | | |
1914 | | // |
1915 | | // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 ) |
1916 | | // |
1917 | | |
1918 | | rot_x = A7 + K0; |
1919 | | rot_y = K1 - A4; |
1920 | | |
1921 | | // |
1922 | | // A4 and A7 are no longer needed; all of the A values are now inactive. |
1923 | | // |
1924 | | |
1925 | | dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y; |
1926 | | dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y; |
1927 | | } |
1928 | | |
1929 | | // |
1930 | | // Second pass - do the same, but on the columns |
1931 | | // |
1932 | | |
1933 | | for (int column = 0; column < 8; ++column) |
1934 | | { |
1935 | | |
1936 | | A0 = srcPtr[column] + srcPtr[56 + column]; |
1937 | | A7 = srcPtr[column] - srcPtr[56 + column]; |
1938 | | |
1939 | | A1 = srcPtr[8 + column] + srcPtr[16 + column]; |
1940 | | A2 = srcPtr[8 + column] - srcPtr[16 + column]; |
1941 | | |
1942 | | A3 = srcPtr[24 + column] + srcPtr[32 + column]; |
1943 | | A4 = srcPtr[24 + column] - srcPtr[32 + column]; |
1944 | | |
1945 | | A5 = srcPtr[40 + column] + srcPtr[48 + column]; |
1946 | | A6 = srcPtr[40 + column] - srcPtr[48 + column]; |
1947 | | |
1948 | | K0 = c4 * (A0 + A3); |
1949 | | K1 = c4 * (A1 + A5); |
1950 | | |
1951 | | dstPtr[column] = .5f * (K0 + K1); |
1952 | | dstPtr[32 + column] = .5f * (K0 - K1); |
1953 | | |
1954 | | // |
1955 | | // (2*dst2, 2*dst6) = rot 6 ( d12 - d56, s07 - s34 ) |
1956 | | // |
1957 | | |
1958 | | rot_x = A2 - A6; |
1959 | | rot_y = A0 - A3; |
1960 | | |
1961 | | dstPtr[16 + column] = .5f * (c6 * rot_x + c2 * rot_y); |
1962 | | dstPtr[48 + column] = .5f * (c6 * rot_y - c2 * rot_x); |
1963 | | |
1964 | | // |
1965 | | // K0 and K1 stay live until dst[1], dst[7], |
1966 | | // dst[3], and dst[5] have all been computed. |
1967 | | // |
1968 | | |
1969 | | K0 = c4 * (A1 - A5); |
1970 | | K1 = -1 * c4 * (A2 + A6); |
1971 | | |
1972 | | // |
1973 | | // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 ) |
1974 | | // |
1975 | | |
1976 | | rot_x = A7 - K0; |
1977 | | rot_y = A4 + K1; |
1978 | | |
1979 | | dstPtr[24 + column] = .5f * (c3 * rot_x - c5 * rot_y); |
1980 | | dstPtr[40 + column] = .5f * (c5 * rot_x + c3 * rot_y); |
1981 | | |
1982 | | // |
1983 | | // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 ) |
1984 | | // |
1985 | | |
1986 | | rot_x = A7 + K0; |
1987 | | rot_y = K1 - A4; |
1988 | | |
1989 | | dstPtr[8 + column] = .5f * (c1 * rot_x - c7 * rot_y); |
1990 | | dstPtr[56 + column] = .5f * (c7 * rot_x + c1 * rot_y); |
1991 | | } |
1992 | | } |
1993 | | |
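A quick way to sanity-check the scalar path above: for a constant 8x8 block every AC term cancels, and with the 0.5 scaling applied in each 1D pass the DC coefficient comes out to 8x the input value. A hedged sketch (not part of the library; checkDcOnly is a hypothetical test helper, and it assumes this header is included in the test translation unit so that dctForward8x8 is reachable through OPENEXR_IMF_INTERNAL_NAMESPACE):

    #include <cmath>
    #include <cstdio>

    static bool
    checkDcOnly ()
    {
        alignas (32) float block[64]; // 32-byte aligned, per the header's contract
        for (int i = 0; i < 64; ++i)
            block[i] = 1.0f;

        OPENEXR_IMF_INTERNAL_NAMESPACE::dctForward8x8 (block);

        bool ok = std::fabs (block[0] - 8.0f) < 1e-4f; // DC term: 8 * 1.0
        for (int i = 1; i < 64; ++i)
            ok = ok && std::fabs (block[i]) < 1e-4f;   // all AC terms ~ 0

        std::printf ("DC = %f (%s)\n", block[0], ok ? "ok" : "mismatch");
        return ok;
    }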
1994 | | #else /* IMF_HAVE_SSE2 */ |
1995 | | |
1996 | | // |
1997 | | // SSE2 implementation |
1998 | | // |
1999 | | // Here, we always perform a column-wise operation |
2000 | | // plus transposes. It might be faster to handle the |
2001 | | // row pass and the column pass differently. |
2002 | | // |
2003 | | |
2004 | | void |
2005 | | dctForward8x8 (float* data) |
2006 | 0 | { |
2007 | 0 | __m128* srcVec = (__m128*) data; |
2008 | 0 | __m128 a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec; |
2009 | 0 | __m128 k0Vec, k1Vec, rotXVec, rotYVec; |
2010 | 0 | __m128 transTmp[4], transTmp2[4]; |
2011 | |
2012 | 0 | __m128 c4Vec = {.70710678f, .70710678f, .70710678f, .70710678f}; |
2013 | 0 | __m128 c4NegVec = {-.70710678f, -.70710678f, -.70710678f, -.70710678f}; |
2014 | |
2015 | 0 | __m128 c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f}; |
2016 | 0 | __m128 c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f}; |
2017 | 0 | __m128 c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f}; |
2018 | 0 | __m128 c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f}; |
2019 | 0 | __m128 c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f}; |
2020 | 0 | __m128 c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f}; |
2021 | |
2022 | 0 | __m128 halfVec = {.5f, .5f, .5f, .5f}; |
2023 | |
2024 | 0 | for (int iter = 0; iter < 2; ++iter) |
2025 | 0 | { |
2026 | | // |
2027 | | // Operate on 4 columns at a time. The |
2028 | | // offsets into our row-major array are: |
2029 | | // 0: 0 1 |
2030 | | // 1: 2 3 |
2031 | | // 2: 4 5 |
2032 | | // 3: 6 7 |
2033 | | // 4: 8 9 |
2034 | | // 5: 10 11 |
2035 | | // 6: 12 13 |
2036 | | // 7: 14 15 |
2037 | | // |
2038 | |
2039 | 0 | for (int pass = 0; pass < 2; ++pass) |
2040 | 0 | { |
2041 | 0 | a0Vec = _mm_add_ps (srcVec[0 + pass], srcVec[14 + pass]); |
2042 | 0 | a1Vec = _mm_add_ps (srcVec[2 + pass], srcVec[4 + pass]); |
2043 | 0 | a3Vec = _mm_add_ps (srcVec[6 + pass], srcVec[8 + pass]); |
2044 | 0 | a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]); |
2045 | |
2046 | 0 | a7Vec = _mm_sub_ps (srcVec[0 + pass], srcVec[14 + pass]); |
2047 | 0 | a2Vec = _mm_sub_ps (srcVec[2 + pass], srcVec[4 + pass]); |
2048 | 0 | a4Vec = _mm_sub_ps (srcVec[6 + pass], srcVec[8 + pass]); |
2049 | 0 | a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]); |
2050 | | |
2051 | | // |
2052 | | // First stage; Compute out_0 and out_4 |
2053 | | // |
2054 | |
2055 | 0 | k0Vec = _mm_add_ps (a0Vec, a3Vec); |
2056 | 0 | k1Vec = _mm_add_ps (a1Vec, a5Vec); |
2057 | |
2058 | 0 | k0Vec = _mm_mul_ps (c4Vec, k0Vec); |
2059 | 0 | k1Vec = _mm_mul_ps (c4Vec, k1Vec); |
2060 | |
2061 | 0 | srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec); |
2062 | 0 | srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec); |
2063 | |
2064 | 0 | srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec); |
2065 | 0 | srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec); |
2066 | | |
2067 | | // |
2068 | | // Second stage; Compute out_2 and out_6 |
2069 | | // |
2070 | |
2071 | 0 | k0Vec = _mm_sub_ps (a2Vec, a6Vec); |
2072 | 0 | k1Vec = _mm_sub_ps (a0Vec, a3Vec); |
2073 | |
2074 | 0 | srcVec[4 + pass] = _mm_add_ps ( |
2075 | 0 | _mm_mul_ps (c6HalfVec, k0Vec), _mm_mul_ps (c2HalfVec, k1Vec)); |
2076 | |
2077 | 0 | srcVec[12 + pass] = _mm_sub_ps ( |
2078 | 0 | _mm_mul_ps (c6HalfVec, k1Vec), _mm_mul_ps (c2HalfVec, k0Vec)); |
2079 | | |
2080 | | // |
2081 | | // Precompute K0 and K1 for the remaining stages |
2082 | | // |
2083 | |
2084 | 0 | k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec); |
2085 | 0 | k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec); |
2086 | | |
2087 | | // |
2088 | | // Third Stage, compute out_3 and out_5 |
2089 | | // |
2090 | |
2091 | 0 | rotXVec = _mm_sub_ps (a7Vec, k0Vec); |
2092 | 0 | rotYVec = _mm_add_ps (a4Vec, k1Vec); |
2093 | |
2094 | 0 | srcVec[6 + pass] = _mm_sub_ps ( |
2095 | 0 | _mm_mul_ps (c3HalfVec, rotXVec), |
2096 | 0 | _mm_mul_ps (c5HalfVec, rotYVec)); |
2097 | |
|
2098 | 0 | srcVec[10 + pass] = _mm_add_ps ( |
2099 | 0 | _mm_mul_ps (c5HalfVec, rotXVec), |
2100 | 0 | _mm_mul_ps (c3HalfVec, rotYVec)); |
2101 | | |
2102 | | // |
2103 | | // Fourth Stage, compute out_1 and out_7 |
2104 | | // |
2105 | |
2106 | 0 | rotXVec = _mm_add_ps (a7Vec, k0Vec); |
2107 | 0 | rotYVec = _mm_sub_ps (k1Vec, a4Vec); |
2108 | |
2109 | 0 | srcVec[2 + pass] = _mm_sub_ps ( |
2110 | 0 | _mm_mul_ps (c1HalfVec, rotXVec), |
2111 | 0 | _mm_mul_ps (c7HalfVec, rotYVec)); |
2112 | |
2113 | 0 | srcVec[14 + pass] = _mm_add_ps ( |
2114 | 0 | _mm_mul_ps (c7HalfVec, rotXVec), |
2115 | 0 | _mm_mul_ps (c1HalfVec, rotYVec)); |
2116 | 0 | } |
2117 | | |
2118 | | // |
2119 | | // Transpose the matrix, in 4x4 blocks. So, if we have our |
2120 | | // 8x8 matrix divided into 4x4 blocks: |
2121 | | // |
2122 | | // M0 | M1 M0t | M2t |
2123 | | // ----+--- --> -----+------ |
2124 | | // M2 | M3 M1t | M3t |
2125 | | // |
2126 | | |
2127 | | // |
2128 | | // M0t, done in place, the first half. |
2129 | | // |
2130 | |
2131 | 0 | transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44); |
2132 | 0 | transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44); |
2133 | 0 | transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE); |
2134 | 0 | transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE); |
2135 | | |
2136 | | // |
2137 | | // M3t, also done in place, the first half. |
2138 | | // |
2139 | |
2140 | 0 | transTmp2[0] = _mm_shuffle_ps (srcVec[9], srcVec[11], 0x44); |
2141 | 0 | transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44); |
2142 | 0 | transTmp2[2] = _mm_shuffle_ps (srcVec[9], srcVec[11], 0xEE); |
2143 | 0 | transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE); |
2144 | | |
2145 | | // |
2146 | | // M0t, the second half. |
2147 | | // |
2148 | |
2149 | 0 | srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88); |
2150 | 0 | srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88); |
2151 | 0 | srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD); |
2152 | 0 | srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD); |
2153 | | |
2154 | | // |
2155 | | // M3t, the second half. |
2156 | | // |
2157 | |
2158 | 0 | srcVec[9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88); |
2159 | 0 | srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88); |
2160 | 0 | srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD); |
2161 | 0 | srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD); |
2162 | | |
2163 | | // |
2164 | | // M1 and M2 need to be done at the same time, because we're |
2165 | | // swapping. |
2166 | | // |
2167 | | // First, the first half of M1t |
2168 | | // |
2169 | |
2170 | 0 | transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44); |
2171 | 0 | transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44); |
2172 | 0 | transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE); |
2173 | 0 | transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE); |
2174 | | |
2175 | | // |
2176 | | // And the first half of M2t |
2177 | | // |
2178 | |
2179 | 0 | transTmp2[0] = _mm_shuffle_ps (srcVec[8], srcVec[10], 0x44); |
2180 | 0 | transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44); |
2181 | 0 | transTmp2[2] = _mm_shuffle_ps (srcVec[8], srcVec[10], 0xEE); |
2182 | 0 | transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE); |
2183 | | |
2184 | | // |
2185 | | // Second half of M1t |
2186 | | // |
2187 | |
2188 | 0 | srcVec[8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88); |
2189 | 0 | srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88); |
2190 | 0 | srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD); |
2191 | 0 | srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD); |
2192 | | |
2193 | | // |
2194 | | // Second half of M2t |
2195 | | // |
2196 | |
2197 | 0 | srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88); |
2198 | 0 | srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88); |
2199 | 0 | srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD); |
2200 | 0 | srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD); |
2201 | 0 | } |
2202 | 0 | } |
2203 | | |
2204 | | #endif /* IMF_HAVE_SSE2 */ |
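The 4x4-block transposes in the SSE2 path rely on a two-step _mm_shuffle_ps pattern: masks 0x44 and 0xEE gather the low and high halves of a pair of rows, then 0x88 and 0xDD pick out the even and odd elements to form the transposed columns. A standalone sketch of the same pattern on a single 4x4 block (not part of this header; transpose4x4 is an illustrative helper, equivalent in effect to the usual _MM_TRANSPOSE4_PS macro):

    #include <xmmintrin.h>

    static void
    transpose4x4 (__m128& r0, __m128& r1, __m128& r2, __m128& r3)
    {
        __m128 t0 = _mm_shuffle_ps (r0, r1, 0x44); // a0 a1 b0 b1
        __m128 t1 = _mm_shuffle_ps (r2, r3, 0x44); // c0 c1 d0 d1
        __m128 t2 = _mm_shuffle_ps (r0, r1, 0xEE); // a2 a3 b2 b3
        __m128 t3 = _mm_shuffle_ps (r2, r3, 0xEE); // c2 c3 d2 d3

        r0 = _mm_shuffle_ps (t0, t1, 0x88); // a0 b0 c0 d0
        r1 = _mm_shuffle_ps (t0, t1, 0xDD); // a1 b1 c1 d1
        r2 = _mm_shuffle_ps (t2, t3, 0x88); // a2 b2 c2 d2
        r3 = _mm_shuffle_ps (t2, t3, 0xDD); // a3 b3 c3 d3
    }

dctForward8x8 applies this pattern to the M0 and M3 diagonal blocks in place, and to M1 and M2 with the results written to each other's locations, which swaps the off-diagonal blocks as the diagram above shows.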
2205 | | |
2206 | | } // namespace |
2207 | | |
2208 | | OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT |
2209 | | |
2210 | | #endif |