/src/freeimage-svn/FreeImage/trunk/Source/OpenEXR/IlmImf/ImfOptimizedPixelReading.h
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////////// |
2 | | // |
3 | | // Copyright (c) 2012, Autodesk, Inc. |
4 | | // |
5 | | // All rights reserved. |
6 | | // |
7 | | // Implementation of IIF-specific file format and speed optimizations |
8 | | // provided by Innobec Technologies inc on behalf of Autodesk. |
9 | | // |
10 | | // Redistribution and use in source and binary forms, with or without |
11 | | // modification, are permitted provided that the following conditions are |
12 | | // met: |
13 | | // * Redistributions of source code must retain the above copyright |
14 | | // notice, this list of conditions and the following disclaimer. |
15 | | // * Redistributions in binary form must reproduce the above |
16 | | // copyright notice, this list of conditions and the following disclaimer |
17 | | // in the documentation and/or other materials provided with the |
18 | | // distribution. |
19 | | // * Neither the name of Industrial Light & Magic nor the names of |
20 | | // its contributors may be used to endorse or promote products derived |
21 | | // from this software without specific prior written permission. |
22 | | // |
23 | | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
24 | | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
25 | | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
26 | | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
27 | | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
28 | | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
29 | | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
30 | | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
31 | | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
32 | | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
33 | | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
34 | | // |
35 | | /////////////////////////////////////////////////////////////////////////// |
36 | | |
37 | | #pragma once |
38 | | |
39 | | #ifndef INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H |
40 | | #define INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H |
41 | | |
42 | | #include "ImfSimd.h" |
43 | | #include "ImfSystemSpecific.h" |
44 | | #include <iostream> |
45 | | #include "ImfChannelList.h" |
46 | | #include "ImfFrameBuffer.h" |
47 | | #include "ImfStringVectorAttribute.h" |
48 | | |
49 | | OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER |
50 | | |
//
// Small value type carrying the outcome of the optimization check:
// whether the optimized pixel-reading path may be used, plus a
// y-sampling value that accompanies that decision.
//
class OptimizationMode
{
  public:

    // True only once a caller has explicitly enabled the optimized path.
    bool _optimizable;

    // NOTE(review): presumably the vertical sampling rate shared by the
    // channels being read -- confirm against the code that fills it in.
    int  _ySampling;

    //
    // Both members are given defined values.  _ySampling used to be left
    // uninitialized, so copying or reading a default-constructed object
    // touched an indeterminate int.  1 is used as the neutral value
    // (NOTE(review): assumed to mean "every scan line" -- confirm).
    //
    OptimizationMode() : _optimizable (false), _ySampling (1) {}
};
61 | | |
62 | | |
63 | | #if IMF_HAVE_SSE2 |
64 | | |
65 | | |
66 | | //------------------------------------------------------------------------ |
67 | | // Test for SSE pointer alignment |
68 | | //------------------------------------------------------------------------ |
//
// Returns true when pPointer lies on a 16-byte boundary, i.e. when the
// four lowest bits of its address are all zero.
//
// The address is converted through size_t instead of unsigned long:
// unsigned long is only 32 bits wide on LLP64 targets such as 64-bit
// Windows, so converting a pointer to it narrows the value.
//
EXR_FORCEINLINE
bool
isPointerSSEAligned (const void* EXR_RESTRICT pPointer)
{
    const size_t address = reinterpret_cast<size_t> (pPointer);
    return (address & 15) == 0;
}
76 | | |
77 | | //------------------------------------------------------------------------ |
78 | | // Load SSE from address into register |
79 | | //------------------------------------------------------------------------ |
80 | | template<bool IS_ALIGNED> |
81 | | EXR_FORCEINLINE |
82 | | __m128i loadSSE (__m128i*& loadAddress) |
83 | | { |
84 | | // throw exception :: this is not accepted |
85 | | return _mm_loadu_si128 (loadAddress); |
86 | | } |
87 | | |
88 | | template<> |
89 | | EXR_FORCEINLINE |
90 | | __m128i loadSSE<false> (__m128i*& loadAddress) |
91 | 0 | { |
92 | 0 | return _mm_loadu_si128 (loadAddress); |
93 | 0 | } |
94 | | |
95 | | template<> |
96 | | EXR_FORCEINLINE |
97 | | __m128i loadSSE<true> (__m128i*& loadAddress) |
98 | 0 | { |
99 | 0 | return _mm_load_si128 (loadAddress); |
100 | 0 | } |
101 | | |
102 | | //------------------------------------------------------------------------ |
103 | | // Store SSE from register into address |
104 | | //------------------------------------------------------------------------ |
105 | | template<bool IS_ALIGNED> |
106 | | EXR_FORCEINLINE |
107 | | void storeSSE (__m128i*& storeAddress, __m128i& dataToStore) |
108 | | { |
109 | | |
110 | | } |
111 | | |
112 | | template<> |
113 | | EXR_FORCEINLINE |
114 | | void |
115 | | storeSSE<false> (__m128i*& storeAddress, __m128i& dataToStore) |
116 | 0 | { |
117 | 0 | _mm_storeu_si128 (storeAddress, dataToStore); |
118 | 0 | } |
119 | | |
120 | | template<> |
121 | | EXR_FORCEINLINE |
122 | | void |
123 | | storeSSE<true> (__m128i*& storeAddress, __m128i& dataToStore) |
124 | 0 | { |
125 | 0 | _mm_stream_si128 (storeAddress, dataToStore); |
126 | 0 | } |
127 | | |
128 | | |
129 | | |
130 | | //------------------------------------------------------------------------ |
131 | | // |
132 | | // Write to RGBA |
133 | | // |
134 | | //------------------------------------------------------------------------ |
135 | | |
136 | | // |
137 | | // Using SSE intrinsics |
138 | | // |
139 | | template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED> |
140 | | EXR_FORCEINLINE |
141 | | void writeToRGBASSETemplate |
142 | | (__m128i*& readPtrSSERed, |
143 | | __m128i*& readPtrSSEGreen, |
144 | | __m128i*& readPtrSSEBlue, |
145 | | __m128i*& readPtrSSEAlpha, |
146 | | __m128i*& writePtrSSE, |
147 | | const size_t& lPixelsToCopySSE) |
148 | 0 | { |
149 | 0 | for (size_t i = 0; i < lPixelsToCopySSE; ++i) |
150 | 0 | { |
151 | 0 | __m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed); |
152 | 0 | __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen); |
153 | 0 | __m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue); |
154 | 0 | __m128i alphaRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEAlpha); |
155 | |
|
156 | 0 | __m128i redGreenRegister = _mm_unpacklo_epi16 (redRegister, |
157 | 0 | greenRegister); |
158 | 0 | __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister, |
159 | 0 | alphaRegister); |
160 | |
|
161 | 0 | __m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister, |
162 | 0 | blueAlphaRegister); |
163 | 0 | __m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister, |
164 | 0 | blueAlphaRegister); |
165 | |
|
166 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register); |
167 | 0 | ++writePtrSSE; |
168 | |
|
169 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register); |
170 | 0 | ++writePtrSSE; |
171 | |
|
172 | 0 | redGreenRegister = _mm_unpackhi_epi16 (redRegister, greenRegister); |
173 | 0 | blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, alphaRegister); |
174 | |
|
175 | 0 | pixel12Register = _mm_unpacklo_epi32 (redGreenRegister, |
176 | 0 | blueAlphaRegister); |
177 | 0 | pixel34Register = _mm_unpackhi_epi32 (redGreenRegister, |
178 | 0 | blueAlphaRegister); |
179 | |
|
180 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register); |
181 | 0 | ++writePtrSSE; |
182 | | |
183 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register); |
184 | 0 | ++writePtrSSE; |
185 | |
|
186 | 0 | ++readPtrSSEAlpha; |
187 | 0 | ++readPtrSSEBlue; |
188 | 0 | ++readPtrSSEGreen; |
189 | 0 | ++readPtrSSERed; |
190 | 0 | } |
191 | 0 | } Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<false, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<false, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<true, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<true, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) |
192 | | |
193 | | // |
194 | | // Not using SSE intrinsics. This is still faster than the alternative |
195 | | // because we have multiple read pointers and therefore we are able to |
196 | | // take advantage of data locality for write operations. |
197 | | // |
//
// Scalar RGBA interleave: for each of lPixelsToCopy pixels, copies one
// sample from each of the red, green, blue and alpha read pointers to four
// consecutive output slots, in that order.  On return every read pointer
// has advanced by lPixelsToCopy and writePtr by 4 * lPixelsToCopy.
//
EXR_FORCEINLINE
void writeToRGBANormal (unsigned short*& readPtrRed,
                        unsigned short*& readPtrGreen,
                        unsigned short*& readPtrBlue,
                        unsigned short*& readPtrAlpha,
                        unsigned short*& writePtr,
                        const size_t& lPixelsToCopy)
{
    for (size_t pixel = 0; pixel < lPixelsToCopy; ++pixel)
    {
        writePtr[0] = *readPtrRed;
        writePtr[1] = *readPtrGreen;
        writePtr[2] = *readPtrBlue;
        writePtr[3] = *readPtrAlpha;

        writePtr += 4;

        ++readPtrRed;
        ++readPtrGreen;
        ++readPtrBlue;
        ++readPtrAlpha;
    }
}
214 | | |
215 | | // |
216 | | // Determine which (template) version to use by checking whether pointers |
217 | | // are aligned |
218 | | // |
219 | | EXR_FORCEINLINE |
220 | | void optimizedWriteToRGBA (unsigned short*& readPtrRed, |
221 | | unsigned short*& readPtrGreen, |
222 | | unsigned short*& readPtrBlue, |
223 | | unsigned short*& readPtrAlpha, |
224 | | unsigned short*& writePtr, |
225 | | const size_t& pixelsToCopySSE, |
226 | | const size_t& pixelsToCopyNormal) |
227 | 0 | { |
228 | 0 | bool readPtrAreAligned = true; |
229 | |
|
230 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrRed); |
231 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrGreen); |
232 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrBlue); |
233 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrAlpha); |
234 | |
|
235 | 0 | bool writePtrIsAligned = isPointerSSEAligned(writePtr); |
236 | |
|
237 | 0 | if (!readPtrAreAligned && !writePtrIsAligned) |
238 | 0 | { |
239 | 0 | writeToRGBASSETemplate<false, false> ((__m128i*&)readPtrRed, |
240 | 0 | (__m128i*&)readPtrGreen, |
241 | 0 | (__m128i*&)readPtrBlue, |
242 | 0 | (__m128i*&)readPtrAlpha, |
243 | 0 | (__m128i*&)writePtr, |
244 | 0 | pixelsToCopySSE); |
245 | 0 | } |
246 | 0 | else if (!readPtrAreAligned && writePtrIsAligned) |
247 | 0 | { |
248 | 0 | writeToRGBASSETemplate<false, true> ((__m128i*&)readPtrRed, |
249 | 0 | (__m128i*&)readPtrGreen, |
250 | 0 | (__m128i*&)readPtrBlue, |
251 | 0 | (__m128i*&)readPtrAlpha, |
252 | 0 | (__m128i*&)writePtr, |
253 | 0 | pixelsToCopySSE); |
254 | 0 | } |
255 | 0 | else if (readPtrAreAligned && !writePtrIsAligned) |
256 | 0 | { |
257 | 0 | writeToRGBASSETemplate<true, false> ((__m128i*&)readPtrRed, |
258 | 0 | (__m128i*&)readPtrGreen, |
259 | 0 | (__m128i*&)readPtrBlue, |
260 | 0 | (__m128i*&)readPtrAlpha, |
261 | 0 | (__m128i*&)writePtr, |
262 | 0 | pixelsToCopySSE); |
263 | 0 | } |
264 | 0 | else if(readPtrAreAligned && writePtrIsAligned) |
265 | 0 | { |
266 | 0 | writeToRGBASSETemplate<true, true> ((__m128i*&)readPtrRed, |
267 | 0 | (__m128i*&)readPtrGreen, |
268 | 0 | (__m128i*&)readPtrBlue, |
269 | 0 | (__m128i*&)readPtrAlpha, |
270 | 0 | (__m128i*&)writePtr, |
271 | 0 | pixelsToCopySSE); |
272 | 0 | } |
273 | |
|
274 | 0 | writeToRGBANormal (readPtrRed, readPtrGreen, readPtrBlue, readPtrAlpha, |
275 | 0 | writePtr, pixelsToCopyNormal); |
276 | 0 | } |
277 | | |
278 | | |
279 | | |
280 | | //------------------------------------------------------------------------ |
281 | | // |
282 | | // Write to RGBA Fill A |
283 | | // |
284 | | //------------------------------------------------------------------------ |
285 | | |
286 | | // |
287 | | // Using SSE intrinsics |
288 | | // |
289 | | template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED> |
290 | | EXR_FORCEINLINE |
291 | | void |
292 | | writeToRGBAFillASSETemplate (__m128i*& readPtrSSERed, |
293 | | __m128i*& readPtrSSEGreen, |
294 | | __m128i*& readPtrSSEBlue, |
295 | | const unsigned short& alphaFillValue, |
296 | | __m128i*& writePtrSSE, |
297 | | const size_t& pixelsToCopySSE) |
298 | 0 | { |
299 | 0 | const __m128i dummyAlphaRegister = _mm_set_epi16 (alphaFillValue, |
300 | 0 | alphaFillValue, |
301 | 0 | alphaFillValue, |
302 | 0 | alphaFillValue, |
303 | 0 | alphaFillValue, |
304 | 0 | alphaFillValue, |
305 | 0 | alphaFillValue, |
306 | 0 | alphaFillValue); |
307 | |
|
308 | 0 | for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter) |
309 | 0 | { |
310 | 0 | __m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed); |
311 | 0 | __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen); |
312 | 0 | __m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue); |
313 | |
|
314 | 0 | __m128i redGreenRegister = _mm_unpacklo_epi16 (redRegister, |
315 | 0 | greenRegister); |
316 | 0 | __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister, |
317 | 0 | dummyAlphaRegister); |
318 | |
|
319 | 0 | __m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister, |
320 | 0 | blueAlphaRegister); |
321 | 0 | __m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister, |
322 | 0 | blueAlphaRegister); |
323 | |
|
324 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register); |
325 | 0 | ++writePtrSSE; |
326 | |
|
327 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register); |
328 | 0 | ++writePtrSSE; |
329 | |
|
330 | 0 | redGreenRegister = _mm_unpackhi_epi16 (redRegister, |
331 | 0 | greenRegister); |
332 | 0 | blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, |
333 | 0 | dummyAlphaRegister); |
334 | |
|
335 | 0 | pixel12Register = _mm_unpacklo_epi32 (redGreenRegister, |
336 | 0 | blueAlphaRegister); |
337 | 0 | pixel34Register = _mm_unpackhi_epi32 (redGreenRegister, |
338 | 0 | blueAlphaRegister); |
339 | |
|
340 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register); |
341 | 0 | ++writePtrSSE; |
342 | |
|
343 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register); |
344 | 0 | ++writePtrSSE; |
345 | |
|
346 | 0 | ++readPtrSSEBlue; |
347 | 0 | ++readPtrSSEGreen; |
348 | 0 | ++readPtrSSERed; |
349 | 0 | } |
350 | 0 | } Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<false, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<false, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<true, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<true, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&) |
351 | | |
352 | | // |
353 | | // Not using SSE intrinsics. This is still faster than the alternative |
354 | | // because we have multiple read pointers and therefore we are able to |
355 | | // take advantage of data locality for write operations. |
356 | | // |
//
// Scalar RGBA interleave with a constant alpha: for each of pixelsToCopy
// pixels, copies one red, one green and one blue sample followed by
// alphaFillValue to four consecutive output slots.  On return every read
// pointer has advanced by pixelsToCopy and writePtr by 4 * pixelsToCopy.
//
EXR_FORCEINLINE
void
writeToRGBAFillANormal (unsigned short*& readPtrRed,
                        unsigned short*& readPtrGreen,
                        unsigned short*& readPtrBlue,
                        const unsigned short& alphaFillValue,
                        unsigned short*& writePtr,
                        const size_t& pixelsToCopy)
{
    for (size_t pixel = 0; pixel < pixelsToCopy; ++pixel)
    {
        writePtr[0] = *readPtrRed;
        writePtr[1] = *readPtrGreen;
        writePtr[2] = *readPtrBlue;
        writePtr[3] = alphaFillValue;

        writePtr += 4;

        ++readPtrRed;
        ++readPtrGreen;
        ++readPtrBlue;
    }
}
374 | | |
375 | | // |
376 | | // Determine which (template) version to use by checking whether pointers |
377 | | // are aligned. |
378 | | // |
379 | | EXR_FORCEINLINE |
380 | | void |
381 | | optimizedWriteToRGBAFillA (unsigned short*& readPtrRed, |
382 | | unsigned short*& readPtrGreen, |
383 | | unsigned short*& readPtrBlue, |
384 | | const unsigned short& alphaFillValue, |
385 | | unsigned short*& writePtr, |
386 | | const size_t& pixelsToCopySSE, |
387 | | const size_t& pixelsToCopyNormal) |
388 | 0 | { |
389 | 0 | bool readPtrAreAligned = true; |
390 | |
|
391 | 0 | readPtrAreAligned &= isPointerSSEAligned (readPtrRed); |
392 | 0 | readPtrAreAligned &= isPointerSSEAligned (readPtrGreen); |
393 | 0 | readPtrAreAligned &= isPointerSSEAligned (readPtrBlue); |
394 | |
|
395 | 0 | bool writePtrIsAligned = isPointerSSEAligned (writePtr); |
396 | |
|
397 | 0 | if (!readPtrAreAligned && !writePtrIsAligned) |
398 | 0 | { |
399 | 0 | writeToRGBAFillASSETemplate<false, false> ((__m128i*&)readPtrRed, |
400 | 0 | (__m128i*&)readPtrGreen, |
401 | 0 | (__m128i*&)readPtrBlue, |
402 | 0 | alphaFillValue, |
403 | 0 | (__m128i*&)writePtr, |
404 | 0 | pixelsToCopySSE); |
405 | 0 | } |
406 | 0 | else if (!readPtrAreAligned && writePtrIsAligned) |
407 | 0 | { |
408 | 0 | writeToRGBAFillASSETemplate<false, true> ((__m128i*&)readPtrRed, |
409 | 0 | (__m128i*&)readPtrGreen, |
410 | 0 | (__m128i*&)readPtrBlue, |
411 | 0 | alphaFillValue, |
412 | 0 | (__m128i*&)writePtr, |
413 | 0 | pixelsToCopySSE); |
414 | 0 | } |
415 | 0 | else if (readPtrAreAligned && !writePtrIsAligned) |
416 | 0 | { |
417 | 0 | writeToRGBAFillASSETemplate<true, false> ((__m128i*&)readPtrRed, |
418 | 0 | (__m128i*&)readPtrGreen, |
419 | 0 | (__m128i*&)readPtrBlue, |
420 | 0 | alphaFillValue, |
421 | 0 | (__m128i*&)writePtr, |
422 | 0 | pixelsToCopySSE); |
423 | 0 | } |
424 | 0 | else if (readPtrAreAligned && writePtrIsAligned) |
425 | 0 | { |
426 | 0 | writeToRGBAFillASSETemplate<true, true> ((__m128i*&)readPtrRed, |
427 | 0 | (__m128i*&)readPtrGreen, |
428 | 0 | (__m128i*&)readPtrBlue, |
429 | 0 | alphaFillValue, |
430 | 0 | (__m128i*&)writePtr, |
431 | 0 | pixelsToCopySSE); |
432 | 0 | } |
433 | |
|
434 | 0 | writeToRGBAFillANormal (readPtrRed, |
435 | 0 | readPtrGreen, readPtrBlue, alphaFillValue, |
436 | 0 | writePtr, pixelsToCopyNormal); |
437 | 0 | } |
438 | | |
439 | | |
440 | | |
441 | | //------------------------------------------------------------------------ |
442 | | // |
443 | | // Write to RGB |
444 | | // |
445 | | //------------------------------------------------------------------------ |
446 | | |
447 | | // |
448 | | // Using SSE intrinsics |
449 | | // |
450 | | template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED> |
451 | | EXR_FORCEINLINE |
452 | | void |
453 | | writeToRGBSSETemplate (__m128i*& readPtrSSERed, |
454 | | __m128i*& readPtrSSEGreen, |
455 | | __m128i*& readPtrSSEBlue, |
456 | | __m128i*& writePtrSSE, |
457 | | const size_t& pixelsToCopySSE) |
458 | 0 | { |
459 | |
|
460 | 0 | for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter) |
461 | 0 | { |
462 | | // |
463 | | // Need to shuffle and unpack pointers to obtain my first register |
464 | | // We must save 8 pixels at a time, so we must have the following three registers at the end: |
465 | | // 1) R1 G1 B1 R2 G2 B2 R3 G3 |
466 | | // 2) B3 R4 G4 B4 R5 G5 B5 R6 |
467 | | // 3) G6 B6 R7 G7 B7 R8 G8 B8 |
468 | | // |
469 | 0 | __m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed); |
470 | 0 | __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen); |
471 | 0 | __m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue); |
472 | | |
473 | | // |
474 | | // First register: R1 G1 B1 R2 G2 B2 R3 G3 |
475 | | // Construct 2 registers and then unpack them to obtain our final result: |
476 | | // |
477 | 0 | __m128i redGreenRegister = _mm_unpacklo_epi16 (redRegister, |
478 | 0 | greenRegister); |
479 | 0 | __m128i redBlueRegister = _mm_unpacklo_epi16 (redRegister, |
480 | 0 | blueRegister); |
481 | 0 | __m128i greenBlueRegister = _mm_unpacklo_epi16 (greenRegister, |
482 | 0 | blueRegister); |
483 | | |
484 | | // Left Part (R1 G1 B1 R2) |
485 | 0 | __m128i quarterRight = _mm_shufflelo_epi16 (redBlueRegister, |
486 | 0 | _MM_SHUFFLE(3,0,2,1)); |
487 | 0 | __m128i halfLeft = _mm_unpacklo_epi32 (redGreenRegister, |
488 | 0 | quarterRight); |
489 | | |
490 | | // Right Part (G2 B2 R3 G3) |
491 | 0 | __m128i quarterLeft = _mm_shuffle_epi32 (greenBlueRegister, |
492 | 0 | _MM_SHUFFLE(3,2,0,1)); |
493 | 0 | quarterRight = _mm_shuffle_epi32 (redGreenRegister, |
494 | 0 | _MM_SHUFFLE(3,0,1,2)); |
495 | 0 | __m128i halfRight = _mm_unpacklo_epi32 (quarterLeft, quarterRight); |
496 | |
|
497 | 0 | __m128i fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight); |
498 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister); |
499 | 0 | ++writePtrSSE; |
500 | | |
501 | | // |
502 | | // Second register: B3 R4 G4 B4 R5 G5 B5 R6 |
503 | | // |
504 | | |
505 | | // Left Part (B3, R4, G4, B4) |
506 | 0 | quarterLeft = _mm_shufflehi_epi16 (redBlueRegister, |
507 | 0 | _MM_SHUFFLE(0, 3, 2, 1)); |
508 | 0 | quarterRight = _mm_shufflehi_epi16 (greenBlueRegister, |
509 | 0 | _MM_SHUFFLE(1, 0, 3, 2)); |
510 | 0 | halfLeft = _mm_unpackhi_epi32 (quarterLeft, quarterRight); |
511 | | |
512 | | // Update the registers |
513 | 0 | redGreenRegister = _mm_unpackhi_epi16 (redRegister, greenRegister); |
514 | 0 | redBlueRegister = _mm_unpackhi_epi16 (redRegister, blueRegister); |
515 | 0 | greenBlueRegister = _mm_unpackhi_epi16 (greenRegister, blueRegister); |
516 | | |
517 | | // Right Part (R5 G5 B5 R6) |
518 | 0 | quarterRight = _mm_shufflelo_epi16 (redBlueRegister, |
519 | 0 | _MM_SHUFFLE(3,0,2,1)); |
520 | 0 | halfRight = _mm_unpacklo_epi32 (redGreenRegister, quarterRight); |
521 | |
|
522 | 0 | fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight); |
523 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister); |
524 | 0 | ++writePtrSSE; |
525 | | |
526 | | // |
527 | | // Third register: G6 B6 R7 G7 B7 R8 G8 B8 |
528 | | // |
529 | | |
530 | | // Left part (G6 B6 R7 G7) |
531 | 0 | quarterLeft = _mm_shuffle_epi32 (greenBlueRegister, |
532 | 0 | _MM_SHUFFLE(3,2,0,1)); |
533 | 0 | quarterRight = _mm_shuffle_epi32 (redGreenRegister, |
534 | 0 | _MM_SHUFFLE(3,0,1,2)); |
535 | 0 | halfLeft = _mm_unpacklo_epi32 (quarterLeft, quarterRight); |
536 | | |
537 | | // Right part (B7 R8 G8 B8) |
538 | 0 | quarterLeft = _mm_shufflehi_epi16 (redBlueRegister, |
539 | 0 | _MM_SHUFFLE(0, 3, 2, 1)); |
540 | 0 | quarterRight = _mm_shufflehi_epi16 (greenBlueRegister, |
541 | 0 | _MM_SHUFFLE(1, 0, 3, 2)); |
542 | 0 | halfRight = _mm_unpackhi_epi32 (quarterLeft, quarterRight); |
543 | |
|
544 | 0 | fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight); |
545 | 0 | storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister); |
546 | 0 | ++writePtrSSE; |
547 | | |
548 | | // |
549 | | // Increment read pointers |
550 | | // |
551 | 0 | ++readPtrSSEBlue; |
552 | 0 | ++readPtrSSEGreen; |
553 | 0 | ++readPtrSSERed; |
554 | 0 | } |
555 | 0 | } Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<false, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<false, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<true, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<true, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&) |
556 | | |
557 | | // |
558 | | // Not using SSE intrinsics. This is still faster than the alternative |
559 | | // because we have multiple read pointers and therefore we are able to |
560 | | // take advantage of data locality for write operations. |
561 | | // |
//
// Scalar RGB interleave: for each of pixelsToCopy pixels, copies one
// sample from each of the red, green and blue read pointers to three
// consecutive output slots, in that order.  On return every read pointer
// has advanced by pixelsToCopy and writePtr by 3 * pixelsToCopy.
//
EXR_FORCEINLINE
void
writeToRGBNormal (unsigned short*& readPtrRed,
                  unsigned short*& readPtrGreen,
                  unsigned short*& readPtrBlue,
                  unsigned short*& writePtr,
                  const size_t& pixelsToCopy)
{
    for (size_t pixel = 0; pixel < pixelsToCopy; ++pixel)
    {
        writePtr[0] = *readPtrRed;
        writePtr[1] = *readPtrGreen;
        writePtr[2] = *readPtrBlue;

        writePtr += 3;

        ++readPtrRed;
        ++readPtrGreen;
        ++readPtrBlue;
    }
}
577 | | |
578 | | // |
579 | | // Determine which (template) version to use by checking whether pointers |
580 | | // are aligned |
581 | | // |
582 | | EXR_FORCEINLINE |
583 | | void optimizedWriteToRGB (unsigned short*& readPtrRed, |
584 | | unsigned short*& readPtrGreen, |
585 | | unsigned short*& readPtrBlue, |
586 | | unsigned short*& writePtr, |
587 | | const size_t& pixelsToCopySSE, |
588 | | const size_t& pixelsToCopyNormal) |
589 | 0 | { |
590 | 0 | bool readPtrAreAligned = true; |
591 | |
|
592 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrRed); |
593 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrGreen); |
594 | 0 | readPtrAreAligned &= isPointerSSEAligned(readPtrBlue); |
595 | |
|
596 | 0 | bool writePtrIsAligned = isPointerSSEAligned(writePtr); |
597 | |
|
598 | 0 | if (!readPtrAreAligned && !writePtrIsAligned) |
599 | 0 | { |
600 | 0 | writeToRGBSSETemplate<false, false> ((__m128i*&)readPtrRed, |
601 | 0 | (__m128i*&)readPtrGreen, |
602 | 0 | (__m128i*&)readPtrBlue, |
603 | 0 | (__m128i*&)writePtr, |
604 | 0 | pixelsToCopySSE); |
605 | 0 | } |
606 | 0 | else if (!readPtrAreAligned && writePtrIsAligned) |
607 | 0 | { |
608 | 0 | writeToRGBSSETemplate<false, true> ((__m128i*&)readPtrRed, |
609 | 0 | (__m128i*&)readPtrGreen, |
610 | 0 | (__m128i*&)readPtrBlue, |
611 | 0 | (__m128i*&)writePtr, |
612 | 0 | pixelsToCopySSE); |
613 | 0 | } |
614 | 0 | else if (readPtrAreAligned && !writePtrIsAligned) |
615 | 0 | { |
616 | 0 | writeToRGBSSETemplate<true, false> ((__m128i*&)readPtrRed, |
617 | 0 | (__m128i*&)readPtrGreen, |
618 | 0 | (__m128i*&)readPtrBlue, |
619 | 0 | (__m128i*&)writePtr, |
620 | 0 | pixelsToCopySSE); |
621 | 0 | } |
622 | 0 | else if (readPtrAreAligned && writePtrIsAligned) |
623 | 0 | { |
624 | 0 | writeToRGBSSETemplate<true, true> ((__m128i*&)readPtrRed, |
625 | 0 | (__m128i*&)readPtrGreen, |
626 | 0 | (__m128i*&)readPtrBlue, |
627 | 0 | (__m128i*&)writePtr, |
628 | 0 | pixelsToCopySSE); |
629 | 0 | } |
630 | | |
631 | |
|
632 | 0 | writeToRGBNormal (readPtrRed, readPtrGreen, readPtrBlue, |
633 | 0 | writePtr, pixelsToCopyNormal); |
634 | 0 | } |
635 | | |
636 | | |
637 | | |
638 | | |
639 | | #else // ! defined IMF_HAVE_SSE2 |
640 | | |
641 | | #endif // defined IMF_HAVE_SSE2 |
642 | | |
643 | | |
644 | | OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT |
645 | | |
646 | | #endif |