/src/gdal/gcore/gdalsse_priv.h
/******************************************************************************
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64-bit processors because they are guaranteed to have SSE2 */
/* This could also be used on 32-bit, but we would need a runtime check */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) &&             \
    !defined(USE_SSE2_EMULATION)

#include <string.h>

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#else
/* Requires SSE2 */
#include <emmintrin.h>

#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
#include <smmintrin.h>
#endif
#endif

#include "gdal_priv_templates.hpp"
39  |  |  | 
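/* The helpers below move small scalars between memory and XMM registers.
 * Going through memcpy() keeps the accesses unaligned-safe and free of
 * strict-aliasing issues; compilers turn a fixed-size memcpy() like these
 * into a single register move. */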
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}
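
/* Note on the 32-bit branch above: _mm_cvtsi64_si128 requires a 64-bit
 * general-purpose register, so i386 builds use _mm_loadl_epi64 instead,
 * which performs the same unaligned 8-byte load into the low half of the
 * register. */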

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}

class XMMReg4Int;

class XMMReg4Double;

class XMMReg4Float
{
  public:
    __m128 xmm;

    XMMReg4Float()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_ps())
#endif
    {
    }

    XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Float Zero()
    {
        XMMReg4Float reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Float Set1(float f)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_set1_ps(f);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const float *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned char *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const int *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
                                         const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
                                       const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float And(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
                                       const XMMReg4Float &true_expr,
                                       const XMMReg4Float &false_expr)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
                            _mm_andnot_ps(cond.xmm, false_expr.xmm));
        return reg;
    }
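
    /* Ternary() above is a branchless select, computing
     * (cond & true_expr) | (~cond & false_expr). It assumes cond holds
     * per-lane all-ones/all-zeros masks, as produced by Equals(),
     * NotEquals(), Lesser() and Greater(). */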

    static inline XMMReg4Float Min(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Max(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        xmm = _mm_loadu_ps(ptr);
    }

    static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    inline void nsLoad4Val(const int *ptr)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        xmm = _mm_cvtepi32_ps(xmm_i);
    }

    static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    static inline __m128i cvtepu8_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu8_epi32(x);
#else
        return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
                                  _mm_setzero_si128());
#endif
    }
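
    /* In the pre-SSE4.1 fallback above, each unpack against zero doubles
     * the element width of the 4 low lanes: bytes are zero-extended to
     * 16-bit words, then those words to 32-bit integers. */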

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
    }

    static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
        r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
        r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
    }

    static inline __m128i cvtepi16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepi16_epi32(x);
#else
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        return _mm_srai_epi32(
            /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
            _mm_unpacklo_epi16(x, x), 16);
#endif
    }
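
    /* Worked example for the pre-SSE4.1 fallback above: for the 16-bit lane
     * 0x8001 (-32767), _mm_unpacklo_epi16(x, x) yields the 32-bit lane
     * 0x80018001, and the arithmetic right shift by 16 gives 0xFFFF8001,
     * i.e. -32767 correctly sign-extended to 32 bits. */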

    inline void nsLoad4Val(const short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
    }

    static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    static inline __m128i cvtepu16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu16_epi32(x);
#else
        return _mm_unpacklo_epi16(
            x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_ps();
    }

    inline XMMReg4Float &operator=(const XMMReg4Float &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg4Float &operator+=(const XMMReg4Float &other)
    {
        xmm = _mm_add_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator-=(const XMMReg4Float &other)
    {
        xmm = _mm_sub_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator*=(const XMMReg4Float &other)
    {
        xmm = _mm_mul_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float operator+(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_add_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator-(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_sub_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator*(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_mul_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator/(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float inverse() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
        return ret;
    }

    inline XMMReg4Int truncate_to_int() const;

    inline XMMReg4Double cast_to_double() const;

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, xmm);
    }

    inline void Store4ValAligned(float *ptr) const
    {
        _mm_store_ps(ptr, xmm);
    }

    inline operator float() const
    {
        return _mm_cvtss_f32(xmm);
    }
};
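
/* Typical use of XMMReg4Float (illustrative sketch only; "src" and "dst"
 * stand for hypothetical buffers of at least 4 floats):
 *
 *     XMMReg4Float v = XMMReg4Float::Load4Val(src);    // unaligned load
 *     v = XMMReg4Float::Max(v, XMMReg4Float::Zero());  // clamp negatives
 *     v.Store4Val(dst);                                // unaligned store
 */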

class XMMReg4Int
{
  public:
    __m128i xmm;

    XMMReg4Int()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Int Zero()
    {
        XMMReg4Int reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg4Int Set1(int i)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_set1_epi32(i);
        return reg;
    }

    static inline XMMReg4Int Load4Val(const int *ptr)
    {
        XMMReg4Int reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
    }

    static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
                                    const XMMReg4Int &expr2)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
                                     const XMMReg4Int &true_expr,
                                     const XMMReg4Int &false_expr)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg4Int &operator+=(const XMMReg4Int &other)
    {
        xmm = _mm_add_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int &operator-=(const XMMReg4Int &other)
    {
        xmm = _mm_sub_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int operator+(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_add_epi32(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Int operator-(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_sub_epi32(xmm, other.xmm);
        return ret;
    }

    XMMReg4Double cast_to_double() const;

    XMMReg4Float to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_cvtepi32_ps(xmm);
        return ret;
    }
};

inline XMMReg4Int XMMReg4Float::truncate_to_int() const
{
    XMMReg4Int ret;
    ret.xmm = _mm_cvttps_epi32(xmm);
    return ret;
}
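
/* Note: _mm_cvttps_epi32 truncates toward zero, matching a C-style cast
 * from float to int, whereas _mm_cvtps_epi32 would round according to the
 * current rounding mode. */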

class XMMReg8Byte
{
  public:
    __m128i xmm;

    XMMReg8Byte()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg8Byte Zero()
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg8Byte Set1(char i)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_set1_epi8(i);
        return reg;
    }

    static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
                                     const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
                                 const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
                                      const XMMReg8Byte &true_expr,
                                      const XMMReg8Byte &false_expr)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_add_epi8(xmm, other.xmm);
        return ret;
    }

    inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_sub_epi8(xmm, other.xmm);
        return ret;
    }

    static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
        reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
        return reg;
    }
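
    /* Pack() narrows with saturation: _mm_packs_epi32 clamps each int32 to
     * the signed 16-bit range, then _mm_packus_epi16 clamps to [0, 255], so
     * out-of-range values saturate rather than wrap. */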

    inline void Store8Val(unsigned char *ptr) const
    {
        GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
    }
};

class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_pd())
#endif
    {
    }

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_set1_pd(d);
        return reg;
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const int *ptr)
    {
        xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i,
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm,
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }
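
    /* In Store2Val(unsigned char*) above, adding 0.5 before the truncating
     * conversion rounds non-negative values to nearest, and the two pack
     * steps saturate the result to the [0, 255] byte range. */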

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    // cppcheck-suppress uninitMemberVar
    XMMReg2Double() = default;

    explicit XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.low = d;
        reg.high = d;
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }
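
    /* memset()ing all bytes to 0xFF reproduces the all-ones lane that
     * _mm_cmpeq_pd returns for a true comparison; the resulting bit pattern
     * is a NaN when read as a double, which is harmless since it is only
     * ever used as a mask. */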

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }
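
    /* And() above mirrors _mm_and_pd: the doubles are copied into integer
     * storage, ANDed 32 bits at a time, and copied back, i.e. a pure
     * bitwise AND of the two bit patterns. */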

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const int *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = static_cast<unsigned char>(low + 0.5);
        ptr[1] = static_cast<unsigned char>(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = static_cast<GUInt16>(low + 0.5);
        ptr[1] = static_cast<GUInt16>(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) &&
          !defined(USE_SSE2_EMULATION) */

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>
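
/* In the AVX variant below, XMMReg4Double holds four doubles in a single
 * 256-bit ymm register, so each load/store helper processes four values at
 * a time. */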
1360  |  |  | 
1361  |  | class XMMReg4Double  | 
1362  |  | { | 
1363  |  |   public:  | 
1364  |  |     __m256d ymm;  | 
1365  |  |  | 
1366  |  |     XMMReg4Double() : ymm(_mm256_setzero_pd())  | 
1367  |  |     { | 
1368  |  |     }  | 
1369  |  |  | 
1370  |  |     XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)  | 
1371  |  |     { | 
1372  |  |     }  | 
1373  |  |  | 
1374  |  |     static inline XMMReg4Double Zero()  | 
1375  |  |     { | 
1376  |  |         XMMReg4Double reg;  | 
1377  |  |         reg.Zeroize();  | 
1378  |  |         return reg;  | 
1379  |  |     }  | 
1380  |  |  | 
1381  |  |     static inline XMMReg4Double Set1(double d)  | 
1382  |  |     { | 
1383  |  |         XMMReg4Double reg;  | 
1384  |  |         reg.ymm = _mm256_set1_pd(d);  | 
1385  |  |         return reg;  | 
1386  |  |     }  | 
1387  |  |  | 
1388  |  |     inline void Zeroize()  | 
1389  |  |     { | 
1390  |  |         ymm = _mm256_setzero_pd();  | 
1391  |  |     }  | 
1392  |  |  | 
1393  |  |     static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)  | 
1394  |  |     { | 
1395  |  |         XMMReg4Double reg;  | 
1396  |  |         reg.nsLoad1ValHighAndLow(ptr);  | 
1397  |  |         return reg;  | 
1398  |  |     }  | 
1399  |  |  | 
1400  |  |     inline void nsLoad1ValHighAndLow(const double *ptr)  | 
1401  |  |     { | 
1402  |  |         ymm = _mm256_set1_pd(*ptr);  | 
1403  |  |     }  | 
1404  |  |  | 
1405  |  |     static inline XMMReg4Double Load4Val(const unsigned char *ptr)  | 
1406  |  |     { | 
1407  |  |         XMMReg4Double reg;  | 
1408  |  |         reg.nsLoad4Val(ptr);  | 
1409  |  |         return reg;  | 
1410  |  |     }  | 
1411  |  |  | 
1412  |  |     inline void nsLoad4Val(const unsigned char *ptr)  | 
1413  |  |     { | 
1414  |  |         __m128i xmm_i = GDALCopyInt32ToXMM(ptr);  | 
1415  |  |         xmm_i = _mm_cvtepu8_epi32(xmm_i);  | 
1416  |  |         ymm = _mm256_cvtepi32_pd(xmm_i);  | 
1417  |  |     }  | 
1418  |  |  | 
1419  |  |     static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,  | 
1420  |  |                                 XMMReg4Double &high)  | 
1421  |  |     { | 
1422  |  |         const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);  | 
1423  |  |         const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);  | 
1424  |  |         low.ymm = _mm256_cvtepi32_pd(xmm_i_low);  | 
1425  |  |         const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));  | 
1426  |  |         high.ymm = _mm256_cvtepi32_pd(xmm_i_high);  | 
1427  |  |     }  | 
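  |  |     // Editor's note (illustrative, not part of the original header): a  | 
  |  |     // scalar sketch of the split above. _mm_srli_si128(xmm_i, 4) shifts  | 
  |  |     // the register right by 4 *bytes*, moving input bytes 4..7 into the  | 
  |  |     // low lanes so _mm_cvtepu8_epi32 can widen them like bytes 0..3:  | 
  |  |     //  | 
  |  |     //   double lo[4], hi[4];  | 
  |  |     //   for (int i = 0; i < 4; ++i)  | 
  |  |     //   {  | 
  |  |     //       lo[i] = static_cast<double>(ptr[i]);      // bytes 0..3  | 
  |  |     //       hi[i] = static_cast<double>(ptr[i + 4]);  // bytes 4..7  | 
  |  |     //   }  | 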
1428  |  |  | 
1429  |  |     static inline XMMReg4Double Load4Val(const short *ptr)  | 
1430  |  |     { | 
1431  |  |         XMMReg4Double reg;  | 
1432  |  |         reg.nsLoad4Val(ptr);  | 
1433  |  |         return reg;  | 
1434  |  |     }  | 
1435  |  |  | 
1436  |  |     inline void nsLoad4Val(const short *ptr)  | 
1437  |  |     { | 
1438  |  |         __m128i xmm_i = GDALCopyInt64ToXMM(ptr);  | 
1439  |  |         xmm_i = _mm_cvtepi16_epi32(xmm_i);  | 
1440  |  |         ymm = _mm256_cvtepi32_pd(xmm_i);  | 
1441  |  |     }  | 
1442  |  |  | 
1443  |  |     static inline void Load8Val(const short *ptr, XMMReg4Double &low,  | 
1444  |  |                                 XMMReg4Double &high)  | 
1445  |  |     { | 
1446  |  |         low.nsLoad4Val(ptr);  | 
1447  |  |         high.nsLoad4Val(ptr + 4);  | 
1448  |  |     }  | 
1449  |  |  | 
1450  |  |     static inline XMMReg4Double Load4Val(const unsigned short *ptr)  | 
1451  |  |     { | 
1452  |  |         XMMReg4Double reg;  | 
1453  |  |         reg.nsLoad4Val(ptr);  | 
1454  |  |         return reg;  | 
1455  |  |     }  | 
1456  |  |  | 
1457  |  |     inline void nsLoad4Val(const unsigned short *ptr)  | 
1458  |  |     { | 
1459  |  |         __m128i xmm_i = GDALCopyInt64ToXMM(ptr);  | 
1460  |  |         xmm_i = _mm_cvtepu16_epi32(xmm_i);  | 
1461  |  |         // Ok to use the signed conversion since we are in the ushort  | 
1462  |  |         // range, so the value cannot be interpreted as a negative int32.  | 
1463  |  |         ymm = _mm256_cvtepi32_pd(xmm_i);  | 
1464  |  |     }  | 
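  |  |     // Editor's note: the safety argument in one line: a ushort is at  | 
  |  |     // most 65535, far below INT32_MAX, so after _mm_cvtepu16_epi32  | 
  |  |     // zero-extends it the sign bit of each int32 lane is 0 and the  | 
  |  |     // signed conversion is exact, e.g. 65535 -> 0x0000FFFF -> 65535.0.  | 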
1465  |  |  | 
1466  |  |     static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,  | 
1467  |  |                                 XMMReg4Double &high)  | 
1468  |  |     { | 
1469  |  |         low.nsLoad4Val(ptr);  | 
1470  |  |         high.nsLoad4Val(ptr + 4);  | 
1471  |  |     }  | 
1472  |  |  | 
1473  |  |     static inline XMMReg4Double Load4Val(const double *ptr)  | 
1474  |  |     { | 
1475  |  |         XMMReg4Double reg;  | 
1476  |  |         reg.nsLoad4Val(ptr);  | 
1477  |  |         return reg;  | 
1478  |  |     }  | 
1479  |  |  | 
1480  |  |     inline void nsLoad4Val(const double *ptr)  | 
1481  |  |     { | 
1482  |  |         ymm = _mm256_loadu_pd(ptr);  | 
1483  |  |     }  | 
1484  |  |  | 
1485  |  |     static inline void Load8Val(const double *ptr, XMMReg4Double &low,  | 
1486  |  |                                 XMMReg4Double &high)  | 
1487  |  |     { | 
1488  |  |         low.nsLoad4Val(ptr);  | 
1489  |  |         high.nsLoad4Val(ptr + 4);  | 
1490  |  |     }  | 
1491  |  |  | 
1492  |  |     static inline XMMReg4Double Load4ValAligned(const double *ptr)  | 
1493  |  |     { | 
1494  |  |         XMMReg4Double reg;  | 
1495  |  |         reg.nsLoad4ValAligned(ptr);  | 
1496  |  |         return reg;  | 
1497  |  |     }  | 
1498  |  |  | 
1499  |  |     inline void nsLoad4ValAligned(const double *ptr)  | 
1500  |  |     { | 
1501  |  |         ymm = _mm256_load_pd(ptr);  | 
1502  |  |     }  | 
1503  |  |  | 
1504  |  |     static inline XMMReg4Double Load4Val(const float *ptr)  | 
1505  |  |     { | 
1506  |  |         XMMReg4Double reg;  | 
1507  |  |         reg.nsLoad4Val(ptr);  | 
1508  |  |         return reg;  | 
1509  |  |     }  | 
1510  |  |  | 
1511  |  |     inline void nsLoad4Val(const float *ptr)  | 
1512  |  |     { | 
1513  |  |         ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));  | 
1514  |  |     }  | 
1515  |  |  | 
1516  |  |     static inline void Load8Val(const float *ptr, XMMReg4Double &low,  | 
1517  |  |                                 XMMReg4Double &high)  | 
1518  |  |     { | 
1519  |  |         low.nsLoad4Val(ptr);  | 
1520  |  |         high.nsLoad4Val(ptr + 4);  | 
1521  |  |     }  | 
1522  |  |  | 
1523  |  |     static inline XMMReg4Double Load4Val(const int *ptr)  | 
1524  |  |     { | 
1525  |  |         XMMReg4Double reg;  | 
1526  |  |         reg.nsLoad4Val(ptr);  | 
1527  |  |         return reg;  | 
1528  |  |     }  | 
1529  |  |  | 
1530  |  |     inline void nsLoad4Val(const int *ptr)  | 
1531  |  |     { | 
1532  |  |         ymm = _mm256_cvtepi32_pd(  | 
1533  |  |             _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));  | 
1534  |  |     }  | 
1535  |  |  | 
1536  |  |     static inline void Load8Val(const int *ptr, XMMReg4Double &low,  | 
1537  |  |                                 XMMReg4Double &high)  | 
1538  |  |     { | 
1539  |  |         low.nsLoad4Val(ptr);  | 
1540  |  |         high.nsLoad4Val(ptr + 4);  | 
1541  |  |     }  | 
1542  |  |  | 
1543  |  |     static inline XMMReg4Double Equals(const XMMReg4Double &expr1,  | 
1544  |  |                                        const XMMReg4Double &expr2)  | 
1545  |  |     { | 
1546  |  |         XMMReg4Double reg;  | 
1547  |  |         reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);  | 
1548  |  |         return reg;  | 
1549  |  |     }  | 
1550  |  |  | 
1551  |  |     static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,  | 
1552  |  |                                           const XMMReg4Double &expr2)  | 
1553  |  |     { | 
1554  |  |         XMMReg4Double reg;  | 
1555  |  |         reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);  | 
1556  |  |         return reg;  | 
1557  |  |     }  | 
1558  |  |  | 
1559  |  |     static inline XMMReg4Double Greater(const XMMReg4Double &expr1,  | 
1560  |  |                                         const XMMReg4Double &expr2)  | 
1561  |  |     { | 
1562  |  |         XMMReg4Double reg;  | 
1563  |  |         reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);  | 
1564  |  |         return reg;  | 
1565  |  |     }  | 
1566  |  |  | 
1567  |  |     static inline XMMReg4Double And(const XMMReg4Double &expr1,  | 
1568  |  |                                     const XMMReg4Double &expr2)  | 
1569  |  |     { | 
1570  |  |         XMMReg4Double reg;  | 
1571  |  |         reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);  | 
1572  |  |         return reg;  | 
1573  |  |     }  | 
1574  |  |  | 
1575  |  |     static inline XMMReg4Double Ternary(const XMMReg4Double &cond,  | 
1576  |  |                                         const XMMReg4Double &true_expr,  | 
1577  |  |                                         const XMMReg4Double &false_expr)  | 
1578  |  |     { | 
1579  |  |         XMMReg4Double reg;  | 
1580  |  |         reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),  | 
1581  |  |                                _mm256_andnot_pd(cond.ymm, false_expr.ymm));  | 
1582  |  |         return reg;  | 
1583  |  |     }  | 
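  |  |     // Editor's note: Ternary() is the classic branchless SIMD select.  | 
  |  |     // Each lane of cond is assumed to be an all-ones or all-zeros mask  | 
  |  |     // (as produced by Equals/Greater above), so per 64-bit lane it  | 
  |  |     // computes (cond & true_expr) | (~cond & false_expr). A scalar  | 
  |  |     // sketch of one lane (illustrative only):  | 
  |  |     //  | 
  |  |     //   uint64_t select_lane(uint64_t mask, uint64_t t, uint64_t f)  | 
  |  |     //   {  | 
  |  |     //       return (mask & t) | (~mask & f);  // mask is 0 or ~0ULL  | 
  |  |     //   }  | 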
1584  |  |  | 
1585  |  |     static inline XMMReg4Double Min(const XMMReg4Double &expr1,  | 
1586  |  |                                     const XMMReg4Double &expr2)  | 
1587  |  |     { | 
1588  |  |         XMMReg4Double reg;  | 
1589  |  |         reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);  | 
1590  |  |         return reg;  | 
1591  |  |     }  | 
1592  |  |  | 
1593  |  |     inline XMMReg4Double &operator=(const XMMReg4Double &other)  | 
1594  |  |     { | 
1595  |  |         ymm = other.ymm;  | 
1596  |  |         return *this;  | 
1597  |  |     }  | 
1598  |  |  | 
1599  |  |     inline XMMReg4Double &operator+=(const XMMReg4Double &other)  | 
1600  |  |     { | 
1601  |  |         ymm = _mm256_add_pd(ymm, other.ymm);  | 
1602  |  |         return *this;  | 
1603  |  |     }  | 
1604  |  |  | 
1605  |  |     inline XMMReg4Double &operator*=(const XMMReg4Double &other)  | 
1606  |  |     { | 
1607  |  |         ymm = _mm256_mul_pd(ymm, other.ymm);  | 
1608  |  |         return *this;  | 
1609  |  |     }  | 
1610  |  |  | 
1611  |  |     inline XMMReg4Double operator+(const XMMReg4Double &other) const  | 
1612  |  |     { | 
1613  |  |         XMMReg4Double ret;  | 
1614  |  |         ret.ymm = _mm256_add_pd(ymm, other.ymm);  | 
1615  |  |         return ret;  | 
1616  |  |     }  | 
1617  |  |  | 
1618  |  |     inline XMMReg4Double operator-(const XMMReg4Double &other) const  | 
1619  |  |     { | 
1620  |  |         XMMReg4Double ret;  | 
1621  |  |         ret.ymm = _mm256_sub_pd(ymm, other.ymm);  | 
1622  |  |         return ret;  | 
1623  |  |     }  | 
1624  |  |  | 
1625  |  |     inline XMMReg4Double operator*(const XMMReg4Double &other) const  | 
1626  |  |     { | 
1627  |  |         XMMReg4Double ret;  | 
1628  |  |         ret.ymm = _mm256_mul_pd(ymm, other.ymm);  | 
1629  |  |         return ret;  | 
1630  |  |     }  | 
1631  |  |  | 
1632  |  |     inline XMMReg4Double operator/(const XMMReg4Double &other) const  | 
1633  |  |     { | 
1634  |  |         XMMReg4Double ret;  | 
1635  |  |         ret.ymm = _mm256_div_pd(ymm, other.ymm);  | 
1636  |  |         return ret;  | 
1637  |  |     }  | 
1638  |  |  | 
1639  |  |     void AddToLow(const XMMReg2Double &other)  | 
1640  |  |     { | 
1641  |  |         __m256d ymm2 = _mm256_setzero_pd();  | 
1642  |  |         ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);  | 
1643  |  |         ymm = _mm256_add_pd(ymm, ymm2);  | 
1644  |  |     }  | 
1645  |  |  | 
1646  |  |     inline double GetHorizSum() const  | 
1647  |  |     { | 
1648  |  |         __m256d ymm_tmp1, ymm_tmp2;  | 
1649  |  |         ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);  | 
1650  |  |         ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);  | 
1651  |  |         ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);  | 
1652  |  |         return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));  | 
1653  |  |     }  | 
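  |  |     // Editor's note: step-by-step of the reduction above, writing the  | 
  |  |     // register as (d0, d1, d2, d3):  | 
  |  |     //   _mm256_hadd_pd(ymm, ymm)      -> (d0+d1, d0+d1, d2+d3, d2+d3)  | 
  |  |     //   _mm256_permute2f128_pd(.., 1) -> the two 128-bit halves swapped  | 
  |  |     //   _mm256_add_pd                 -> d0+d1+d2+d3 in every lane  | 
  |  |     // so extracting lane 0 yields the full horizontal sum.  | 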
1654  |  |  | 
1655  |  |     inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,  | 
1656  |  |                                          const XMMReg4Double &half) const  | 
1657  |  |     { | 
1658  |  |         __m256d reg = ymm;  | 
1659  |  |         __m256d reg_half = _mm256_mul_pd(reg, half.ymm);  | 
1660  |  |         // Compute a rough approximation of 1 / sqrt(x) with _mm_rsqrt_ps  | 
1661  |  |         reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));  | 
1662  |  |         // And perform one step of Newton-Raphson approximation to improve it  | 
1663  |  |         // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -  | 
1664  |  |         //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);  | 
1665  |  |         const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);  | 
1666  |  |         reg = _mm256_mul_pd(  | 
1667  |  |             reg,  | 
1668  |  |             _mm256_sub_pd(one_and_a_half,  | 
1669  |  |                           _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));  | 
1670  |  |         XMMReg4Double ret;  | 
1671  |  |         ret.ymm = reg;  | 
1672  |  |         return ret;  | 
1673  |  |     }  | 
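  |  |     // Editor's note (illustrative sketch): in scalar form, with y0 the  | 
  |  |     // coarse ~12-bit _mm_rsqrt_ps estimate of 1/sqrt(x), the code above  | 
  |  |     // performs one Newton-Raphson step, roughly doubling the number of  | 
  |  |     // correct bits:  | 
  |  |     //  | 
  |  |     //   double rsqrt_refined(double x, double y0)  | 
  |  |     //   {  | 
  |  |     //       return y0 * (1.5 - 0.5 * x * y0 * y0);  | 
  |  |     //   }  | 
  |  |     //  | 
  |  |     // matching one_and_a_half = 1.0 + 0.5 and reg_half = 0.5 * x.  | 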
1674  |  |  | 
1675  |  |     inline XMMReg4Float cast_to_float() const  | 
1676  |  |     { | 
1677  |  |         XMMReg4Float ret;  | 
1678  |  |         ret.xmm = _mm256_cvtpd_ps(ymm);  | 
1679  |  |         return ret;  | 
1680  |  |     }  | 
1681  |  |  | 
1682  |  |     inline void Store4Val(unsigned char *ptr) const  | 
1683  |  |     { | 
1684  |  |         __m128i xmm_i =  | 
1685  |  |             _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));  | 
1686  |  |         // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16  | 
1687  |  |         // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8  | 
1688  |  |         xmm_i =  | 
1689  |  |             _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |  | 
1690  |  |                                                       (12 << 24)));  //  SSSE3  | 
1691  |  |         GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));  | 
1692  |  |     }  | 
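  |  |     // Editor's note: the control vector built from  | 
  |  |     //   0 | (4 << 8) | (8 << 16) | (12 << 24)  | 
  |  |     // has bytes (0, 4, 8, 12) in its low lanes, so _mm_shuffle_epi8  | 
  |  |     // gathers the least-significant byte of each int32 lane (little  | 
  |  |     // endian) into the low 32 bits, which GDALCopyXMMToInt32 stores.  | 
  |  |     // Unlike the commented-out packs/packus pair this does not  | 
  |  |     // saturate, so inputs are assumed to already lie in [0, 255].  | 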
1693  |  |  | 
1694  |  |     inline void Store4Val(unsigned short *ptr) const  | 
1695  |  |     { | 
1696  |  |         __m128i xmm_i =  | 
1697  |  |             _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));  | 
1698  |  |         xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16  | 
1699  |  |         GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));  | 
1700  |  |     }  | 
1701  |  |  | 
1702  |  |     inline void Store4Val(float *ptr) const  | 
1703  |  |     { | 
1704  |  |         _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));  | 
1705  |  |     }  | 
1706  |  |  | 
1707  |  |     inline void Store4Val(double *ptr) const  | 
1708  |  |     { | 
1709  |  |         _mm256_storeu_pd(ptr, ymm);  | 
1710  |  |     }  | 
1711  |  |  | 
1712  |  |     inline void StoreMask(unsigned char *ptr) const  | 
1713  |  |     { | 
1714  |  |         _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),  | 
1715  |  |                             _mm256_castpd_si256(ymm));  | 
1716  |  |     }  | 
1717  |  | };  | 
1718  |  |  | 
1719  |  | inline XMMReg4Double XMMReg4Float::cast_to_double() const  | 
1720  |  | { | 
1721  |  |     XMMReg4Double ret;  | 
1722  |  |     ret.ymm = _mm256_cvtps_pd(xmm);  | 
1723  |  |     return ret;  | 
1724  |  | }  | 
1725  |  |  | 
1726  |  | inline XMMReg4Double XMMReg4Int::cast_to_double() const  | 
1727  |  | { | 
1728  |  |     XMMReg4Double ret;  | 
1729  |  |     ret.ymm = _mm256_cvtepi32_pd(xmm);  | 
1730  |  |     return ret;  | 
1731  |  | }  | 
1732  |  |  | 
1733  |  | #else  | 
1734  |  |  | 
1735  |  | class XMMReg4Double  | 
1736  |  | { | 
1737  |  |   public:  | 
1738  |  |     XMMReg2Double low, high;  | 
1739  |  |  | 
1740  | 0  |     XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())  | 
1741  | 0  |     { | 
1742  | 0  |     }  | 
1743  |  |  | 
1744  | 0  |     XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)  | 
1745  | 0  |     { | 
1746  | 0  |     }  | 
1747  |  |  | 
1748  |  |     static inline XMMReg4Double Zero()  | 
1749  | 0  |     { | 
1750  | 0  |         XMMReg4Double reg;  | 
1751  | 0  |         reg.low.Zeroize();  | 
1752  | 0  |         reg.high.Zeroize();  | 
1753  | 0  |         return reg;  | 
1754  | 0  |     }  | 
1755  |  |  | 
1756  |  |     static inline XMMReg4Double Set1(double d)  | 
1757  | 0  |     { | 
1758  | 0  |         XMMReg4Double reg;  | 
1759  | 0  |         reg.low = XMMReg2Double::Set1(d);  | 
1760  | 0  |         reg.high = XMMReg2Double::Set1(d);  | 
1761  | 0  |         return reg;  | 
1762  | 0  |     }  | 
1763  |  |  | 
1764  |  |     static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)  | 
1765  | 0  |     { | 
1766  | 0  |         XMMReg4Double reg;  | 
1767  | 0  |         reg.low.nsLoad1ValHighAndLow(ptr);  | 
1768  | 0  |         reg.high = reg.low;  | 
1769  | 0  |         return reg;  | 
1770  | 0  |     }  | 
1771  |  |  | 
1772  |  |     static inline XMMReg4Double Load4Val(const unsigned char *ptr)  | 
1773  | 0  |     { | 
1774  | 0  |         XMMReg4Double reg;  | 
1775  | 0  |         XMMReg2Double::Load4Val(ptr, reg.low, reg.high);  | 
1776  | 0  |         return reg;  | 
1777  | 0  |     }  | 
1778  |  |  | 
1779  |  |     static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,  | 
1780  |  |                                 XMMReg4Double &high)  | 
1781  | 0  |     { | 
1782  | 0  |         low = Load4Val(ptr);  | 
1783  | 0  |         high = Load4Val(ptr + 4);  | 
1784  | 0  |     }  | 
1785  |  |  | 
1786  |  |     static inline XMMReg4Double Load4Val(const short *ptr)  | 
1787  | 0  |     { | 
1788  | 0  |         XMMReg4Double reg;  | 
1789  | 0  |         reg.low.nsLoad2Val(ptr);  | 
1790  | 0  |         reg.high.nsLoad2Val(ptr + 2);  | 
1791  | 0  |         return reg;  | 
1792  | 0  |     }  | 
1793  |  |  | 
1794  |  |     static inline void Load8Val(const short *ptr, XMMReg4Double &low,  | 
1795  |  |                                 XMMReg4Double &high)  | 
1796  | 0  |     { | 
1797  | 0  |         low = Load4Val(ptr);  | 
1798  | 0  |         high = Load4Val(ptr + 4);  | 
1799  | 0  |     }  | 
1800  |  |  | 
1801  |  |     static inline XMMReg4Double Load4Val(const unsigned short *ptr)  | 
1802  | 0  |     { | 
1803  | 0  |         XMMReg4Double reg;  | 
1804  | 0  |         reg.low.nsLoad2Val(ptr);  | 
1805  | 0  |         reg.high.nsLoad2Val(ptr + 2);  | 
1806  | 0  |         return reg;  | 
1807  | 0  |     }  | 
1808  |  |  | 
1809  |  |     static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,  | 
1810  |  |                                 XMMReg4Double &high)  | 
1811  | 0  |     { | 
1812  | 0  |         low = Load4Val(ptr);  | 
1813  | 0  |         high = Load4Val(ptr + 4);  | 
1814  | 0  |     }  | 
1815  |  |  | 
1816  |  |     static inline XMMReg4Double Load4Val(const int *ptr)  | 
1817  | 0  |     { | 
1818  | 0  |         XMMReg4Double reg;  | 
1819  | 0  |         reg.low.nsLoad2Val(ptr);  | 
1820  | 0  |         reg.high.nsLoad2Val(ptr + 2);  | 
1821  | 0  |         return reg;  | 
1822  | 0  |     }  | 
1823  |  |  | 
1824  |  |     static inline void Load8Val(const int *ptr, XMMReg4Double &low,  | 
1825  |  |                                 XMMReg4Double &high)  | 
1826  | 0  |     { | 
1827  | 0  |         low = Load4Val(ptr);  | 
1828  | 0  |         high = Load4Val(ptr + 4);  | 
1829  | 0  |     }  | 
1830  |  |  | 
1831  |  |     static inline XMMReg4Double Load4Val(const double *ptr)  | 
1832  | 0  |     { | 
1833  | 0  |         XMMReg4Double reg;  | 
1834  | 0  |         reg.low.nsLoad2Val(ptr);  | 
1835  | 0  |         reg.high.nsLoad2Val(ptr + 2);  | 
1836  | 0  |         return reg;  | 
1837  | 0  |     }  | 
1838  |  |  | 
1839  |  |     static inline void Load8Val(const double *ptr, XMMReg4Double &low,  | 
1840  |  |                                 XMMReg4Double &high)  | 
1841  | 0  |     { | 
1842  | 0  |         low = Load4Val(ptr);  | 
1843  | 0  |         high = Load4Val(ptr + 4);  | 
1844  | 0  |     }  | 
1845  |  |  | 
1846  |  |     static inline XMMReg4Double Load4ValAligned(const double *ptr)  | 
1847  | 0  |     { | 
1848  | 0  |         XMMReg4Double reg;  | 
1849  | 0  |         reg.low.nsLoad2ValAligned(ptr);  | 
1850  | 0  |         reg.high.nsLoad2ValAligned(ptr + 2);  | 
1851  | 0  |         return reg;  | 
1852  | 0  |     }  | 
1853  |  |  | 
1854  |  |     static inline XMMReg4Double Load4Val(const float *ptr)  | 
1855  | 0  |     { | 
1856  | 0  |         XMMReg4Double reg;  | 
1857  | 0  |         XMMReg2Double::Load4Val(ptr, reg.low, reg.high);  | 
1858  | 0  |         return reg;  | 
1859  | 0  |     }  | 
1860  |  |  | 
1861  |  |     static inline void Load8Val(const float *ptr, XMMReg4Double &low,  | 
1862  |  |                                 XMMReg4Double &high)  | 
1863  | 0  |     { | 
1864  | 0  |         low = Load4Val(ptr);  | 
1865  | 0  |         high = Load4Val(ptr + 4);  | 
1866  | 0  |     }  | 
1867  |  |  | 
1868  |  |     static inline XMMReg4Double Equals(const XMMReg4Double &expr1,  | 
1869  |  |                                        const XMMReg4Double &expr2)  | 
1870  | 0  |     { | 
1871  | 0  |         XMMReg4Double reg;  | 
1872  | 0  |         reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);  | 
1873  | 0  |         reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);  | 
1874  | 0  |         return reg;  | 
1875  | 0  |     }  | 
1876  |  |  | 
1877  |  |     static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,  | 
1878  |  |                                           const XMMReg4Double &expr2)  | 
1879  | 0  |     { | 
1880  | 0  |         XMMReg4Double reg;  | 
1881  | 0  |         reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);  | 
1882  | 0  |         reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);  | 
1883  | 0  |         return reg;  | 
1884  | 0  |     }  | 
1885  |  |  | 
1886  |  |     static inline XMMReg4Double Greater(const XMMReg4Double &expr1,  | 
1887  |  |                                         const XMMReg4Double &expr2)  | 
1888  | 0  |     { | 
1889  | 0  |         XMMReg4Double reg;  | 
1890  | 0  |         reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);  | 
1891  | 0  |         reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);  | 
1892  | 0  |         return reg;  | 
1893  | 0  |     }  | 
1894  |  |  | 
1895  |  |     static inline XMMReg4Double And(const XMMReg4Double &expr1,  | 
1896  |  |                                     const XMMReg4Double &expr2)  | 
1897  | 0  |     { | 
1898  | 0  |         XMMReg4Double reg;  | 
1899  | 0  |         reg.low = XMMReg2Double::And(expr1.low, expr2.low);  | 
1900  | 0  |         reg.high = XMMReg2Double::And(expr1.high, expr2.high);  | 
1901  | 0  |         return reg;  | 
1902  | 0  |     }  | 
1903  |  |  | 
1904  |  |     static inline XMMReg4Double Ternary(const XMMReg4Double &cond,  | 
1905  |  |                                         const XMMReg4Double &true_expr,  | 
1906  |  |                                         const XMMReg4Double &false_expr)  | 
1907  | 0  |     { | 
1908  | 0  |         XMMReg4Double reg;  | 
1909  | 0  |         reg.low =  | 
1910  | 0  |             XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);  | 
1911  | 0  |         reg.high =  | 
1912  | 0  |             XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);  | 
1913  | 0  |         return reg;  | 
1914  | 0  |     }  | 
1915  |  |  | 
1916  |  |     static inline XMMReg4Double Min(const XMMReg4Double &expr1,  | 
1917  |  |                                     const XMMReg4Double &expr2)  | 
1918  | 0  |     { | 
1919  | 0  |         XMMReg4Double reg;  | 
1920  | 0  |         reg.low = XMMReg2Double::Min(expr1.low, expr2.low);  | 
1921  | 0  |         reg.high = XMMReg2Double::Min(expr1.high, expr2.high);  | 
1922  | 0  |         return reg;  | 
1923  | 0  |     }  | 
1924  |  |  | 
1925  |  |     inline XMMReg4Double &operator=(const XMMReg4Double &other)  | 
1926  | 0  |     { | 
1927  | 0  |         low = other.low;  | 
1928  | 0  |         high = other.high;  | 
1929  | 0  |         return *this;  | 
1930  | 0  |     }  | 
1931  |  |  | 
1932  |  |     inline XMMReg4Double &operator+=(const XMMReg4Double &other)  | 
1933  | 0  |     { | 
1934  | 0  |         low += other.low;  | 
1935  | 0  |         high += other.high;  | 
1936  | 0  |         return *this;  | 
1937  | 0  |     }  | 
1938  |  |  | 
1939  |  |     inline XMMReg4Double &operator*=(const XMMReg4Double &other)  | 
1940  | 0  |     { | 
1941  | 0  |         low *= other.low;  | 
1942  | 0  |         high *= other.high;  | 
1943  | 0  |         return *this;  | 
1944  | 0  |     }  | 
1945  |  |  | 
1946  |  |     inline XMMReg4Double operator+(const XMMReg4Double &other) const  | 
1947  | 0  |     { | 
1948  | 0  |         XMMReg4Double ret;  | 
1949  | 0  |         ret.low = low + other.low;  | 
1950  | 0  |         ret.high = high + other.high;  | 
1951  | 0  |         return ret;  | 
1952  | 0  |     }  | 
1953  |  |  | 
1954  |  |     inline XMMReg4Double operator-(const XMMReg4Double &other) const  | 
1955  | 0  |     { | 
1956  | 0  |         XMMReg4Double ret;  | 
1957  | 0  |         ret.low = low - other.low;  | 
1958  | 0  |         ret.high = high - other.high;  | 
1959  | 0  |         return ret;  | 
1960  | 0  |     }  | 
1961  |  |  | 
1962  |  |     inline XMMReg4Double operator*(const XMMReg4Double &other) const  | 
1963  | 0  |     { | 
1964  | 0  |         XMMReg4Double ret;  | 
1965  | 0  |         ret.low = low * other.low;  | 
1966  | 0  |         ret.high = high * other.high;  | 
1967  | 0  |         return ret;  | 
1968  | 0  |     }  | 
1969  |  |  | 
1970  |  |     inline XMMReg4Double operator/(const XMMReg4Double &other) const  | 
1971  | 0  |     { | 
1972  | 0  |         XMMReg4Double ret;  | 
1973  | 0  |         ret.low = low / other.low;  | 
1974  | 0  |         ret.high = high / other.high;  | 
1975  | 0  |         return ret;  | 
1976  | 0  |     }  | 
1977  |  |  | 
1978  |  |     void AddToLow(const XMMReg2Double &other)  | 
1979  | 0  |     { | 
1980  | 0  |         low += other;  | 
1981  | 0  |     }  | 
1982  |  |  | 
1983  |  |     inline double GetHorizSum() const  | 
1984  | 0  |     { | 
1985  | 0  |         return (low + high).GetHorizSum();  | 
1986  | 0  |     }  | 
1987  |  |  | 
1988  |  | #if !defined(USE_SSE2_EMULATION)  | 
1989  |  |     inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,  | 
1990  |  |                                          const XMMReg4Double &half) const  | 
1991  | 0  |     { | 
1992  | 0  |         __m128d reg0 = low.xmm;  | 
1993  | 0  |         __m128d reg1 = high.xmm;  | 
1994  | 0  |         __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);  | 
1995  | 0  |         __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);  | 
1996  | 0  |         // Compute a rough approximation of 1 / sqrt(x) with _mm_rsqrt_ps  | 
1997  | 0  |         reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));  | 
1998  | 0  |         reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));  | 
1999  | 0  |         // And perform one step of Newton-Raphson approximation to improve it  | 
2000  | 0  |         // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -  | 
2001  | 0  |         //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);  | 
2002  | 0  |         const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);  | 
2003  | 0  |         reg0 = _mm_mul_pd(  | 
2004  | 0  |             reg0, _mm_sub_pd(one_and_a_half,  | 
2005  | 0  |                              _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));  | 
2006  | 0  |         reg1 = _mm_mul_pd(  | 
2007  | 0  |             reg1, _mm_sub_pd(one_and_a_half,  | 
2008  | 0  |                              _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));  | 
2009  | 0  |         XMMReg4Double ret;  | 
2010  | 0  |         ret.low.xmm = reg0;  | 
2011  | 0  |         ret.high.xmm = reg1;  | 
2012  | 0  |         return ret;  | 
2013  | 0  |     }  | 
2014  |  |  | 
2015  |  |     inline XMMReg4Float cast_to_float() const  | 
2016  | 0  |     { | 
2017  | 0  |         XMMReg4Float ret;  | 
2018  | 0  |         ret.xmm = _mm_castsi128_ps(  | 
2019  | 0  |             _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),  | 
2020  | 0  |                                _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));  | 
2021  | 0  |         return ret;  | 
2022  | 0  |     }  | 
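  |  |     // Editor's note: each _mm_cvtpd_ps above yields two floats in the  | 
  |  |     // low 64 bits of an XMM register (upper half zeroed);  | 
  |  |     // _mm_unpacklo_epi64 concatenates the two low halves, producing  | 
  |  |     // (low0, low1, high0, high1) as four packed floats.  | 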
2023  |  | #endif  | 
2024  |  |  | 
2025  |  |     inline void Store4Val(unsigned char *ptr) const  | 
2026  | 0  |     { | 
2027  |  | #ifdef USE_SSE2_EMULATION  | 
2028  |  |         low.Store2Val(ptr);  | 
2029  |  |         high.Store2Val(ptr + 2);  | 
2030  |  | #else  | 
2031  | 0  |         __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(  | 
2032  | 0  |             low.xmm,  | 
2033  | 0  |             _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */  | 
2034  | 0  |         __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(  | 
2035  | 0  |             high.xmm,  | 
2036  | 0  |             _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */  | 
2037  | 0  |         auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),  | 
2038  | 0  |                                                    _mm_castsi128_ps(tmpHigh),  | 
2039  | 0  |                                                    _MM_SHUFFLE(1, 0, 1, 0)));  | 
2040  | 0  |         tmp = _mm_packs_epi32(tmp, tmp);  | 
2041  | 0  |         tmp = _mm_packus_epi16(tmp, tmp);  | 
2042  | 0  |         GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));  | 
2043  | 0  | #endif  | 
2044  | 0  |     }  | 
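  |  |     // Editor's note: the SSE2 path above narrows in two saturating  | 
  |  |     // steps: _mm_packs_epi32 (int32 -> int16, signed saturation), then  | 
  |  |     // _mm_packus_epi16 (int16 -> uint8, unsigned saturation). Combined,  | 
  |  |     // this clamps each rounded value to [0, 255]; a scalar sketch  | 
  |  |     // (illustrative only):  | 
  |  |     //  | 
  |  |     //   unsigned char narrow(int v)  // v = (int)(d + 0.5)  | 
  |  |     //   {  | 
  |  |     //       return static_cast<unsigned char>(v < 0 ? 0 : v > 255 ? 255 : v);  | 
  |  |     //   }  | 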
2045  |  |  | 
2046  |  |     inline void Store4Val(unsigned short *ptr) const  | 
2047  | 0  |     { | 
2048  | 0  | #if 1  | 
2049  | 0  |         low.Store2Val(ptr);  | 
2050  | 0  |         high.Store2Val(ptr + 2);  | 
2051  |  | #else  | 
2052  |  |         __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);  | 
2053  |  |         __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);  | 
2054  |  |         xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));  | 
2055  |  | #if defined(__SSE4_1__)  | 
2056  |  |         xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16  | 
2057  |  | #else  | 
2058  |  |         xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));  | 
2059  |  |         xmm0 = _mm_packs_epi32(xmm0, xmm0);  | 
2060  |  |         xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));  | 
2061  |  | #endif  | 
2062  |  |         GDALCopyXMMToInt64(xmm0, reinterpret_cast<GInt64 *>(ptr));  | 
2063  |  | #endif  | 
2064  | 0  |     }  | 
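  |  |     // Editor's note on the disabled #else branch above: plain SSE2 has  | 
  |  |     // no _mm_packus_epi32, so it is emulated with a bias trick: add  | 
  |  |     // -32768 to shift [0, 65535] into the signed range [-32768, 32767],  | 
  |  |     // pack with signed saturation, then subtract -32768 from the 16-bit  | 
  |  |     // result to undo the shift. Scalar sketch of one in-range lane  | 
  |  |     // (illustrative only):  | 
  |  |     //  | 
  |  |     //   unsigned short pack_lane(int v)  // assumes 0 <= v <= 65535  | 
  |  |     //   {  | 
  |  |     //       const short biased = static_cast<short>(v - 32768);  | 
  |  |     //       return static_cast<unsigned short>(biased + 32768);  | 
  |  |     //   }  | 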
2065  |  |  | 
2066  |  |     inline void Store4Val(float *ptr) const  | 
2067  | 0  |     { | 
2068  | 0  |         low.Store2Val(ptr);  | 
2069  | 0  |         high.Store2Val(ptr + 2);  | 
2070  | 0  |     }  | 
2071  |  |  | 
2072  |  |     inline void Store4Val(double *ptr) const  | 
2073  | 0  |     { | 
2074  | 0  |         low.Store2Val(ptr);  | 
2075  | 0  |         high.Store2Val(ptr + 2);  | 
2076  | 0  |     }  | 
2077  |  |  | 
2078  |  |     inline void StoreMask(unsigned char *ptr) const  | 
2079  | 0  |     { | 
2080  | 0  |         low.StoreMask(ptr);  | 
2081  | 0  |         high.StoreMask(ptr + 16);  | 
2082  | 0  |     }  | 
2083  |  | };  | 
2084  |  |  | 
2085  |  | #if !defined(USE_SSE2_EMULATION)  | 
2086  |  | inline XMMReg4Double XMMReg4Float::cast_to_double() const  | 
2087  | 0  | { | 
2088  | 0  |     XMMReg4Double ret;  | 
2089  | 0  |     ret.low.xmm = _mm_cvtps_pd(xmm);  | 
2090  | 0  |     ret.high.xmm = _mm_cvtps_pd(  | 
2091  | 0  |         _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));  | 
2092  | 0  |     return ret;  | 
2093  | 0  | }  | 
2094  |  |  | 
2095  |  | inline XMMReg4Double XMMReg4Int::cast_to_double() const  | 
2096  | 0  | { | 
2097  | 0  |     XMMReg4Double ret;  | 
2098  | 0  |     ret.low.xmm = _mm_cvtepi32_pd(xmm);  | 
2099  | 0  |     ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));  | 
2100  | 0  |     return ret;  | 
2101  | 0  | }  | 
2102  |  | #endif  | 
2103  |  |  | 
2104  |  | #endif  | 
2105  |  |  | 
2106  |  | #endif /* #ifndef DOXYGEN_SKIP */  | 
2107  |  |  | 
2108  |  | #endif /* GDALSSE_PRIV_H_INCLUDED */  |