Coverage Report

Created: 2025-06-22 06:59

/src/gdal/gcore/gdalsse_priv.h
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Project:  GDAL
4
 * Purpose:  SSE2 helper
5
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#ifndef GDALSSE_PRIV_H_INCLUDED
14
#define GDALSSE_PRIV_H_INCLUDED
15
16
#ifndef DOXYGEN_SKIP
17
18
#include "cpl_port.h"
19
20
/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
21
/* Could possibly be used too on 32bit, but we would need to check at runtime */
22
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) &&             \
23
    !defined(USE_SSE2_EMULATION)
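// Editor's note (illustrative sketch, not part of gdalsse_priv.h): the
// runtime check mentioned in the comment above could rely on the CPU
// feature detection that GCC and Clang provide; the helper name below is
// hypothetical.
//
//     static inline bool GDALHypotheticalHasSSE2()
//     {
//         return __builtin_cpu_supports("sse2") != 0;
//     }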
24
25
#include <string.h>
26
27
#ifdef USE_NEON_OPTIMIZATIONS
28
#include "include_sse2neon.h"
29
#else
30
/* Requires SSE2 */
31
#include <emmintrin.h>
32
33
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
34
#include <smmintrin.h>
35
#endif
36
#endif
37
38
#include "gdal_priv_templates.hpp"
39
40
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
41
0
{
42
0
    unsigned short s;
43
0
    memcpy(&s, ptr, 2);
44
0
    return _mm_cvtsi32_si128(s);
45
0
}
Unexecuted instantiation: overview.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyInt16ToXMM(void const*)
46
47
static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
48
0
{
49
0
    GInt32 i;
50
0
    memcpy(&i, ptr, 4);
51
0
    return _mm_cvtsi32_si128(i);
52
0
}
Unexecuted instantiation: overview.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyInt32ToXMM(void const*)
53
54
static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
55
0
{
56
#if defined(__i386__) || defined(_M_IX86)
57
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
58
#else
59
0
    GInt64 i;
60
0
    memcpy(&i, ptr, 8);
61
0
    return _mm_cvtsi64_si128(i);
62
0
#endif
63
0
}
Unexecuted instantiation: overview.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyInt64ToXMM(void const*)
64
65
#ifndef GDALCopyXMMToInt16_defined
66
#define GDALCopyXMMToInt16_defined
67
68
static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
69
0
{
70
0
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
71
0
    memcpy(pDest, &i, 2);
72
0
}
Unexecuted instantiation: overview.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
73
#endif
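// Editor's note (illustrative usage, not part of the original header): the
// memcpy-based helpers above move scalars in and out of XMM registers
// without alignment or strict-aliasing concerns. A round trip looks like:
//
//     unsigned short in = 42, out = 0;
//     const __m128i v = GDALCopyInt16ToXMM(&in);  // value in the low lane
//     GDALCopyXMMToInt16(v, &out);                // out == 42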
74
75
class XMMReg4Int;
76
77
class XMMReg4Double;
78
79
class XMMReg4Float
80
{
81
  public:
82
    __m128 xmm;
83
84
    XMMReg4Float()
85
#if !defined(_MSC_VER)
86
0
        : xmm(_mm_undefined_ps())
87
#endif
88
0
    {
89
0
    }
90
91
0
    XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
92
0
    {
93
0
    }
94
95
    static inline XMMReg4Float Zero()
96
0
    {
97
0
        XMMReg4Float reg;
98
0
        reg.Zeroize();
99
0
        return reg;
100
0
    }
101
102
    static inline XMMReg4Float Set1(float f)
103
0
    {
104
0
        XMMReg4Float reg;
105
0
        reg.xmm = _mm_set1_ps(f);
106
0
        return reg;
107
0
    }
108
109
    static inline XMMReg4Float Load4Val(const float *ptr)
110
0
    {
111
0
        XMMReg4Float reg;
112
0
        reg.nsLoad4Val(ptr);
113
0
        return reg;
114
0
    }
115
116
    static inline XMMReg4Float Load4Val(const unsigned char *ptr)
117
0
    {
118
0
        XMMReg4Float reg;
119
0
        reg.nsLoad4Val(ptr);
120
0
        return reg;
121
0
    }
122
123
    static inline XMMReg4Float Load4Val(const short *ptr)
124
0
    {
125
0
        XMMReg4Float reg;
126
0
        reg.nsLoad4Val(ptr);
127
0
        return reg;
128
0
    }
129
130
    static inline XMMReg4Float Load4Val(const unsigned short *ptr)
131
0
    {
132
0
        XMMReg4Float reg;
133
0
        reg.nsLoad4Val(ptr);
134
0
        return reg;
135
0
    }
136
137
    static inline XMMReg4Float Load4Val(const int *ptr)
138
0
    {
139
0
        XMMReg4Float reg;
140
0
        reg.nsLoad4Val(ptr);
141
0
        return reg;
142
0
    }
143
144
    static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
145
                                      const XMMReg4Float &expr2)
146
0
    {
147
0
        XMMReg4Float reg;
148
0
        reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
149
0
        return reg;
150
0
    }
151
152
    static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
153
                                         const XMMReg4Float &expr2)
154
0
    {
155
0
        XMMReg4Float reg;
156
0
        reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
157
0
        return reg;
158
0
    }
159
160
    static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
161
                                      const XMMReg4Float &expr2)
162
0
    {
163
0
        XMMReg4Float reg;
164
0
        reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
165
0
        return reg;
166
0
    }
167
168
    static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
169
                                       const XMMReg4Float &expr2)
170
0
    {
171
0
        XMMReg4Float reg;
172
0
        reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
173
0
        return reg;
174
0
    }
175
176
    static inline XMMReg4Float And(const XMMReg4Float &expr1,
177
                                   const XMMReg4Float &expr2)
178
0
    {
179
0
        XMMReg4Float reg;
180
0
        reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
181
0
        return reg;
182
0
    }
183
184
    static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
185
                                       const XMMReg4Float &true_expr,
186
                                       const XMMReg4Float &false_expr)
187
0
    {
188
0
        XMMReg4Float reg;
189
0
        reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
190
0
                            _mm_andnot_ps(cond.xmm, false_expr.xmm));
191
0
        return reg;
192
0
    }
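    // Editor's note (illustrative usage, not part of the original header):
    // the comparison helpers return all-ones/all-zero lane masks, so
    // Ternary() acts as a branchless per-lane select, e.g. clamping the
    // negative lanes of a hypothetical buffer pafValues to zero:
    //
    //     XMMReg4Float v = XMMReg4Float::Load4Val(pafValues);
    //     XMMReg4Float zero = XMMReg4Float::Zero();
    //     v = XMMReg4Float::Ternary(XMMReg4Float::Greater(v, zero), v, zero);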
193
194
    static inline XMMReg4Float Min(const XMMReg4Float &expr1,
195
                                   const XMMReg4Float &expr2)
196
0
    {
197
0
        XMMReg4Float reg;
198
0
        reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
199
0
        return reg;
200
0
    }
201
202
    static inline XMMReg4Float Max(const XMMReg4Float &expr1,
203
                                   const XMMReg4Float &expr2)
204
0
    {
205
0
        XMMReg4Float reg;
206
0
        reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
207
0
        return reg;
208
0
    }
209
210
    inline void nsLoad4Val(const float *ptr)
211
0
    {
212
0
        xmm = _mm_loadu_ps(ptr);
213
0
    }
214
215
    static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
216
                                 XMMReg4Float &r1, XMMReg4Float &r2,
217
                                 XMMReg4Float &r3)
218
0
    {
219
0
        r0.nsLoad4Val(ptr);
220
0
        r1.nsLoad4Val(ptr + 4);
221
0
        r2.nsLoad4Val(ptr + 8);
222
0
        r3.nsLoad4Val(ptr + 12);
223
0
    }
224
225
    inline void nsLoad4Val(const int *ptr)
226
0
    {
227
0
        const __m128i xmm_i =
228
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
229
0
        xmm = _mm_cvtepi32_ps(xmm_i);
230
0
    }
231
232
    static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
233
                                 XMMReg4Float &r1, XMMReg4Float &r2,
234
                                 XMMReg4Float &r3)
235
0
    {
236
0
        r0.nsLoad4Val(ptr);
237
0
        r1.nsLoad4Val(ptr + 4);
238
0
        r2.nsLoad4Val(ptr + 8);
239
0
        r3.nsLoad4Val(ptr + 12);
240
0
    }
241
242
    static inline __m128i cvtepu8_epi32(__m128i x)
243
0
    {
244
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
245
        return _mm_cvtepu8_epi32(x);
246
#else
247
0
        return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
248
0
                                  _mm_setzero_si128());
249
0
#endif
250
0
    }
251
252
    inline void nsLoad4Val(const unsigned char *ptr)
253
0
    {
254
0
        const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
255
0
        xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
256
0
    }
257
258
    static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
259
                                XMMReg4Float &r1)
260
0
    {
261
0
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
262
0
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
263
0
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
264
0
    }
265
266
    static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
267
                                 XMMReg4Float &r1, XMMReg4Float &r2,
268
                                 XMMReg4Float &r3)
269
0
    {
270
0
        const __m128i xmm_i =
271
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
272
0
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
273
0
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
274
0
        r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
275
0
        r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
276
0
    }
277
278
    static inline __m128i cvtepi16_epi32(__m128i x)
279
0
    {
280
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
281
        return _mm_cvtepi16_epi32(x);
282
#else
283
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
284
0
        return _mm_srai_epi32(
285
            /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
286
0
            _mm_unpacklo_epi16(x, x), 16);
287
0
#endif
288
0
    }
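    // Editor's note, worked example of the SSE2 fallback above: for a
    // 16-bit lane a = 0xFFFE (-2), _mm_unpacklo_epi16(x, x) duplicates it
    // into the 32-bit lane 0xFFFEFFFE, and the arithmetic shift right by 16
    // yields 0xFFFFFFFE, i.e. the sign-extended value -2.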
289
290
    inline void nsLoad4Val(const short *ptr)
291
0
    {
292
0
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
293
0
        xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
294
0
    }
295
296
    static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
297
                                XMMReg4Float &r1)
298
0
    {
299
0
        const __m128i xmm_i =
300
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
301
0
        r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
302
0
        r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
303
0
    }
304
305
    static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
306
                                 XMMReg4Float &r1, XMMReg4Float &r2,
307
                                 XMMReg4Float &r3)
308
0
    {
309
0
        Load8Val(ptr, r0, r1);
310
0
        Load8Val(ptr + 8, r2, r3);
311
0
    }
312
313
    static inline __m128i cvtepu16_epi32(__m128i x)
314
0
    {
315
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
316
        return _mm_cvtepu16_epi32(x);
317
#else
318
0
        return _mm_unpacklo_epi16(
319
0
            x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
320
0
#endif
321
0
    }
322
323
    inline void nsLoad4Val(const unsigned short *ptr)
324
0
    {
325
0
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
326
0
        xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
327
0
    }
328
329
    static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
330
                                XMMReg4Float &r1)
331
0
    {
332
0
        const __m128i xmm_i =
333
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
334
0
        r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
335
0
        r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
336
0
    }
337
338
    static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
339
                                 XMMReg4Float &r1, XMMReg4Float &r2,
340
                                 XMMReg4Float &r3)
341
0
    {
342
0
        Load8Val(ptr, r0, r1);
343
0
        Load8Val(ptr + 8, r2, r3);
344
0
    }
345
346
    inline void Zeroize()
347
0
    {
348
0
        xmm = _mm_setzero_ps();
349
0
    }
350
351
    inline XMMReg4Float &operator=(const XMMReg4Float &other)
352
0
    {
353
0
        xmm = other.xmm;
354
0
        return *this;
355
0
    }
356
357
    inline XMMReg4Float &operator+=(const XMMReg4Float &other)
358
0
    {
359
0
        xmm = _mm_add_ps(xmm, other.xmm);
360
0
        return *this;
361
0
    }
362
363
    inline XMMReg4Float &operator-=(const XMMReg4Float &other)
364
0
    {
365
0
        xmm = _mm_sub_ps(xmm, other.xmm);
366
0
        return *this;
367
0
    }
368
369
    inline XMMReg4Float &operator*=(const XMMReg4Float &other)
370
0
    {
371
0
        xmm = _mm_mul_ps(xmm, other.xmm);
372
0
        return *this;
373
0
    }
374
375
    inline XMMReg4Float operator+(const XMMReg4Float &other) const
376
0
    {
377
0
        XMMReg4Float ret;
378
0
        ret.xmm = _mm_add_ps(xmm, other.xmm);
379
0
        return ret;
380
0
    }
381
382
    inline XMMReg4Float operator-(const XMMReg4Float &other) const
383
0
    {
384
0
        XMMReg4Float ret;
385
0
        ret.xmm = _mm_sub_ps(xmm, other.xmm);
386
0
        return ret;
387
0
    }
388
389
    inline XMMReg4Float operator*(const XMMReg4Float &other) const
390
0
    {
391
0
        XMMReg4Float ret;
392
0
        ret.xmm = _mm_mul_ps(xmm, other.xmm);
393
0
        return ret;
394
0
    }
395
396
    inline XMMReg4Float operator/(const XMMReg4Float &other) const
397
0
    {
398
0
        XMMReg4Float ret;
399
0
        ret.xmm = _mm_div_ps(xmm, other.xmm);
400
0
        return ret;
401
0
    }
402
403
    inline XMMReg4Float inverse() const
404
0
    {
405
0
        XMMReg4Float ret;
406
0
        ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
407
0
        return ret;
408
0
    }
409
410
    inline XMMReg4Int truncate_to_int() const;
411
412
    inline XMMReg4Double cast_to_double() const;
413
414
    inline void Store4Val(float *ptr) const
415
0
    {
416
0
        _mm_storeu_ps(ptr, xmm);
417
0
    }
418
419
    inline void Store4ValAligned(float *ptr) const
420
0
    {
421
0
        _mm_store_ps(ptr, xmm);
422
0
    }
423
424
    inline operator float() const
425
0
    {
426
0
        return _mm_cvtss_f32(xmm);
427
0
    }
428
};
429
430
class XMMReg4Int
431
{
432
  public:
433
    __m128i xmm;
434
435
    XMMReg4Int()
436
#if !defined(_MSC_VER)
437
        : xmm(_mm_undefined_si128())
438
#endif
439
0
    {
440
0
    }
441
442
    XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
443
0
    {
444
0
    }
445
446
    static inline XMMReg4Int Zero()
447
0
    {
448
0
        XMMReg4Int reg;
449
0
        reg.xmm = _mm_setzero_si128();
450
0
        return reg;
451
0
    }
452
453
    static inline XMMReg4Int Set1(int i)
454
0
    {
455
0
        XMMReg4Int reg;
456
0
        reg.xmm = _mm_set1_epi32(i);
457
0
        return reg;
458
0
    }
459
460
    static inline XMMReg4Int Load4Val(const int *ptr)
461
0
    {
462
0
        XMMReg4Int reg;
463
0
        reg.nsLoad4Val(ptr);
464
0
        return reg;
465
0
    }
466
467
    inline void nsLoad4Val(const int *ptr)
468
0
    {
469
0
        xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
470
0
    }
471
472
    static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
473
                                    const XMMReg4Int &expr2)
474
0
    {
475
0
        XMMReg4Int reg;
476
0
        reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
477
0
        return reg;
478
0
    }
479
480
    static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
481
                                     const XMMReg4Int &true_expr,
482
                                     const XMMReg4Int &false_expr)
483
0
    {
484
0
        XMMReg4Int reg;
485
0
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
486
0
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
487
0
        return reg;
488
0
    }
489
490
    inline XMMReg4Int &operator+=(const XMMReg4Int &other)
491
0
    {
492
0
        xmm = _mm_add_epi32(xmm, other.xmm);
493
0
        return *this;
494
0
    }
495
496
    inline XMMReg4Int &operator-=(const XMMReg4Int &other)
497
0
    {
498
0
        xmm = _mm_sub_epi32(xmm, other.xmm);
499
0
        return *this;
500
0
    }
501
502
    inline XMMReg4Int operator+(const XMMReg4Int &other) const
503
0
    {
504
0
        XMMReg4Int ret;
505
0
        ret.xmm = _mm_add_epi32(xmm, other.xmm);
506
0
        return ret;
507
0
    }
508
509
    inline XMMReg4Int operator-(const XMMReg4Int &other) const
510
0
    {
511
0
        XMMReg4Int ret;
512
0
        ret.xmm = _mm_sub_epi32(xmm, other.xmm);
513
0
        return ret;
514
0
    }
515
516
    XMMReg4Double cast_to_double() const;
517
518
    XMMReg4Float to_float() const
519
0
    {
520
0
        XMMReg4Float ret;
521
0
        ret.xmm = _mm_cvtepi32_ps(xmm);
522
0
        return ret;
523
0
    }
524
};
525
526
inline XMMReg4Int XMMReg4Float::truncate_to_int() const
527
0
{
528
0
    XMMReg4Int ret;
529
0
    ret.xmm = _mm_cvttps_epi32(xmm);
530
0
    return ret;
531
0
}
532
533
class XMMReg8Byte
534
{
535
  public:
536
    __m128i xmm;
537
538
    XMMReg8Byte()
539
#if !defined(_MSC_VER)
540
        : xmm(_mm_undefined_si128())
541
#endif
542
0
    {
543
0
    }
544
545
    XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
546
0
    {
547
0
    }
548
549
    static inline XMMReg8Byte Zero()
550
0
    {
551
0
        XMMReg8Byte reg;
552
0
        reg.xmm = _mm_setzero_si128();
553
0
        return reg;
554
0
    }
555
556
    static inline XMMReg8Byte Set1(char i)
557
0
    {
558
0
        XMMReg8Byte reg;
559
0
        reg.xmm = _mm_set1_epi8(i);
560
0
        return reg;
561
0
    }
562
563
    static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
564
                                     const XMMReg8Byte &expr2)
565
0
    {
566
0
        XMMReg8Byte reg;
567
0
        reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
568
0
        return reg;
569
0
    }
570
571
    static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
572
                                 const XMMReg8Byte &expr2)
573
0
    {
574
0
        XMMReg8Byte reg;
575
0
        reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
576
0
        return reg;
577
0
    }
578
579
    static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
580
                                      const XMMReg8Byte &true_expr,
581
                                      const XMMReg8Byte &false_expr)
582
0
    {
583
0
        XMMReg8Byte reg;
584
0
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
585
0
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
586
0
        return reg;
587
0
    }
588
589
    inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
590
0
    {
591
0
        XMMReg8Byte ret;
592
0
        ret.xmm = _mm_add_epi8(xmm, other.xmm);
593
0
        return ret;
594
0
    }
595
596
    inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
597
0
    {
598
0
        XMMReg8Byte ret;
599
0
        ret.xmm = _mm_sub_epi8(xmm, other.xmm);
600
0
        return ret;
601
0
    }
602
603
    static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
604
0
    {
605
0
        XMMReg8Byte reg;
606
0
        reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
607
0
        reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
608
0
        return reg;
609
0
    }
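    // Editor's note: the two pack steps above saturate rather than
    // truncate: _mm_packs_epi32 clamps each int32 to the int16 range and
    // _mm_packus_epi16 then clamps to [0, 255], so an input lane of 300
    // comes out as 255 and a lane of -5 as 0.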
610
611
    inline void Store8Val(unsigned char *ptr) const
612
0
    {
613
0
        GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
614
0
    }
615
};
616
617
class XMMReg2Double
618
{
619
  public:
620
    __m128d xmm;
621
622
    XMMReg2Double()
623
#if !defined(_MSC_VER)
624
0
        : xmm(_mm_undefined_pd())
625
#endif
626
0
    {
627
0
    }
628
629
    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
630
0
    {
631
0
    }
632
633
0
    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
634
0
    {
635
0
    }
636
637
    static inline XMMReg2Double Set1(double d)
638
0
    {
639
0
        XMMReg2Double reg;
640
0
        reg.xmm = _mm_set1_pd(d);
641
0
        return reg;
642
0
    }
643
644
    static inline XMMReg2Double Zero()
645
0
    {
646
0
        XMMReg2Double reg;
647
0
        reg.Zeroize();
648
0
        return reg;
649
0
    }
650
651
    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
652
0
    {
653
0
        XMMReg2Double reg;
654
0
        reg.nsLoad1ValHighAndLow(ptr);
655
0
        return reg;
656
0
    }
657
658
    static inline XMMReg2Double Load2Val(const double *ptr)
659
0
    {
660
0
        XMMReg2Double reg;
661
0
        reg.nsLoad2Val(ptr);
662
0
        return reg;
663
0
    }
664
665
    static inline XMMReg2Double Load2Val(const float *ptr)
666
0
    {
667
0
        XMMReg2Double reg;
668
0
        reg.nsLoad2Val(ptr);
669
0
        return reg;
670
0
    }
671
672
    static inline XMMReg2Double Load2ValAligned(const double *ptr)
673
0
    {
674
0
        XMMReg2Double reg;
675
0
        reg.nsLoad2ValAligned(ptr);
676
0
        return reg;
677
0
    }
678
679
    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
680
0
    {
681
0
        XMMReg2Double reg;
682
0
        reg.nsLoad2Val(ptr);
683
0
        return reg;
684
0
    }
685
686
    static inline XMMReg2Double Load2Val(const short *ptr)
687
0
    {
688
0
        XMMReg2Double reg;
689
0
        reg.nsLoad2Val(ptr);
690
0
        return reg;
691
0
    }
692
693
    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
694
0
    {
695
0
        XMMReg2Double reg;
696
0
        reg.nsLoad2Val(ptr);
697
0
        return reg;
698
0
    }
699
700
    static inline XMMReg2Double Load2Val(const int *ptr)
701
0
    {
702
0
        XMMReg2Double reg;
703
0
        reg.nsLoad2Val(ptr);
704
0
        return reg;
705
0
    }
706
707
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
708
                                       const XMMReg2Double &expr2)
709
0
    {
710
0
        XMMReg2Double reg;
711
0
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
712
0
        return reg;
713
0
    }
714
715
    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
716
                                          const XMMReg2Double &expr2)
717
0
    {
718
0
        XMMReg2Double reg;
719
0
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
720
0
        return reg;
721
0
    }
722
723
    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
724
                                        const XMMReg2Double &expr2)
725
0
    {
726
0
        XMMReg2Double reg;
727
0
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
728
0
        return reg;
729
0
    }
730
731
    static inline XMMReg2Double And(const XMMReg2Double &expr1,
732
                                    const XMMReg2Double &expr2)
733
0
    {
734
0
        XMMReg2Double reg;
735
0
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
736
0
        return reg;
737
0
    }
738
739
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
740
                                        const XMMReg2Double &true_expr,
741
                                        const XMMReg2Double &false_expr)
742
0
    {
743
0
        XMMReg2Double reg;
744
0
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
745
0
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
746
0
        return reg;
747
0
    }
748
749
    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
750
                                    const XMMReg2Double &expr2)
751
0
    {
752
0
        XMMReg2Double reg;
753
0
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
754
0
        return reg;
755
0
    }
756
757
    inline void nsLoad1ValHighAndLow(const double *ptr)
758
0
    {
759
0
        xmm = _mm_load1_pd(ptr);
760
0
    }
761
762
    inline void nsLoad2Val(const double *ptr)
763
0
    {
764
0
        xmm = _mm_loadu_pd(ptr);
765
0
    }
766
767
    inline void nsLoad2ValAligned(const double *ptr)
768
0
    {
769
0
        xmm = _mm_load_pd(ptr);
770
0
    }
771
772
    inline void nsLoad2Val(const float *ptr)
773
0
    {
774
0
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
775
0
    }
776
777
    inline void nsLoad2Val(const int *ptr)
778
0
    {
779
0
        xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
780
0
    }
781
782
    inline void nsLoad2Val(const unsigned char *ptr)
783
0
    {
784
0
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
785
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
786
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
787
#else
788
0
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
789
0
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
790
0
#endif
791
0
        xmm = _mm_cvtepi32_pd(xmm_i);
792
0
    }
793
794
    inline void nsLoad2Val(const short *ptr)
795
0
    {
796
0
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
797
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
798
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
799
#else
800
0
        xmm_i = _mm_unpacklo_epi16(
801
0
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
802
0
        xmm_i = _mm_srai_epi32(
803
0
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
804
0
#endif
805
0
        xmm = _mm_cvtepi32_pd(xmm_i);
806
0
    }
807
808
    inline void nsLoad2Val(const unsigned short *ptr)
809
0
    {
810
0
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
811
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
812
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
813
#else
814
0
        xmm_i = _mm_unpacklo_epi16(
815
0
            xmm_i,
816
0
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
817
0
#endif
818
0
        xmm = _mm_cvtepi32_pd(xmm_i);
819
0
    }
820
821
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
822
                                XMMReg2Double &high)
823
0
    {
824
0
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
825
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
826
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
827
#else
828
0
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
829
0
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
830
0
#endif
831
0
        low.xmm = _mm_cvtepi32_pd(xmm_i);
832
0
        high.xmm =
833
0
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
834
0
    }
835
836
    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
837
                                XMMReg2Double &high)
838
0
    {
839
0
        low.nsLoad2Val(ptr);
840
0
        high.nsLoad2Val(ptr + 2);
841
0
    }
842
843
    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
844
                                XMMReg2Double &high)
845
0
    {
846
0
        low.nsLoad2Val(ptr);
847
0
        high.nsLoad2Val(ptr + 2);
848
0
    }
849
850
    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
851
                                XMMReg2Double &high)
852
0
    {
853
0
        low.nsLoad2Val(ptr);
854
0
        high.nsLoad2Val(ptr + 2);
855
0
    }
856
857
    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
858
                                XMMReg2Double &high)
859
0
    {
860
0
        __m128 temp1 = _mm_loadu_ps(ptr);
861
0
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
862
0
        low.xmm = _mm_cvtps_pd(temp1);
863
0
        high.xmm = _mm_cvtps_pd(temp2);
864
0
    }
865
866
    inline void Zeroize()
867
0
    {
868
0
        xmm = _mm_setzero_pd();
869
0
    }
870
871
    inline XMMReg2Double &operator=(const XMMReg2Double &other)
872
0
    {
873
0
        xmm = other.xmm;
874
0
        return *this;
875
0
    }
876
877
    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
878
0
    {
879
0
        xmm = _mm_add_pd(xmm, other.xmm);
880
0
        return *this;
881
0
    }
882
883
    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
884
0
    {
885
0
        xmm = _mm_mul_pd(xmm, other.xmm);
886
0
        return *this;
887
0
    }
888
889
    inline XMMReg2Double operator+(const XMMReg2Double &other) const
890
0
    {
891
0
        XMMReg2Double ret;
892
0
        ret.xmm = _mm_add_pd(xmm, other.xmm);
893
0
        return ret;
894
0
    }
895
896
    inline XMMReg2Double operator-(const XMMReg2Double &other) const
897
0
    {
898
0
        XMMReg2Double ret;
899
0
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
900
0
        return ret;
901
0
    }
902
903
    inline XMMReg2Double operator*(const XMMReg2Double &other) const
904
0
    {
905
0
        XMMReg2Double ret;
906
0
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
907
0
        return ret;
908
0
    }
909
910
    inline XMMReg2Double operator/(const XMMReg2Double &other) const
911
0
    {
912
0
        XMMReg2Double ret;
913
0
        ret.xmm = _mm_div_pd(xmm, other.xmm);
914
0
        return ret;
915
0
    }
916
917
    inline double GetHorizSum() const
918
0
    {
919
0
        __m128d xmm2;
920
0
        xmm2 = _mm_shuffle_pd(
921
0
            xmm, xmm,
922
0
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
923
0
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
924
0
    }
925
926
    inline void Store2Val(double *ptr) const
927
0
    {
928
0
        _mm_storeu_pd(ptr, xmm);
929
0
    }
930
931
    inline void Store2ValAligned(double *ptr) const
932
0
    {
933
0
        _mm_store_pd(ptr, xmm);
934
0
    }
935
936
    inline void Store2Val(float *ptr) const
937
0
    {
938
0
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
939
0
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
940
0
    }
941
942
    inline void Store2Val(unsigned char *ptr) const
943
0
    {
944
0
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
945
0
            xmm,
946
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
947
0
        tmp = _mm_packs_epi32(tmp, tmp);
948
0
        tmp = _mm_packus_epi16(tmp, tmp);
949
0
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
950
0
    }
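    // Editor's note: adding 0.5 before _mm_cvttpd_epi32 turns truncation
    // into round-half-up for the non-negative values expected here (e.g.
    // 254.7 + 0.5 = 255.2 truncates to 255), and the packs/packus pair
    // clamps the result to [0, 255] as in XMMReg8Byte::Pack().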
951
952
    inline void Store2Val(unsigned short *ptr) const
953
0
    {
954
0
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
955
0
            xmm,
956
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
957
        // X X X X 0 B 0 A --> X X X X A A B A
958
0
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
959
0
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
960
0
    }
961
962
    inline void StoreMask(unsigned char *ptr) const
963
0
    {
964
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
965
0
                         _mm_castpd_si128(xmm));
966
0
    }
967
968
    inline operator double() const
969
0
    {
970
0
        return _mm_cvtsd_f64(xmm);
971
0
    }
972
};
973
974
#else
975
976
#ifndef NO_WARN_USE_SSE2_EMULATION
977
#warning "Software emulation of SSE2 !"
978
#endif
979
980
class XMMReg2Double
981
{
982
  public:
983
    double low;
984
    double high;
985
986
    // cppcheck-suppress uninitMemberVar
987
    XMMReg2Double() = default;
988
989
    explicit XMMReg2Double(double val)
990
    {
991
        low = val;
992
        high = 0.0;
993
    }
994
995
    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
996
    {
997
    }
998
999
    static inline XMMReg2Double Zero()
1000
    {
1001
        XMMReg2Double reg;
1002
        reg.Zeroize();
1003
        return reg;
1004
    }
1005
1006
    static inline XMMReg2Double Set1(double d)
1007
    {
1008
        XMMReg2Double reg;
1009
        reg.low = d;
1010
        reg.high = d;
1011
        return reg;
1012
    }
1013
1014
    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
1015
    {
1016
        XMMReg2Double reg;
1017
        reg.nsLoad1ValHighAndLow(ptr);
1018
        return reg;
1019
    }
1020
1021
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
1022
                                       const XMMReg2Double &expr2)
1023
    {
1024
        XMMReg2Double reg;
1025
1026
        if (expr1.low == expr2.low)
1027
            memset(&(reg.low), 0xFF, sizeof(double));
1028
        else
1029
            reg.low = 0;
1030
1031
        if (expr1.high == expr2.high)
1032
            memset(&(reg.high), 0xFF, sizeof(double));
1033
        else
1034
            reg.high = 0;
1035
1036
        return reg;
1037
    }
1038
1039
    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
1040
                                          const XMMReg2Double &expr2)
1041
    {
1042
        XMMReg2Double reg;
1043
1044
        if (expr1.low != expr2.low)
1045
            memset(&(reg.low), 0xFF, sizeof(double));
1046
        else
1047
            reg.low = 0;
1048
1049
        if (expr1.high != expr2.high)
1050
            memset(&(reg.high), 0xFF, sizeof(double));
1051
        else
1052
            reg.high = 0;
1053
1054
        return reg;
1055
    }
1056
1057
    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
1058
                                        const XMMReg2Double &expr2)
1059
    {
1060
        XMMReg2Double reg;
1061
1062
        if (expr1.low > expr2.low)
1063
            memset(&(reg.low), 0xFF, sizeof(double));
1064
        else
1065
            reg.low = 0;
1066
1067
        if (expr1.high > expr2.high)
1068
            memset(&(reg.high), 0xFF, sizeof(double));
1069
        else
1070
            reg.high = 0;
1071
1072
        return reg;
1073
    }
1074
1075
    static inline XMMReg2Double And(const XMMReg2Double &expr1,
1076
                                    const XMMReg2Double &expr2)
1077
    {
1078
        XMMReg2Double reg;
1079
        int low1[2], high1[2];
1080
        int low2[2], high2[2];
1081
        memcpy(low1, &expr1.low, sizeof(double));
1082
        memcpy(high1, &expr1.high, sizeof(double));
1083
        memcpy(low2, &expr2.low, sizeof(double));
1084
        memcpy(high2, &expr2.high, sizeof(double));
1085
        low1[0] &= low2[0];
1086
        low1[1] &= low2[1];
1087
        high1[0] &= high2[0];
1088
        high1[1] &= high2[1];
1089
        memcpy(&reg.low, low1, sizeof(double));
1090
        memcpy(&reg.high, high1, sizeof(double));
1091
        return reg;
1092
    }
1093
1094
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
1095
                                        const XMMReg2Double &true_expr,
1096
                                        const XMMReg2Double &false_expr)
1097
    {
1098
        XMMReg2Double reg;
1099
        if (cond.low != 0)
1100
            reg.low = true_expr.low;
1101
        else
1102
            reg.low = false_expr.low;
1103
        if (cond.high != 0)
1104
            reg.high = true_expr.high;
1105
        else
1106
            reg.high = false_expr.high;
1107
        return reg;
1108
    }
1109
1110
    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
1111
                                    const XMMReg2Double &expr2)
1112
    {
1113
        XMMReg2Double reg;
1114
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
1115
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
1116
        return reg;
1117
    }
1118
1119
    static inline XMMReg2Double Load2Val(const double *ptr)
1120
    {
1121
        XMMReg2Double reg;
1122
        reg.nsLoad2Val(ptr);
1123
        return reg;
1124
    }
1125
1126
    static inline XMMReg2Double Load2ValAligned(const double *ptr)
1127
    {
1128
        XMMReg2Double reg;
1129
        reg.nsLoad2ValAligned(ptr);
1130
        return reg;
1131
    }
1132
1133
    static inline XMMReg2Double Load2Val(const float *ptr)
1134
    {
1135
        XMMReg2Double reg;
1136
        reg.nsLoad2Val(ptr);
1137
        return reg;
1138
    }
1139
1140
    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
1141
    {
1142
        XMMReg2Double reg;
1143
        reg.nsLoad2Val(ptr);
1144
        return reg;
1145
    }
1146
1147
    static inline XMMReg2Double Load2Val(const short *ptr)
1148
    {
1149
        XMMReg2Double reg;
1150
        reg.nsLoad2Val(ptr);
1151
        return reg;
1152
    }
1153
1154
    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
1155
    {
1156
        XMMReg2Double reg;
1157
        reg.nsLoad2Val(ptr);
1158
        return reg;
1159
    }
1160
1161
    static inline XMMReg2Double Load2Val(const int *ptr)
1162
    {
1163
        XMMReg2Double reg;
1164
        reg.nsLoad2Val(ptr);
1165
        return reg;
1166
    }
1167
1168
    inline void nsLoad1ValHighAndLow(const double *ptr)
1169
    {
1170
        low = ptr[0];
1171
        high = ptr[0];
1172
    }
1173
1174
    inline void nsLoad2Val(const double *ptr)
1175
    {
1176
        low = ptr[0];
1177
        high = ptr[1];
1178
    }
1179
1180
    inline void nsLoad2ValAligned(const double *ptr)
1181
    {
1182
        low = ptr[0];
1183
        high = ptr[1];
1184
    }
1185
1186
    inline void nsLoad2Val(const float *ptr)
1187
    {
1188
        low = ptr[0];
1189
        high = ptr[1];
1190
    }
1191
1192
    inline void nsLoad2Val(const unsigned char *ptr)
1193
    {
1194
        low = ptr[0];
1195
        high = ptr[1];
1196
    }
1197
1198
    inline void nsLoad2Val(const short *ptr)
1199
    {
1200
        low = ptr[0];
1201
        high = ptr[1];
1202
    }
1203
1204
    inline void nsLoad2Val(const unsigned short *ptr)
1205
    {
1206
        low = ptr[0];
1207
        high = ptr[1];
1208
    }
1209
1210
    inline void nsLoad2Val(const int *ptr)
1211
    {
1212
        low = ptr[0];
1213
        high = ptr[1];
1214
    }
1215
1216
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
1217
                                XMMReg2Double &high)
1218
    {
1219
        low.low = ptr[0];
1220
        low.high = ptr[1];
1221
        high.low = ptr[2];
1222
        high.high = ptr[3];
1223
    }
1224
1225
    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
1226
                                XMMReg2Double &high)
1227
    {
1228
        low.nsLoad2Val(ptr);
1229
        high.nsLoad2Val(ptr + 2);
1230
    }
1231
1232
    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
1233
                                XMMReg2Double &high)
1234
    {
1235
        low.nsLoad2Val(ptr);
1236
        high.nsLoad2Val(ptr + 2);
1237
    }
1238
1239
    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
1240
                                XMMReg2Double &high)
1241
    {
1242
        low.nsLoad2Val(ptr);
1243
        high.nsLoad2Val(ptr + 2);
1244
    }
1245
1246
    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
1247
                                XMMReg2Double &high)
1248
    {
1249
        low.nsLoad2Val(ptr);
1250
        high.nsLoad2Val(ptr + 2);
1251
    }
1252
1253
    inline void Zeroize()
1254
    {
1255
        low = 0.0;
1256
        high = 0.0;
1257
    }
1258
1259
    inline XMMReg2Double &operator=(const XMMReg2Double &other)
1260
    {
1261
        low = other.low;
1262
        high = other.high;
1263
        return *this;
1264
    }
1265
1266
    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
1267
    {
1268
        low += other.low;
1269
        high += other.high;
1270
        return *this;
1271
    }
1272
1273
    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
1274
    {
1275
        low *= other.low;
1276
        high *= other.high;
1277
        return *this;
1278
    }
1279
1280
    inline XMMReg2Double operator+(const XMMReg2Double &other) const
1281
    {
1282
        XMMReg2Double ret;
1283
        ret.low = low + other.low;
1284
        ret.high = high + other.high;
1285
        return ret;
1286
    }
1287
1288
    inline XMMReg2Double operator-(const XMMReg2Double &other) const
1289
    {
1290
        XMMReg2Double ret;
1291
        ret.low = low - other.low;
1292
        ret.high = high - other.high;
1293
        return ret;
1294
    }
1295
1296
    inline XMMReg2Double operator*(const XMMReg2Double &other) const
1297
    {
1298
        XMMReg2Double ret;
1299
        ret.low = low * other.low;
1300
        ret.high = high * other.high;
1301
        return ret;
1302
    }
1303
1304
    inline XMMReg2Double operator/(const XMMReg2Double &other) const
1305
    {
1306
        XMMReg2Double ret;
1307
        ret.low = low / other.low;
1308
        ret.high = high / other.high;
1309
        return ret;
1310
    }
1311
1312
    inline double GetHorizSum() const
1313
    {
1314
        return low + high;
1315
    }
1316
1317
    inline void Store2Val(double *ptr) const
1318
    {
1319
        ptr[0] = low;
1320
        ptr[1] = high;
1321
    }
1322
1323
    inline void Store2ValAligned(double *ptr) const
1324
    {
1325
        ptr[0] = low;
1326
        ptr[1] = high;
1327
    }
1328
1329
    inline void Store2Val(float *ptr) const
1330
    {
1331
        ptr[0] = static_cast<float>(low);
1332
        ptr[1] = static_cast<float>(high);
1333
    }
1334
1335
    void Store2Val(unsigned char *ptr) const
1336
    {
1337
        ptr[0] = static_cast<unsigned char>(low + 0.5);
1338
        ptr[1] = static_cast<unsigned char>(high + 0.5);
1339
    }
1340
1341
    void Store2Val(unsigned short *ptr) const
1342
    {
1343
        ptr[0] = static_cast<GUInt16>(low + 0.5);
1344
        ptr[1] = static_cast<GUInt16>(high + 0.5);
1345
    }
1346
1347
    inline void StoreMask(unsigned char *ptr) const
1348
    {
1349
        memcpy(ptr, &low, 8);
1350
        memcpy(ptr + 8, &high, 8);
1351
    }
1352
1353
    inline operator double() const
1354
    {
1355
        return low;
1356
    }
1357
};
1358
1359
#endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && !defined(USE_SSE2_EMULATION) */
1360
1361
#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
1362
1363
#include <immintrin.h>
1364
1365
class XMMReg4Double
1366
{
1367
  public:
1368
    __m256d ymm;
1369
1370
    XMMReg4Double() : ymm(_mm256_setzero_pd())
1371
    {
1372
    }
1373
1374
    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
1375
    {
1376
    }
1377
1378
    static inline XMMReg4Double Zero()
1379
    {
1380
        XMMReg4Double reg;
1381
        reg.Zeroize();
1382
        return reg;
1383
    }
1384
1385
    static inline XMMReg4Double Set1(double d)
1386
    {
1387
        XMMReg4Double reg;
1388
        reg.ymm = _mm256_set1_pd(d);
1389
        return reg;
1390
    }
1391
1392
    inline void Zeroize()
1393
    {
1394
        ymm = _mm256_setzero_pd();
1395
    }
1396
1397
    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
1398
    {
1399
        XMMReg4Double reg;
1400
        reg.nsLoad1ValHighAndLow(ptr);
1401
        return reg;
1402
    }
1403
1404
    inline void nsLoad1ValHighAndLow(const double *ptr)
1405
    {
1406
        ymm = _mm256_set1_pd(*ptr);
1407
    }
1408
1409
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
1410
    {
1411
        XMMReg4Double reg;
1412
        reg.nsLoad4Val(ptr);
1413
        return reg;
1414
    }
1415
1416
    inline void nsLoad4Val(const unsigned char *ptr)
1417
    {
1418
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
1419
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
1420
        ymm = _mm256_cvtepi32_pd(xmm_i);
1421
    }
1422
1423
    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
1424
                                XMMReg4Double &high)
1425
    {
1426
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1427
        const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
1428
        low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
1429
        const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
1430
        high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
1431
    }
1432
1433
    static inline XMMReg4Double Load4Val(const short *ptr)
1434
    {
1435
        XMMReg4Double reg;
1436
        reg.nsLoad4Val(ptr);
1437
        return reg;
1438
    }
1439
1440
    inline void nsLoad4Val(const short *ptr)
1441
    {
1442
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1443
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
1444
        ymm = _mm256_cvtepi32_pd(xmm_i);
1445
    }
1446
1447
    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
1448
                                XMMReg4Double &high)
1449
    {
1450
        low.nsLoad4Val(ptr);
1451
        high.nsLoad4Val(ptr + 4);
1452
    }
1453
1454
    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
1455
    {
1456
        XMMReg4Double reg;
1457
        reg.nsLoad4Val(ptr);
1458
        return reg;
1459
    }
1460
1461
    inline void nsLoad4Val(const unsigned short *ptr)
1462
    {
1463
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1464
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
1465
        ymm = _mm256_cvtepi32_pd(
1466
            xmm_i);  // ok to use signed conversion since we are in the ushort
1467
                     // range, so cannot be interpreted as negative int32
1468
    }
1469
1470
    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
1471
                                XMMReg4Double &high)
1472
    {
1473
        low.nsLoad4Val(ptr);
1474
        high.nsLoad4Val(ptr + 4);
1475
    }
1476
1477
    static inline XMMReg4Double Load4Val(const double *ptr)
1478
    {
1479
        XMMReg4Double reg;
1480
        reg.nsLoad4Val(ptr);
1481
        return reg;
1482
    }
1483
1484
    inline void nsLoad4Val(const double *ptr)
1485
    {
1486
        ymm = _mm256_loadu_pd(ptr);
1487
    }
1488
1489
    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
1490
                                XMMReg4Double &high)
1491
    {
1492
        low.nsLoad4Val(ptr);
1493
        high.nsLoad4Val(ptr + 4);
1494
    }
1495
1496
    static inline XMMReg4Double Load4ValAligned(const double *ptr)
1497
    {
1498
        XMMReg4Double reg;
1499
        reg.nsLoad4ValAligned(ptr);
1500
        return reg;
1501
    }
1502
1503
    inline void nsLoad4ValAligned(const double *ptr)
1504
    {
1505
        ymm = _mm256_load_pd(ptr);
1506
    }
1507
1508
    static inline XMMReg4Double Load4Val(const float *ptr)
1509
    {
1510
        XMMReg4Double reg;
1511
        reg.nsLoad4Val(ptr);
1512
        return reg;
1513
    }
1514
1515
    inline void nsLoad4Val(const float *ptr)
1516
    {
1517
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
1518
    }
1519
1520
    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
1521
                                XMMReg4Double &high)
1522
    {
1523
        low.nsLoad4Val(ptr);
1524
        high.nsLoad4Val(ptr + 4);
1525
    }
1526
1527
    static inline XMMReg4Double Load4Val(const int *ptr)
1528
    {
1529
        XMMReg4Double reg;
1530
        reg.nsLoad4Val(ptr);
1531
        return reg;
1532
    }
1533
1534
    inline void nsLoad4Val(const int *ptr)
1535
    {
1536
        ymm = _mm256_cvtepi32_pd(
1537
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
1538
    }
1539
1540
    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
1541
                                XMMReg4Double &high)
1542
    {
1543
        low.nsLoad4Val(ptr);
1544
        high.nsLoad4Val(ptr + 4);
1545
    }
1546
1547
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
1548
                                       const XMMReg4Double &expr2)
1549
    {
1550
        XMMReg4Double reg;
1551
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
1552
        return reg;
1553
    }
1554
1555
    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
1556
                                          const XMMReg4Double &expr2)
1557
    {
1558
        XMMReg4Double reg;
1559
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
1560
        return reg;
1561
    }
1562
1563
    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
1564
                                        const XMMReg4Double &expr2)
1565
    {
1566
        XMMReg4Double reg;
1567
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
1568
        return reg;
1569
    }
1570
1571
    static inline XMMReg4Double And(const XMMReg4Double &expr1,
1572
                                    const XMMReg4Double &expr2)
1573
    {
1574
        XMMReg4Double reg;
1575
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
1576
        return reg;
1577
    }
1578
1579
    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
1580
                                        const XMMReg4Double &true_expr,
1581
                                        const XMMReg4Double &false_expr)
1582
    {
1583
        XMMReg4Double reg;
1584
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
1585
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
1586
        return reg;
1587
    }
1588
1589
    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
1590
                                    const XMMReg4Double &expr2)
1591
    {
1592
        XMMReg4Double reg;
1593
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
1594
        return reg;
1595
    }
1596
1597
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
1598
    {
1599
        ymm = other.ymm;
1600
        return *this;
1601
    }
1602
1603
    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
1604
    {
1605
        ymm = _mm256_add_pd(ymm, other.ymm);
1606
        return *this;
1607
    }
1608
1609
    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
1610
    {
1611
        ymm = _mm256_mul_pd(ymm, other.ymm);
1612
        return *this;
1613
    }
1614
1615
    inline XMMReg4Double operator+(const XMMReg4Double &other) const
1616
    {
1617
        XMMReg4Double ret;
1618
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
1619
        return ret;
1620
    }
1621
1622
    inline XMMReg4Double operator-(const XMMReg4Double &other) const
1623
    {
1624
        XMMReg4Double ret;
1625
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
1626
        return ret;
1627
    }
1628
1629
    inline XMMReg4Double operator*(const XMMReg4Double &other) const
1630
    {
1631
        XMMReg4Double ret;
1632
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
1633
        return ret;
1634
    }
1635
1636
    inline XMMReg4Double operator/(const XMMReg4Double &other) const
1637
    {
1638
        XMMReg4Double ret;
1639
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
1640
        return ret;
1641
    }
1642
1643
    void AddToLow(const XMMReg2Double &other)
1644
    {
1645
        __m256d ymm2 = _mm256_setzero_pd();
1646
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
1647
        ymm = _mm256_add_pd(ymm, ymm2);
1648
    }
1649
1650
    inline double GetHorizSum() const
1651
    {
1652
        __m256d ymm_tmp1, ymm_tmp2;
1653
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
1654
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
1655
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
1656
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
1657
    }
1658
1659
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
1660
                                         const XMMReg4Double &half) const
1661
    {
1662
        __m256d reg = ymm;
1663
        __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
1664
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
1665
        reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
1666
        // And perform one step of Newton-Raphson approximation to improve it
1667
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
1668
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
1669
        const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
1670
        reg = _mm256_mul_pd(
1671
            reg,
1672
            _mm256_sub_pd(one_and_a_half,
1673
                          _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
1674
        XMMReg4Double ret;
1675
        ret.ymm = reg;
1676
        return ret;
1677
    }
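    // Editor's note: the update above is the Newton-Raphson step for
    // f(y) = 1/y^2 - x, whose positive root is y = 1/sqrt(x):
    //     y_{n+1} = y_n - f(y_n)/f'(y_n) = y_n * (1.5 - 0.5 * x * y_n * y_n)
    // One step roughly doubles the ~12 bits of relative accuracy that
    // _mm_rsqrt_ps provides.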
1678
1679
    inline XMMReg4Float cast_to_float() const
1680
    {
1681
        XMMReg4Float ret;
1682
        ret.xmm = _mm256_cvtpd_ps(ymm);
1683
        return ret;
1684
    }
1685
1686
    inline void Store4Val(unsigned char *ptr) const
1687
    {
1688
        __m128i xmm_i =
1689
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1690
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
1691
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
1692
        xmm_i =
1693
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
1694
                                                      (12 << 24)));  //  SSSE3
1695
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
1696
    }
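    // Editor's note: the shuffle control 0 | (4 << 8) | (8 << 16) |
    // (12 << 24) gathers the low byte of each of the four int32 lanes into
    // the first four bytes, which GDALCopyXMMToInt32 then stores. Unlike
    // the commented-out packs/packus path this does not saturate, so the
    // values are assumed to already fit in [0, 255] here.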
1697
1698
    inline void Store4Val(unsigned short *ptr) const
1699
    {
1700
        __m128i xmm_i =
1701
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1702
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
1703
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
1704
    }
1705
1706
    inline void Store4Val(float *ptr) const
1707
    {
1708
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
1709
    }
1710
1711
    inline void Store4Val(double *ptr) const
1712
    {
1713
        _mm256_storeu_pd(ptr, ymm);
1714
    }
1715
1716
    inline void StoreMask(unsigned char *ptr) const
1717
    {
1718
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
1719
                            _mm256_castpd_si256(ymm));
1720
    }
1721
};
1722
1723
inline XMMReg4Double XMMReg4Float::cast_to_double() const
1724
{
1725
    XMMReg4Double ret;
1726
    ret.ymm = _mm256_cvtps_pd(xmm);
1727
    return ret;
1728
}
1729
1730
inline XMMReg4Double XMMReg4Int::cast_to_double() const
1731
{
1732
    XMMReg4Double ret;
1733
    ret.ymm = _mm256_cvtepi32_pd(xmm);
1734
    return ret;
1735
}
1736
1737
#else
1738
1739
class XMMReg4Double
1740
{
1741
  public:
1742
    XMMReg2Double low, high;
1743
1744
0
    XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
1745
0
    {
1746
0
    }
1747
1748
0
    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
1749
0
    {
1750
0
    }
1751
1752
    static inline XMMReg4Double Zero()
1753
0
    {
1754
0
        XMMReg4Double reg;
1755
0
        reg.low.Zeroize();
1756
0
        reg.high.Zeroize();
1757
0
        return reg;
1758
0
    }
1759
1760
    static inline XMMReg4Double Set1(double d)
1761
0
    {
1762
0
        XMMReg4Double reg;
1763
0
        reg.low = XMMReg2Double::Set1(d);
1764
0
        reg.high = XMMReg2Double::Set1(d);
1765
0
        return reg;
1766
0
    }
1767
1768
    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
1769
0
    {
1770
0
        XMMReg4Double reg;
1771
0
        reg.low.nsLoad1ValHighAndLow(ptr);
1772
0
        reg.high = reg.low;
1773
0
        return reg;
1774
0
    }
1775
1776
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
1777
0
    {
1778
0
        XMMReg4Double reg;
1779
0
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1780
0
        return reg;
1781
0
    }
1782
1783
    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
1784
                                XMMReg4Double &high)
1785
0
    {
1786
0
        low = Load4Val(ptr);
1787
0
        high = Load4Val(ptr + 4);
1788
0
    }
1789
1790
    static inline XMMReg4Double Load4Val(const short *ptr)
1791
0
    {
1792
0
        XMMReg4Double reg;
1793
0
        reg.low.nsLoad2Val(ptr);
1794
0
        reg.high.nsLoad2Val(ptr + 2);
1795
0
        return reg;
1796
0
    }
1797
1798
    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
1799
                                XMMReg4Double &high)
1800
0
    {
1801
0
        low = Load4Val(ptr);
1802
0
        high = Load4Val(ptr + 4);
1803
0
    }
1804
1805
    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
1806
0
    {
1807
0
        XMMReg4Double reg;
1808
0
        reg.low.nsLoad2Val(ptr);
1809
0
        reg.high.nsLoad2Val(ptr + 2);
1810
0
        return reg;
1811
0
    }
1812
1813
    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
1814
                                XMMReg4Double &high)
1815
0
    {
1816
0
        low = Load4Val(ptr);
1817
0
        high = Load4Val(ptr + 4);
1818
0
    }
1819
1820
    static inline XMMReg4Double Load4Val(const int *ptr)
1821
0
    {
1822
0
        XMMReg4Double reg;
1823
0
        reg.low.nsLoad2Val(ptr);
1824
0
        reg.high.nsLoad2Val(ptr + 2);
1825
0
        return reg;
1826
0
    }
1827
1828
    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
1829
                                XMMReg4Double &high)
1830
0
    {
1831
0
        low = Load4Val(ptr);
1832
0
        high = Load4Val(ptr + 4);
1833
0
    }
1834
1835
    static inline XMMReg4Double Load4Val(const double *ptr)
1836
0
    {
1837
0
        XMMReg4Double reg;
1838
0
        reg.low.nsLoad2Val(ptr);
1839
0
        reg.high.nsLoad2Val(ptr + 2);
1840
0
        return reg;
1841
0
    }
1842
1843
    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
1844
                                XMMReg4Double &high)
1845
0
    {
1846
0
        low = Load4Val(ptr);
1847
0
        high = Load4Val(ptr + 4);
1848
0
    }
1849
1850
    static inline XMMReg4Double Load4ValAligned(const double *ptr)
1851
0
    {
1852
0
        XMMReg4Double reg;
1853
0
        reg.low.nsLoad2ValAligned(ptr);
1854
0
        reg.high.nsLoad2ValAligned(ptr + 2);
1855
0
        return reg;
1856
0
    }
1857
1858
    static inline XMMReg4Double Load4Val(const float *ptr)
1859
0
    {
1860
0
        XMMReg4Double reg;
1861
0
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1862
0
        return reg;
1863
0
    }
1864
1865
    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
1866
                                XMMReg4Double &high)
1867
0
    {
1868
0
        low = Load4Val(ptr);
1869
0
        high = Load4Val(ptr + 4);
1870
0
    }
1871
1872
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
1873
                                       const XMMReg4Double &expr2)
1874
0
    {
1875
0
        XMMReg4Double reg;
1876
0
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1877
0
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1878
0
        return reg;
1879
0
    }
1880
1881
    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
1882
                                          const XMMReg4Double &expr2)
1883
0
    {
1884
0
        XMMReg4Double reg;
1885
0
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1886
0
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1887
0
        return reg;
1888
0
    }
1889
1890
    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
1891
                                        const XMMReg4Double &expr2)
1892
0
    {
1893
0
        XMMReg4Double reg;
1894
0
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1895
0
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1896
0
        return reg;
1897
0
    }
1898
1899
    static inline XMMReg4Double And(const XMMReg4Double &expr1,
1900
                                    const XMMReg4Double &expr2)
1901
0
    {
1902
0
        XMMReg4Double reg;
1903
0
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1904
0
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1905
0
        return reg;
1906
0
    }
1907
1908
    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
1909
                                        const XMMReg4Double &true_expr,
1910
                                        const XMMReg4Double &false_expr)
1911
0
    {
1912
0
        XMMReg4Double reg;
1913
0
        reg.low =
1914
0
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1915
0
        reg.high =
1916
0
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1917
0
        return reg;
1918
0
    }
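
Ternary() delegates to XMMReg2Double::Ternary() on each half. That helper is defined earlier in this header; a typical SSE2 formulation of such a mask select, shown purely as a sketch with an invented name, is:

static inline __m128d GDALTernarySelectSketch(__m128d cond, __m128d true_expr,
                                              __m128d false_expr)
{
    // With cond holding all-ones or all-zeros per lane (the output of the
    // comparison helpers above), this picks true_expr where the mask is set
    // and false_expr elsewhere, without any branch.
    return _mm_or_pd(_mm_and_pd(cond, true_expr),
                     _mm_andnot_pd(cond, false_expr));
}
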
1919
1920
    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
1921
                                    const XMMReg4Double &expr2)
1922
0
    {
1923
0
        XMMReg4Double reg;
1924
0
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1925
0
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1926
0
        return reg;
1927
0
    }
1928
1929
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
1930
0
    {
1931
0
        low = other.low;
1932
0
        high = other.high;
1933
0
        return *this;
1934
0
    }
1935
1936
    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
1937
0
    {
1938
0
        low += other.low;
1939
0
        high += other.high;
1940
0
        return *this;
1941
0
    }
1942
1943
    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
1944
0
    {
1945
0
        low *= other.low;
1946
0
        high *= other.high;
1947
0
        return *this;
1948
0
    }
1949
1950
    inline XMMReg4Double operator+(const XMMReg4Double &other) const
1951
0
    {
1952
0
        XMMReg4Double ret;
1953
0
        ret.low = low + other.low;
1954
0
        ret.high = high + other.high;
1955
0
        return ret;
1956
0
    }
1957
1958
    inline XMMReg4Double operator-(const XMMReg4Double &other) const
1959
0
    {
1960
0
        XMMReg4Double ret;
1961
0
        ret.low = low - other.low;
1962
0
        ret.high = high - other.high;
1963
0
        return ret;
1964
0
    }
1965
1966
    inline XMMReg4Double operator*(const XMMReg4Double &other) const
1967
0
    {
1968
0
        XMMReg4Double ret;
1969
0
        ret.low = low * other.low;
1970
0
        ret.high = high * other.high;
1971
0
        return ret;
1972
0
    }
1973
1974
    inline XMMReg4Double operator/(const XMMReg4Double &other) const
1975
0
    {
1976
0
        XMMReg4Double ret;
1977
0
        ret.low = low / other.low;
1978
0
        ret.high = high / other.high;
1979
0
        return ret;
1980
0
    }
1981
1982
    void AddToLow(const XMMReg2Double &other)
1983
0
    {
1984
0
        low += other;
1985
0
    }
1986
1987
    inline double GetHorizSum() const
1988
0
    {
1989
0
        return (low + high).GetHorizSum();
1990
0
    }
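
GetHorizSum() first adds the two halves lane-wise and then reduces the remaining pair. With the four lanes written as v[0..3] (low = v[0], v[1]; high = v[2], v[3]), the scalar equivalent is, as a sketch:

static inline double GDALHorizSumModelSketch(const double v[4])
{
    // (low + high) gives {v[0] + v[2], v[1] + v[3]}; the final 2-lane
    // reduction adds those two partial sums together.
    return (v[0] + v[2]) + (v[1] + v[3]);
}
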
1991
1992
#if !defined(USE_SSE2_EMULATION)
1993
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
1994
                                         const XMMReg4Double &half) const
1995
0
    {
1996
0
        __m128d reg0 = low.xmm;
1997
0
        __m128d reg1 = high.xmm;
1998
0
        __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
1999
0
        __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
2000
0
        // Compute rough approximation of 1 / sqrt(x) with _mm_rsqrt_ps
2001
0
        reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
2002
0
        reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
2003
0
        // And perform one step of Newton-Raphson approximation to improve it
2004
0
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
2005
0
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
2006
0
        const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
2007
0
        reg0 = _mm_mul_pd(
2008
0
            reg0, _mm_sub_pd(one_and_a_half,
2009
0
                             _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
2010
0
        reg1 = _mm_mul_pd(
2011
0
            reg1, _mm_sub_pd(one_and_a_half,
2012
0
                             _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
2013
0
        XMMReg4Double ret;
2014
0
        ret.low.xmm = reg0;
2015
0
        ret.high.xmm = reg1;
2016
0
        return ret;
2017
0
    }
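
Passing one and half as arguments lets callers build those constant registers once, outside their loops. A minimal usage sketch (not part of this header; len2 is an assumed XMMReg4Double holding squared lengths):

const XMMReg4Double one = XMMReg4Double::Set1(1.0);
const XMMReg4Double half = XMMReg4Double::Set1(0.5);
// One rsqrt estimate plus one Newton-Raphson step gives roughly
// single-precision accuracy for 1/sqrt(len2).
XMMReg4Double inv_len = len2.approx_inv_sqrt(one, half);
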
2018
2019
    inline XMMReg4Float cast_to_float() const
2020
0
    {
2021
0
        XMMReg4Float ret;
2022
0
        ret.xmm = _mm_castsi128_ps(
2023
0
            _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
2024
0
                               _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
2025
0
        return ret;
2026
0
    }
2027
#endif
2028
2029
    inline void Store4Val(unsigned char *ptr) const
2030
0
    {
2031
#ifdef USE_SSE2_EMULATION
2032
        low.Store2Val(ptr);
2033
        high.Store2Val(ptr + 2);
2034
#else
2035
0
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
2036
0
            low.xmm,
2037
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2038
0
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
2039
0
            high.xmm,
2040
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2041
0
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
2042
0
                                                   _mm_castsi128_ps(tmpHigh),
2043
0
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
2044
0
        tmp = _mm_packs_epi32(tmp, tmp);
2045
0
        tmp = _mm_packus_epi16(tmp, tmp);
2046
0
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
2047
0
#endif
2048
0
    }
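
Here _mm_packs_epi32 followed by _mm_packus_epi16 clamps each rounded value to [0, 255], whereas the AVX2 byte path earlier truncates via the byte shuffle. Scalar sketch of the per-element result (illustrative helper only):

static inline unsigned char GDALPackUInt8ModelSketch(double dfVal)
{
    int nVal = static_cast<int>(dfVal + 0.5);  // cvttpd_epi32(x + 0.5)
    if (nVal < 0)
        nVal = 0;  // packs_epi32 + packus_epi16 saturate the result
    else if (nVal > 255)
        nVal = 255;
    return static_cast<unsigned char>(nVal);
}
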
2049
2050
    inline void Store4Val(unsigned short *ptr) const
2051
0
    {
2052
0
#if 1
2053
0
        low.Store2Val(ptr);
2054
0
        high.Store2Val(ptr + 2);
2055
#else
2056
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
2057
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
2058
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
2059
#if __SSE4_1__
2060
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
2061
#else
2062
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
2063
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
2064
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
2065
#endif
2066
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
2067
#endif
2068
0
    }
2069
2070
    inline void Store4Val(float *ptr) const
2071
0
    {
2072
0
        low.Store2Val(ptr);
2073
0
        high.Store2Val(ptr + 2);
2074
0
    }
2075
2076
    inline void Store4Val(double *ptr) const
2077
0
    {
2078
0
        low.Store2Val(ptr);
2079
0
        high.Store2Val(ptr + 2);
2080
0
    }
2081
2082
    inline void StoreMask(unsigned char *ptr) const
2083
0
    {
2084
0
        low.StoreMask(ptr);
2085
0
        high.StoreMask(ptr + 16);
2086
0
    }
2087
};
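
A minimal usage sketch of the fallback class above (not part of this header), accumulating a dot product four doubles at a time; the function and its parameters are invented for illustration and nCount is assumed to be a multiple of 4:

static inline double GDALDotProductSketch(const double *padfX,
                                          const double *padfY, int nCount)
{
    XMMReg4Double sum = XMMReg4Double::Zero();
    for (int i = 0; i < nCount; i += 4)
    {
        sum += XMMReg4Double::Load4Val(padfX + i) *
               XMMReg4Double::Load4Val(padfY + i);
    }
    return sum.GetHorizSum();  // reduce the four partial sums
}
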
2088
2089
#if !defined(USE_SSE2_EMULATION)
2090
inline XMMReg4Double XMMReg4Float::cast_to_double() const
2091
0
{
2092
0
    XMMReg4Double ret;
2093
0
    ret.low.xmm = _mm_cvtps_pd(xmm);
2094
0
    ret.high.xmm = _mm_cvtps_pd(
2095
0
        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
2096
0
    return ret;
2097
0
}
2098
2099
inline XMMReg4Double XMMReg4Int::cast_to_double() const
2100
0
{
2101
0
    XMMReg4Double ret;
2102
0
    ret.low.xmm = _mm_cvtepi32_pd(xmm);
2103
0
    ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
2104
0
    return ret;
2105
0
}
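
_mm_srli_si128(xmm, 8) shifts the whole 128-bit register right by eight bytes, so int lanes 2 and 3 land where _mm_cvtepi32_pd reads its two inputs; both cast_to_double() overloads therefore just widen the four 32-bit lanes to doubles. Scalar model of the int variant, as a sketch with an invented name:

static inline void GDALInt32ToDoubleModelSketch(const int anIn[4],
                                                double adfOut[4])
{
    for (int i = 0; i < 4; ++i)
        adfOut[i] = static_cast<double>(anIn[i]);
}
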
2106
#endif
2107
2108
#endif
2109
2110
#endif /* #ifndef DOXYGEN_SKIP */
2111
2112
#endif /* GDALSSE_PRIV_H_INCLUDED */