Coverage Report

Created: 2025-06-13 06:29

/src/gdal/gcore/gdalsse_priv.h
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Project:  GDAL
4
 * Purpose:  SSE2 helper
5
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#ifndef GDALSSE_PRIV_H_INCLUDED
14
#define GDALSSE_PRIV_H_INCLUDED
15
16
#ifndef DOXYGEN_SKIP
17
18
#include "cpl_port.h"
19
20
/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
21
/* Could possibly be used too on 32bit, but we would need to check at runtime */
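A note on the runtime check mentioned above: on 32-bit x86 these code paths could only be enabled after confirming SSE2 support at runtime. A minimal sketch of such a probe, assuming a GCC/Clang toolchain targeting x86 (this helper is illustrative only and is not part of gdalsse_priv.h):

/* Hypothetical runtime probe; gdalsse_priv.h instead relies on the compile-time
 * guarantee that every x86-64 CPU implements SSE2. */
static bool GDALHasSSE2AtRuntime()
{
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
    return __builtin_cpu_supports("sse2") != 0;
#else
    return false; /* conservative fallback for other compilers/architectures */
#endif
}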
22
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) &&             \
23
    !defined(USE_SSE2_EMULATION)
24
25
#include <string.h>
26
27
#ifdef USE_NEON_OPTIMIZATIONS
28
#include "include_sse2neon.h"
29
#else
30
/* Requires SSE2 */
31
#include <emmintrin.h>
32
33
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
34
#include <smmintrin.h>
35
#endif
36
#endif
37
38
#include "gdal_priv_templates.hpp"
39
40
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
41
0
{
42
0
    unsigned short s;
43
0
    memcpy(&s, ptr, 2);
44
0
    return _mm_cvtsi32_si128(s);
45
0
}
Unexecuted instantiation: overview.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyInt16ToXMM(void const*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyInt16ToXMM(void const*)
46
47
static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
48
0
{
49
0
    GInt32 i;
50
0
    memcpy(&i, ptr, 4);
51
0
    return _mm_cvtsi32_si128(i);
52
0
}
Unexecuted instantiation: overview.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyInt32ToXMM(void const*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyInt32ToXMM(void const*)
53
54
static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
55
0
{
56
#if defined(__i386__) || defined(_M_IX86)
57
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
58
#else
59
0
    GInt64 i;
60
0
    memcpy(&i, ptr, 8);
61
0
    return _mm_cvtsi64_si128(i);
62
0
#endif
63
0
}
Unexecuted instantiation: overview.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyInt64ToXMM(void const*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyInt64ToXMM(void const*)
64
65
static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
66
0
{
67
0
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
68
0
    memcpy(pDest, &i, 2);
69
0
}
Unexecuted instantiation: overview.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: gdalwarpkernel.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: pixelfunctions.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: gdal_rpc.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
Unexecuted instantiation: gdalpansharpen.cpp:GDALCopyXMMToInt16(long long __vector(2), void*)
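The helpers above exist so that narrow values can be moved between memory and XMM registers through memcpy (or a 64-bit vector load on 32-bit x86), which sidesteps the alignment and strict-aliasing problems of a direct pointer cast. A small round-trip usage sketch, illustrative only and assuming the helpers above are in scope:

/* Hypothetical standalone check of the 16-bit copy helpers (not part of the header). */
#include <assert.h>

static void TestCopyInt16RoundTrip()
{
    unsigned short in = 0x1234;
    unsigned short out = 0;
    const __m128i reg = GDALCopyInt16ToXMM(&in);  // low 16 bits of the register now hold 0x1234
    GDALCopyXMMToInt16(reg, &out);                // extract them back into memory
    assert(in == out);
}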
70
71
class XMMReg4Int;
72
73
class XMMReg4Double;
74
75
class XMMReg4Float
76
{
77
  public:
78
    __m128 xmm;
79
80
    XMMReg4Float()
81
#if !defined(_MSC_VER)
82
0
        : xmm(_mm_undefined_ps())
83
#endif
84
0
    {
85
0
    }
86
87
0
    XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
88
0
    {
89
0
    }
90
91
    static inline XMMReg4Float Zero()
92
0
    {
93
0
        XMMReg4Float reg;
94
0
        reg.Zeroize();
95
0
        return reg;
96
0
    }
97
98
    static inline XMMReg4Float Set1(float f)
99
0
    {
100
0
        XMMReg4Float reg;
101
0
        reg.xmm = _mm_set1_ps(f);
102
0
        return reg;
103
0
    }
104
105
    static inline XMMReg4Float Load4Val(const float *ptr)
106
0
    {
107
0
        XMMReg4Float reg;
108
0
        reg.nsLoad4Val(ptr);
109
0
        return reg;
110
0
    }
111
112
    static inline XMMReg4Float Load4Val(const unsigned char *ptr)
113
0
    {
114
0
        XMMReg4Float reg;
115
0
        reg.nsLoad4Val(ptr);
116
0
        return reg;
117
0
    }
118
119
    static inline XMMReg4Float Load4Val(const short *ptr)
120
0
    {
121
0
        XMMReg4Float reg;
122
0
        reg.nsLoad4Val(ptr);
123
0
        return reg;
124
0
    }
125
126
    static inline XMMReg4Float Load4Val(const unsigned short *ptr)
127
0
    {
128
0
        XMMReg4Float reg;
129
0
        reg.nsLoad4Val(ptr);
130
0
        return reg;
131
0
    }
132
133
    static inline XMMReg4Float Load4Val(const int *ptr)
134
0
    {
135
0
        XMMReg4Float reg;
136
0
        reg.nsLoad4Val(ptr);
137
0
        return reg;
138
0
    }
139
140
    static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
141
                                      const XMMReg4Float &expr2)
142
0
    {
143
0
        XMMReg4Float reg;
144
0
        reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
145
0
        return reg;
146
0
    }
147
148
    static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
149
                                         const XMMReg4Float &expr2)
150
0
    {
151
0
        XMMReg4Float reg;
152
0
        reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
153
0
        return reg;
154
0
    }
155
156
    static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
157
                                      const XMMReg4Float &expr2)
158
0
    {
159
0
        XMMReg4Float reg;
160
0
        reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
161
0
        return reg;
162
0
    }
163
164
    static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
165
                                       const XMMReg4Float &expr2)
166
0
    {
167
0
        XMMReg4Float reg;
168
0
        reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
169
0
        return reg;
170
0
    }
171
172
    static inline XMMReg4Float And(const XMMReg4Float &expr1,
173
                                   const XMMReg4Float &expr2)
174
0
    {
175
0
        XMMReg4Float reg;
176
0
        reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
177
0
        return reg;
178
0
    }
179
180
    static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
181
                                       const XMMReg4Float &true_expr,
182
                                       const XMMReg4Float &false_expr)
183
0
    {
184
0
        XMMReg4Float reg;
185
0
        reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
186
0
                            _mm_andnot_ps(cond.xmm, false_expr.xmm));
187
0
        return reg;
188
0
    }
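Ternary() above is the classic branchless select: the comparison intrinsics produce all-ones or all-zero lanes, so (cond AND true) OR (NOT cond AND false) picks the right operand per lane. A scalar model of one 32-bit lane, illustrative only:

/* Per-lane model of the mask-select used by the Ternary() helpers.
 * cond is assumed to be 0xFFFFFFFF (condition held) or 0 (it did not). */
static inline unsigned SelectLaneBits(unsigned cond, unsigned true_bits, unsigned false_bits)
{
    return (cond & true_bits) | (~cond & false_bits);
}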
189
190
    static inline XMMReg4Float Min(const XMMReg4Float &expr1,
191
                                   const XMMReg4Float &expr2)
192
0
    {
193
0
        XMMReg4Float reg;
194
0
        reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
195
0
        return reg;
196
0
    }
197
198
    static inline XMMReg4Float Max(const XMMReg4Float &expr1,
199
                                   const XMMReg4Float &expr2)
200
0
    {
201
0
        XMMReg4Float reg;
202
0
        reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
203
0
        return reg;
204
0
    }
205
206
    inline void nsLoad4Val(const float *ptr)
207
0
    {
208
0
        xmm = _mm_loadu_ps(ptr);
209
0
    }
210
211
    static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
212
                                 XMMReg4Float &r1, XMMReg4Float &r2,
213
                                 XMMReg4Float &r3)
214
0
    {
215
0
        r0.nsLoad4Val(ptr);
216
0
        r1.nsLoad4Val(ptr + 4);
217
0
        r2.nsLoad4Val(ptr + 8);
218
0
        r3.nsLoad4Val(ptr + 12);
219
0
    }
220
221
    inline void nsLoad4Val(const int *ptr)
222
0
    {
223
0
        const __m128i xmm_i =
224
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
225
0
        xmm = _mm_cvtepi32_ps(xmm_i);
226
0
    }
227
228
    static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
229
                                 XMMReg4Float &r1, XMMReg4Float &r2,
230
                                 XMMReg4Float &r3)
231
0
    {
232
0
        r0.nsLoad4Val(ptr);
233
0
        r1.nsLoad4Val(ptr + 4);
234
0
        r2.nsLoad4Val(ptr + 8);
235
0
        r3.nsLoad4Val(ptr + 12);
236
0
    }
237
238
    static inline __m128i cvtepu8_epi32(__m128i x)
239
0
    {
240
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
241
        return _mm_cvtepu8_epi32(x);
242
#else
243
0
        return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
244
0
                                  _mm_setzero_si128());
245
0
#endif
246
0
    }
247
248
    inline void nsLoad4Val(const unsigned char *ptr)
249
0
    {
250
0
        const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
251
0
        xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
252
0
    }
253
254
    static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
255
                                XMMReg4Float &r1)
256
0
    {
257
0
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
258
0
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
259
0
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
260
0
    }
261
262
    static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
263
                                 XMMReg4Float &r1, XMMReg4Float &r2,
264
                                 XMMReg4Float &r3)
265
0
    {
266
0
        const __m128i xmm_i =
267
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
268
0
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
269
0
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
270
0
        r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
271
0
        r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
272
0
    }
273
274
    static inline __m128i cvtepi16_epi32(__m128i x)
275
0
    {
276
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
277
        return _mm_cvtepi16_epi32(x);
278
#else
279
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
280
0
        return _mm_srai_epi32(
281
            /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
282
0
            _mm_unpacklo_epi16(x, x), 16);
283
0
#endif
284
0
    }
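The SSE2 fallback above sign-extends each 16-bit lane by duplicating it into the upper half of a 32-bit lane (unpack with itself) and then arithmetically shifting right by 16, as the bit-pattern comments describe. A scalar model of the same trick, illustrative only:

/* Scalar model of the unpack + arithmetic-shift sign extension in cvtepi16_epi32().
 * Relies on arithmetic right shift of signed values, which is what _mm_srai_epi32 does. */
static inline int SignExtend16To32(unsigned short lane)
{
    const unsigned doubled = (static_cast<unsigned>(lane) << 16) | lane;  // the b|b pattern
    return static_cast<int>(doubled) >> 16;  // the shift brings the value back, sign-extended
}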
285
286
    inline void nsLoad4Val(const short *ptr)
287
0
    {
288
0
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
289
0
        xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
290
0
    }
291
292
    static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
293
                                XMMReg4Float &r1)
294
0
    {
295
0
        const __m128i xmm_i =
296
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
297
0
        r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
298
0
        r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
299
0
    }
300
301
    static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
302
                                 XMMReg4Float &r1, XMMReg4Float &r2,
303
                                 XMMReg4Float &r3)
304
0
    {
305
0
        Load8Val(ptr, r0, r1);
306
0
        Load8Val(ptr + 8, r2, r3);
307
0
    }
308
309
    static inline __m128i cvtepu16_epi32(__m128i x)
310
0
    {
311
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
312
        return _mm_cvtepu16_epi32(x);
313
#else
314
0
        return _mm_unpacklo_epi16(
315
0
            x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
316
0
#endif
317
0
    }
318
319
    inline void nsLoad4Val(const unsigned short *ptr)
320
0
    {
321
0
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
322
0
        xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
323
0
    }
324
325
    static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
326
                                XMMReg4Float &r1)
327
0
    {
328
0
        const __m128i xmm_i =
329
0
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
330
0
        r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
331
0
        r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
332
0
    }
333
334
    static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
335
                                 XMMReg4Float &r1, XMMReg4Float &r2,
336
                                 XMMReg4Float &r3)
337
0
    {
338
0
        Load8Val(ptr, r0, r1);
339
0
        Load8Val(ptr + 8, r2, r3);
340
0
    }
341
342
    inline void Zeroize()
343
0
    {
344
0
        xmm = _mm_setzero_ps();
345
0
    }
346
347
    inline XMMReg4Float &operator=(const XMMReg4Float &other)
348
0
    {
349
0
        xmm = other.xmm;
350
0
        return *this;
351
0
    }
352
353
    inline XMMReg4Float &operator+=(const XMMReg4Float &other)
354
0
    {
355
0
        xmm = _mm_add_ps(xmm, other.xmm);
356
0
        return *this;
357
0
    }
358
359
    inline XMMReg4Float &operator-=(const XMMReg4Float &other)
360
0
    {
361
0
        xmm = _mm_sub_ps(xmm, other.xmm);
362
0
        return *this;
363
0
    }
364
365
    inline XMMReg4Float &operator*=(const XMMReg4Float &other)
366
0
    {
367
0
        xmm = _mm_mul_ps(xmm, other.xmm);
368
0
        return *this;
369
0
    }
370
371
    inline XMMReg4Float operator+(const XMMReg4Float &other) const
372
0
    {
373
0
        XMMReg4Float ret;
374
0
        ret.xmm = _mm_add_ps(xmm, other.xmm);
375
0
        return ret;
376
0
    }
377
378
    inline XMMReg4Float operator-(const XMMReg4Float &other) const
379
0
    {
380
0
        XMMReg4Float ret;
381
0
        ret.xmm = _mm_sub_ps(xmm, other.xmm);
382
0
        return ret;
383
0
    }
384
385
    inline XMMReg4Float operator*(const XMMReg4Float &other) const
386
0
    {
387
0
        XMMReg4Float ret;
388
0
        ret.xmm = _mm_mul_ps(xmm, other.xmm);
389
0
        return ret;
390
0
    }
391
392
    inline XMMReg4Float operator/(const XMMReg4Float &other) const
393
0
    {
394
0
        XMMReg4Float ret;
395
0
        ret.xmm = _mm_div_ps(xmm, other.xmm);
396
0
        return ret;
397
0
    }
398
399
    inline XMMReg4Float inverse() const
400
0
    {
401
0
        XMMReg4Float ret;
402
0
        ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
403
0
        return ret;
404
0
    }
405
406
    inline XMMReg4Int truncate_to_int() const;
407
408
    inline XMMReg4Double cast_to_double() const;
409
410
    inline void Store4Val(float *ptr) const
411
0
    {
412
0
        _mm_storeu_ps(ptr, xmm);
413
0
    }
414
415
    inline void Store4ValAligned(float *ptr) const
416
0
    {
417
0
        _mm_store_ps(ptr, xmm);
418
0
    }
419
420
    inline operator float() const
421
0
    {
422
0
        return _mm_cvtss_f32(xmm);
423
0
    }
424
};
425
426
class XMMReg4Int
427
{
428
  public:
429
    __m128i xmm;
430
431
    XMMReg4Int()
432
#if !defined(_MSC_VER)
433
        : xmm(_mm_undefined_si128())
434
#endif
435
0
    {
436
0
    }
437
438
    XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
439
0
    {
440
0
    }
441
442
    static inline XMMReg4Int Zero()
443
0
    {
444
0
        XMMReg4Int reg;
445
0
        reg.xmm = _mm_setzero_si128();
446
0
        return reg;
447
0
    }
448
449
    static inline XMMReg4Int Set1(int i)
450
0
    {
451
0
        XMMReg4Int reg;
452
0
        reg.xmm = _mm_set1_epi32(i);
453
0
        return reg;
454
0
    }
455
456
    static inline XMMReg4Int Load4Val(const int *ptr)
457
0
    {
458
0
        XMMReg4Int reg;
459
0
        reg.nsLoad4Val(ptr);
460
0
        return reg;
461
0
    }
462
463
    inline void nsLoad4Val(const int *ptr)
464
0
    {
465
0
        xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
466
0
    }
467
468
    static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
469
                                    const XMMReg4Int &expr2)
470
0
    {
471
0
        XMMReg4Int reg;
472
0
        reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
473
0
        return reg;
474
0
    }
475
476
    static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
477
                                     const XMMReg4Int &true_expr,
478
                                     const XMMReg4Int &false_expr)
479
0
    {
480
0
        XMMReg4Int reg;
481
0
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
482
0
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
483
0
        return reg;
484
0
    }
485
486
    inline XMMReg4Int &operator+=(const XMMReg4Int &other)
487
0
    {
488
0
        xmm = _mm_add_epi32(xmm, other.xmm);
489
0
        return *this;
490
0
    }
491
492
    inline XMMReg4Int &operator-=(const XMMReg4Int &other)
493
0
    {
494
0
        xmm = _mm_sub_epi32(xmm, other.xmm);
495
0
        return *this;
496
0
    }
497
498
    inline XMMReg4Int operator+(const XMMReg4Int &other) const
499
0
    {
500
0
        XMMReg4Int ret;
501
0
        ret.xmm = _mm_add_epi32(xmm, other.xmm);
502
0
        return ret;
503
0
    }
504
505
    inline XMMReg4Int operator-(const XMMReg4Int &other) const
506
0
    {
507
0
        XMMReg4Int ret;
508
0
        ret.xmm = _mm_sub_epi32(xmm, other.xmm);
509
0
        return ret;
510
0
    }
511
512
    XMMReg4Double cast_to_double() const;
513
514
    XMMReg4Float to_float() const
515
0
    {
516
0
        XMMReg4Float ret;
517
0
        ret.xmm = _mm_cvtepi32_ps(xmm);
518
0
        return ret;
519
0
    }
520
};
521
522
inline XMMReg4Int XMMReg4Float::truncate_to_int() const
523
0
{
524
0
    XMMReg4Int ret;
525
0
    ret.xmm = _mm_cvttps_epi32(xmm);
526
0
    return ret;
527
0
}
528
529
class XMMReg8Byte
530
{
531
  public:
532
    __m128i xmm;
533
534
    XMMReg8Byte()
535
#if !defined(_MSC_VER)
536
        : xmm(_mm_undefined_si128())
537
#endif
538
0
    {
539
0
    }
540
541
    XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
542
0
    {
543
0
    }
544
545
    static inline XMMReg8Byte Zero()
546
0
    {
547
0
        XMMReg8Byte reg;
548
0
        reg.xmm = _mm_setzero_si128();
549
0
        return reg;
550
0
    }
551
552
    static inline XMMReg8Byte Set1(char i)
553
0
    {
554
0
        XMMReg8Byte reg;
555
0
        reg.xmm = _mm_set1_epi8(i);
556
0
        return reg;
557
0
    }
558
559
    static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
560
                                     const XMMReg8Byte &expr2)
561
0
    {
562
0
        XMMReg8Byte reg;
563
0
        reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
564
0
        return reg;
565
0
    }
566
567
    static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
568
                                 const XMMReg8Byte &expr2)
569
0
    {
570
0
        XMMReg8Byte reg;
571
0
        reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
572
0
        return reg;
573
0
    }
574
575
    static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
576
                                      const XMMReg8Byte &true_expr,
577
                                      const XMMReg8Byte &false_expr)
578
0
    {
579
0
        XMMReg8Byte reg;
580
0
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
581
0
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
582
0
        return reg;
583
0
    }
584
585
    inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
586
0
    {
587
0
        XMMReg8Byte ret;
588
0
        ret.xmm = _mm_add_epi8(xmm, other.xmm);
589
0
        return ret;
590
0
    }
591
592
    inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
593
0
    {
594
0
        XMMReg8Byte ret;
595
0
        ret.xmm = _mm_sub_epi8(xmm, other.xmm);
596
0
        return ret;
597
0
    }
598
599
    static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
600
0
    {
601
0
        XMMReg8Byte reg;
602
0
        reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
603
0
        reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
604
0
        return reg;
605
0
    }
606
607
    inline void Store8Val(unsigned char *ptr) const
608
0
    {
609
0
        GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
610
0
    }
611
};
612
613
class XMMReg2Double
614
{
615
  public:
616
    __m128d xmm;
617
618
    XMMReg2Double()
619
#if !defined(_MSC_VER)
620
0
        : xmm(_mm_undefined_pd())
621
#endif
622
0
    {
623
0
    }
624
625
    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
626
0
    {
627
0
    }
628
629
0
    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
630
0
    {
631
0
    }
632
633
    static inline XMMReg2Double Set1(double d)
634
0
    {
635
0
        XMMReg2Double reg;
636
0
        reg.xmm = _mm_set1_pd(d);
637
0
        return reg;
638
0
    }
639
640
    static inline XMMReg2Double Zero()
641
0
    {
642
0
        XMMReg2Double reg;
643
0
        reg.Zeroize();
644
0
        return reg;
645
0
    }
646
647
    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
648
0
    {
649
0
        XMMReg2Double reg;
650
0
        reg.nsLoad1ValHighAndLow(ptr);
651
0
        return reg;
652
0
    }
653
654
    static inline XMMReg2Double Load2Val(const double *ptr)
655
0
    {
656
0
        XMMReg2Double reg;
657
0
        reg.nsLoad2Val(ptr);
658
0
        return reg;
659
0
    }
660
661
    static inline XMMReg2Double Load2Val(const float *ptr)
662
0
    {
663
0
        XMMReg2Double reg;
664
0
        reg.nsLoad2Val(ptr);
665
0
        return reg;
666
0
    }
667
668
    static inline XMMReg2Double Load2ValAligned(const double *ptr)
669
0
    {
670
0
        XMMReg2Double reg;
671
0
        reg.nsLoad2ValAligned(ptr);
672
0
        return reg;
673
0
    }
674
675
    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
676
0
    {
677
0
        XMMReg2Double reg;
678
0
        reg.nsLoad2Val(ptr);
679
0
        return reg;
680
0
    }
681
682
    static inline XMMReg2Double Load2Val(const short *ptr)
683
0
    {
684
0
        XMMReg2Double reg;
685
0
        reg.nsLoad2Val(ptr);
686
0
        return reg;
687
0
    }
688
689
    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
690
0
    {
691
0
        XMMReg2Double reg;
692
0
        reg.nsLoad2Val(ptr);
693
0
        return reg;
694
0
    }
695
696
    static inline XMMReg2Double Load2Val(const int *ptr)
697
0
    {
698
0
        XMMReg2Double reg;
699
0
        reg.nsLoad2Val(ptr);
700
0
        return reg;
701
0
    }
702
703
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
704
                                       const XMMReg2Double &expr2)
705
0
    {
706
0
        XMMReg2Double reg;
707
0
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
708
0
        return reg;
709
0
    }
710
711
    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
712
                                          const XMMReg2Double &expr2)
713
0
    {
714
0
        XMMReg2Double reg;
715
0
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
716
0
        return reg;
717
0
    }
718
719
    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
720
                                        const XMMReg2Double &expr2)
721
0
    {
722
0
        XMMReg2Double reg;
723
0
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
724
0
        return reg;
725
0
    }
726
727
    static inline XMMReg2Double And(const XMMReg2Double &expr1,
728
                                    const XMMReg2Double &expr2)
729
0
    {
730
0
        XMMReg2Double reg;
731
0
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
732
0
        return reg;
733
0
    }
734
735
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
736
                                        const XMMReg2Double &true_expr,
737
                                        const XMMReg2Double &false_expr)
738
0
    {
739
0
        XMMReg2Double reg;
740
0
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
741
0
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
742
0
        return reg;
743
0
    }
744
745
    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
746
                                    const XMMReg2Double &expr2)
747
0
    {
748
0
        XMMReg2Double reg;
749
0
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
750
0
        return reg;
751
0
    }
752
753
    inline void nsLoad1ValHighAndLow(const double *ptr)
754
0
    {
755
0
        xmm = _mm_load1_pd(ptr);
756
0
    }
757
758
    inline void nsLoad2Val(const double *ptr)
759
0
    {
760
0
        xmm = _mm_loadu_pd(ptr);
761
0
    }
762
763
    inline void nsLoad2ValAligned(const double *ptr)
764
0
    {
765
0
        xmm = _mm_load_pd(ptr);
766
0
    }
767
768
    inline void nsLoad2Val(const float *ptr)
769
0
    {
770
0
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
771
0
    }
772
773
    inline void nsLoad2Val(const int *ptr)
774
0
    {
775
0
        xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
776
0
    }
777
778
    inline void nsLoad2Val(const unsigned char *ptr)
779
0
    {
780
0
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
781
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
782
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
783
#else
784
0
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
785
0
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
786
0
#endif
787
0
        xmm = _mm_cvtepi32_pd(xmm_i);
788
0
    }
789
790
    inline void nsLoad2Val(const short *ptr)
791
0
    {
792
0
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
793
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
794
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
795
#else
796
0
        xmm_i = _mm_unpacklo_epi16(
797
0
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
798
0
        xmm_i = _mm_srai_epi32(
799
0
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
800
0
#endif
801
0
        xmm = _mm_cvtepi32_pd(xmm_i);
802
0
    }
803
804
    inline void nsLoad2Val(const unsigned short *ptr)
805
0
    {
806
0
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
807
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
808
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
809
#else
810
0
        xmm_i = _mm_unpacklo_epi16(
811
0
            xmm_i,
812
0
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
813
0
#endif
814
0
        xmm = _mm_cvtepi32_pd(xmm_i);
815
0
    }
816
817
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
818
                                XMMReg2Double &high)
819
0
    {
820
0
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
821
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
822
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
823
#else
824
0
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
825
0
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
826
0
#endif
827
0
        low.xmm = _mm_cvtepi32_pd(xmm_i);
828
0
        high.xmm =
829
0
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
830
0
    }
831
832
    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
833
                                XMMReg2Double &high)
834
0
    {
835
0
        low.nsLoad2Val(ptr);
836
0
        high.nsLoad2Val(ptr + 2);
837
0
    }
838
839
    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
840
                                XMMReg2Double &high)
841
0
    {
842
0
        low.nsLoad2Val(ptr);
843
0
        high.nsLoad2Val(ptr + 2);
844
0
    }
845
846
    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
847
                                XMMReg2Double &high)
848
0
    {
849
0
        low.nsLoad2Val(ptr);
850
0
        high.nsLoad2Val(ptr + 2);
851
0
    }
852
853
    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
854
                                XMMReg2Double &high)
855
0
    {
856
0
        __m128 temp1 = _mm_loadu_ps(ptr);
857
0
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
858
0
        low.xmm = _mm_cvtps_pd(temp1);
859
0
        high.xmm = _mm_cvtps_pd(temp2);
860
0
    }
861
862
    inline void Zeroize()
863
0
    {
864
0
        xmm = _mm_setzero_pd();
865
0
    }
866
867
    inline XMMReg2Double &operator=(const XMMReg2Double &other)
868
0
    {
869
0
        xmm = other.xmm;
870
0
        return *this;
871
0
    }
872
873
    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
874
0
    {
875
0
        xmm = _mm_add_pd(xmm, other.xmm);
876
0
        return *this;
877
0
    }
878
879
    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
880
0
    {
881
0
        xmm = _mm_mul_pd(xmm, other.xmm);
882
0
        return *this;
883
0
    }
884
885
    inline XMMReg2Double operator+(const XMMReg2Double &other) const
886
0
    {
887
0
        XMMReg2Double ret;
888
0
        ret.xmm = _mm_add_pd(xmm, other.xmm);
889
0
        return ret;
890
0
    }
891
892
    inline XMMReg2Double operator-(const XMMReg2Double &other) const
893
0
    {
894
0
        XMMReg2Double ret;
895
0
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
896
0
        return ret;
897
0
    }
898
899
    inline XMMReg2Double operator*(const XMMReg2Double &other) const
900
0
    {
901
0
        XMMReg2Double ret;
902
0
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
903
0
        return ret;
904
0
    }
905
906
    inline XMMReg2Double operator/(const XMMReg2Double &other) const
907
0
    {
908
0
        XMMReg2Double ret;
909
0
        ret.xmm = _mm_div_pd(xmm, other.xmm);
910
0
        return ret;
911
0
    }
912
913
    inline double GetHorizSum() const
914
0
    {
915
0
        __m128d xmm2;
916
0
        xmm2 = _mm_shuffle_pd(
917
0
            xmm, xmm,
918
0
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
919
0
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
920
0
    }
921
922
    inline void Store2Val(double *ptr) const
923
0
    {
924
0
        _mm_storeu_pd(ptr, xmm);
925
0
    }
926
927
    inline void Store2ValAligned(double *ptr) const
928
0
    {
929
0
        _mm_store_pd(ptr, xmm);
930
0
    }
931
932
    inline void Store2Val(float *ptr) const
933
0
    {
934
0
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
935
0
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
936
0
    }
937
938
    inline void Store2Val(unsigned char *ptr) const
939
0
    {
940
0
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
941
0
            xmm,
942
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
943
0
        tmp = _mm_packs_epi32(tmp, tmp);
944
0
        tmp = _mm_packus_epi16(tmp, tmp);
945
0
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
946
0
    }
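Store2Val(unsigned char*) above rounds by adding 0.5 before the truncating conversion, then relies on the signed and unsigned pack instructions to clamp each value into 0..255. For inputs within int32 range, each lane behaves like this scalar model (illustrative only):

/* Scalar model of the rounding and clamping performed by Store2Val(unsigned char*). */
static inline unsigned char RoundClampToByte(double d)
{
    int v = static_cast<int>(d + 0.5);  // matches _mm_cvttpd_epi32 after the +0.5 bias
    if (v < 0) v = 0;                   // _mm_packs_epi32 / _mm_packus_epi16 clamp the low end
    if (v > 255) v = 255;               // and the high end
    return static_cast<unsigned char>(v);
}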
947
948
    inline void Store2Val(unsigned short *ptr) const
949
0
    {
950
0
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
951
0
            xmm,
952
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
953
        // X X X X 0 B 0 A --> X X X X A A B A
954
0
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
955
0
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
956
0
    }
957
958
    inline void StoreMask(unsigned char *ptr) const
959
0
    {
960
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
961
0
                         _mm_castpd_si128(xmm));
962
0
    }
963
964
    inline operator double() const
965
0
    {
966
0
        return _mm_cvtsd_f64(xmm);
967
0
    }
968
};
969
970
#else
971
972
#ifndef NO_WARN_USE_SSE2_EMULATION
973
#warning "Software emulation of SSE2 !"
974
#endif
975
976
class XMMReg2Double
977
{
978
  public:
979
    double low;
980
    double high;
981
982
    // cppcheck-suppress uninitMemberVar
983
    XMMReg2Double() = default;
984
985
    explicit XMMReg2Double(double val)
986
    {
987
        low = val;
988
        high = 0.0;
989
    }
990
991
    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
992
    {
993
    }
994
995
    static inline XMMReg2Double Zero()
996
    {
997
        XMMReg2Double reg;
998
        reg.Zeroize();
999
        return reg;
1000
    }
1001
1002
    static inline XMMReg2Double Set1(double d)
1003
    {
1004
        XMMReg2Double reg;
1005
        reg.low = d;
1006
        reg.high = d;
1007
        return reg;
1008
    }
1009
1010
    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
1011
    {
1012
        XMMReg2Double reg;
1013
        reg.nsLoad1ValHighAndLow(ptr);
1014
        return reg;
1015
    }
1016
1017
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
1018
                                       const XMMReg2Double &expr2)
1019
    {
1020
        XMMReg2Double reg;
1021
1022
        if (expr1.low == expr2.low)
1023
            memset(&(reg.low), 0xFF, sizeof(double));
1024
        else
1025
            reg.low = 0;
1026
1027
        if (expr1.high == expr2.high)
1028
            memset(&(reg.high), 0xFF, sizeof(double));
1029
        else
1030
            reg.high = 0;
1031
1032
        return reg;
1033
    }
1034
1035
    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
1036
                                          const XMMReg2Double &expr2)
1037
    {
1038
        XMMReg2Double reg;
1039
1040
        if (expr1.low != expr2.low)
1041
            memset(&(reg.low), 0xFF, sizeof(double));
1042
        else
1043
            reg.low = 0;
1044
1045
        if (expr1.high != expr2.high)
1046
            memset(&(reg.high), 0xFF, sizeof(double));
1047
        else
1048
            reg.high = 0;
1049
1050
        return reg;
1051
    }
1052
1053
    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
1054
                                        const XMMReg2Double &expr2)
1055
    {
1056
        XMMReg2Double reg;
1057
1058
        if (expr1.low > expr2.low)
1059
            memset(&(reg.low), 0xFF, sizeof(double));
1060
        else
1061
            reg.low = 0;
1062
1063
        if (expr1.high > expr2.high)
1064
            memset(&(reg.high), 0xFF, sizeof(double));
1065
        else
1066
            reg.high = 0;
1067
1068
        return reg;
1069
    }
1070
1071
    static inline XMMReg2Double And(const XMMReg2Double &expr1,
1072
                                    const XMMReg2Double &expr2)
1073
    {
1074
        XMMReg2Double reg;
1075
        int low1[2], high1[2];
1076
        int low2[2], high2[2];
1077
        memcpy(low1, &expr1.low, sizeof(double));
1078
        memcpy(high1, &expr1.high, sizeof(double));
1079
        memcpy(low2, &expr2.low, sizeof(double));
1080
        memcpy(high2, &expr2.high, sizeof(double));
1081
        low1[0] &= low2[0];
1082
        low1[1] &= low2[1];
1083
        high1[0] &= high2[0];
1084
        high1[1] &= high2[1];
1085
        memcpy(&reg.low, low1, sizeof(double));
1086
        memcpy(&reg.high, high1, sizeof(double));
1087
        return reg;
1088
    }
1089
1090
    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
1091
                                        const XMMReg2Double &true_expr,
1092
                                        const XMMReg2Double &false_expr)
1093
    {
1094
        XMMReg2Double reg;
1095
        if (cond.low != 0)
1096
            reg.low = true_expr.low;
1097
        else
1098
            reg.low = false_expr.low;
1099
        if (cond.high != 0)
1100
            reg.high = true_expr.high;
1101
        else
1102
            reg.high = false_expr.high;
1103
        return reg;
1104
    }
1105
1106
    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
1107
                                    const XMMReg2Double &expr2)
1108
    {
1109
        XMMReg2Double reg;
1110
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
1111
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
1112
        return reg;
1113
    }
1114
1115
    static inline XMMReg2Double Load2Val(const double *ptr)
1116
    {
1117
        XMMReg2Double reg;
1118
        reg.nsLoad2Val(ptr);
1119
        return reg;
1120
    }
1121
1122
    static inline XMMReg2Double Load2ValAligned(const double *ptr)
1123
    {
1124
        XMMReg2Double reg;
1125
        reg.nsLoad2ValAligned(ptr);
1126
        return reg;
1127
    }
1128
1129
    static inline XMMReg2Double Load2Val(const float *ptr)
1130
    {
1131
        XMMReg2Double reg;
1132
        reg.nsLoad2Val(ptr);
1133
        return reg;
1134
    }
1135
1136
    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
1137
    {
1138
        XMMReg2Double reg;
1139
        reg.nsLoad2Val(ptr);
1140
        return reg;
1141
    }
1142
1143
    static inline XMMReg2Double Load2Val(const short *ptr)
1144
    {
1145
        XMMReg2Double reg;
1146
        reg.nsLoad2Val(ptr);
1147
        return reg;
1148
    }
1149
1150
    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
1151
    {
1152
        XMMReg2Double reg;
1153
        reg.nsLoad2Val(ptr);
1154
        return reg;
1155
    }
1156
1157
    static inline XMMReg2Double Load2Val(const int *ptr)
1158
    {
1159
        XMMReg2Double reg;
1160
        reg.nsLoad2Val(ptr);
1161
        return reg;
1162
    }
1163
1164
    inline void nsLoad1ValHighAndLow(const double *ptr)
1165
    {
1166
        low = ptr[0];
1167
        high = ptr[0];
1168
    }
1169
1170
    inline void nsLoad2Val(const double *ptr)
1171
    {
1172
        low = ptr[0];
1173
        high = ptr[1];
1174
    }
1175
1176
    inline void nsLoad2ValAligned(const double *ptr)
1177
    {
1178
        low = ptr[0];
1179
        high = ptr[1];
1180
    }
1181
1182
    inline void nsLoad2Val(const float *ptr)
1183
    {
1184
        low = ptr[0];
1185
        high = ptr[1];
1186
    }
1187
1188
    inline void nsLoad2Val(const unsigned char *ptr)
1189
    {
1190
        low = ptr[0];
1191
        high = ptr[1];
1192
    }
1193
1194
    inline void nsLoad2Val(const short *ptr)
1195
    {
1196
        low = ptr[0];
1197
        high = ptr[1];
1198
    }
1199
1200
    inline void nsLoad2Val(const unsigned short *ptr)
1201
    {
1202
        low = ptr[0];
1203
        high = ptr[1];
1204
    }
1205
1206
    inline void nsLoad2Val(const int *ptr)
1207
    {
1208
        low = ptr[0];
1209
        high = ptr[1];
1210
    }
1211
1212
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
1213
                                XMMReg2Double &high)
1214
    {
1215
        low.low = ptr[0];
1216
        low.high = ptr[1];
1217
        high.low = ptr[2];
1218
        high.high = ptr[3];
1219
    }
1220
1221
    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
1222
                                XMMReg2Double &high)
1223
    {
1224
        low.nsLoad2Val(ptr);
1225
        high.nsLoad2Val(ptr + 2);
1226
    }
1227
1228
    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
1229
                                XMMReg2Double &high)
1230
    {
1231
        low.nsLoad2Val(ptr);
1232
        high.nsLoad2Val(ptr + 2);
1233
    }
1234
1235
    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
1236
                                XMMReg2Double &high)
1237
    {
1238
        low.nsLoad2Val(ptr);
1239
        high.nsLoad2Val(ptr + 2);
1240
    }
1241
1242
    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
1243
                                XMMReg2Double &high)
1244
    {
1245
        low.nsLoad2Val(ptr);
1246
        high.nsLoad2Val(ptr + 2);
1247
    }
1248
1249
    inline void Zeroize()
1250
    {
1251
        low = 0.0;
1252
        high = 0.0;
1253
    }
1254
1255
    inline XMMReg2Double &operator=(const XMMReg2Double &other)
1256
    {
1257
        low = other.low;
1258
        high = other.high;
1259
        return *this;
1260
    }
1261
1262
    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
1263
    {
1264
        low += other.low;
1265
        high += other.high;
1266
        return *this;
1267
    }
1268
1269
    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
1270
    {
1271
        low *= other.low;
1272
        high *= other.high;
1273
        return *this;
1274
    }
1275
1276
    inline XMMReg2Double operator+(const XMMReg2Double &other) const
1277
    {
1278
        XMMReg2Double ret;
1279
        ret.low = low + other.low;
1280
        ret.high = high + other.high;
1281
        return ret;
1282
    }
1283
1284
    inline XMMReg2Double operator-(const XMMReg2Double &other) const
1285
    {
1286
        XMMReg2Double ret;
1287
        ret.low = low - other.low;
1288
        ret.high = high - other.high;
1289
        return ret;
1290
    }
1291
1292
    inline XMMReg2Double operator*(const XMMReg2Double &other) const
1293
    {
1294
        XMMReg2Double ret;
1295
        ret.low = low * other.low;
1296
        ret.high = high * other.high;
1297
        return ret;
1298
    }
1299
1300
    inline XMMReg2Double operator/(const XMMReg2Double &other) const
1301
    {
1302
        XMMReg2Double ret;
1303
        ret.low = low / other.low;
1304
        ret.high = high / other.high;
1305
        return ret;
1306
    }
1307
1308
    inline double GetHorizSum() const
1309
    {
1310
        return low + high;
1311
    }
1312
1313
    inline void Store2Val(double *ptr) const
1314
    {
1315
        ptr[0] = low;
1316
        ptr[1] = high;
1317
    }
1318
1319
    inline void Store2ValAligned(double *ptr) const
1320
    {
1321
        ptr[0] = low;
1322
        ptr[1] = high;
1323
    }
1324
1325
    inline void Store2Val(float *ptr) const
1326
    {
1327
        ptr[0] = static_cast<float>(low);
1328
        ptr[1] = static_cast<float>(high);
1329
    }
1330
1331
    void Store2Val(unsigned char *ptr) const
1332
    {
1333
        ptr[0] = static_cast<unsigned char>(low + 0.5);
1334
        ptr[1] = static_cast<unsigned char>(high + 0.5);
1335
    }
1336
1337
    void Store2Val(unsigned short *ptr) const
1338
    {
1339
        ptr[0] = static_cast<GUInt16>(low + 0.5);
1340
        ptr[1] = static_cast<GUInt16>(high + 0.5);
1341
    }
1342
1343
    inline void StoreMask(unsigned char *ptr) const
1344
    {
1345
        memcpy(ptr, &low, 8);
1346
        memcpy(ptr + 8, &high, 8);
1347
    }
1348
1349
    inline operator double() const
1350
    {
1351
        return low;
1352
    }
1353
};
1354
1355
#endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && !defined(USE_SSE2_EMULATION) */
1356
1357
#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
1358
1359
#include <immintrin.h>
1360
1361
class XMMReg4Double
1362
{
1363
  public:
1364
    __m256d ymm;
1365
1366
    XMMReg4Double() : ymm(_mm256_setzero_pd())
1367
    {
1368
    }
1369
1370
    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
1371
    {
1372
    }
1373
1374
    static inline XMMReg4Double Zero()
1375
    {
1376
        XMMReg4Double reg;
1377
        reg.Zeroize();
1378
        return reg;
1379
    }
1380
1381
    static inline XMMReg4Double Set1(double d)
1382
    {
1383
        XMMReg4Double reg;
1384
        reg.ymm = _mm256_set1_pd(d);
1385
        return reg;
1386
    }
1387
1388
    inline void Zeroize()
1389
    {
1390
        ymm = _mm256_setzero_pd();
1391
    }
1392
1393
    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
1394
    {
1395
        XMMReg4Double reg;
1396
        reg.nsLoad1ValHighAndLow(ptr);
1397
        return reg;
1398
    }
1399
1400
    inline void nsLoad1ValHighAndLow(const double *ptr)
1401
    {
1402
        ymm = _mm256_set1_pd(*ptr);
1403
    }
1404
1405
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
1406
    {
1407
        XMMReg4Double reg;
1408
        reg.nsLoad4Val(ptr);
1409
        return reg;
1410
    }
1411
1412
    inline void nsLoad4Val(const unsigned char *ptr)
1413
    {
1414
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
1415
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
1416
        ymm = _mm256_cvtepi32_pd(xmm_i);
1417
    }
1418
1419
    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
1420
                                XMMReg4Double &high)
1421
    {
1422
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1423
        const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
1424
        low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
1425
        const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
1426
        high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
1427
    }
1428
1429
    static inline XMMReg4Double Load4Val(const short *ptr)
1430
    {
1431
        XMMReg4Double reg;
1432
        reg.nsLoad4Val(ptr);
1433
        return reg;
1434
    }
1435
1436
    inline void nsLoad4Val(const short *ptr)
1437
    {
1438
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1439
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
1440
        ymm = _mm256_cvtepi32_pd(xmm_i);
1441
    }
1442
1443
    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
1444
                                XMMReg4Double &high)
1445
    {
1446
        low.nsLoad4Val(ptr);
1447
        high.nsLoad4Val(ptr + 4);
1448
    }
1449
1450
    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
1451
    {
1452
        XMMReg4Double reg;
1453
        reg.nsLoad4Val(ptr);
1454
        return reg;
1455
    }
1456
1457
    inline void nsLoad4Val(const unsigned short *ptr)
1458
    {
1459
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
1460
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
1461
        ymm = _mm256_cvtepi32_pd(
1462
            xmm_i);  // ok to use signed conversion since we are in the ushort
1463
                     // range, so cannot be interpreted as negative int32
1464
    }
1465
1466
    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
1467
                                XMMReg4Double &high)
1468
    {
1469
        low.nsLoad4Val(ptr);
1470
        high.nsLoad4Val(ptr + 4);
1471
    }
1472
1473
    static inline XMMReg4Double Load4Val(const double *ptr)
1474
    {
1475
        XMMReg4Double reg;
1476
        reg.nsLoad4Val(ptr);
1477
        return reg;
1478
    }
1479
1480
    inline void nsLoad4Val(const double *ptr)
1481
    {
1482
        ymm = _mm256_loadu_pd(ptr);
1483
    }
1484
1485
    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
1486
                                XMMReg4Double &high)
1487
    {
1488
        low.nsLoad4Val(ptr);
1489
        high.nsLoad4Val(ptr + 4);
1490
    }
1491
1492
    static inline XMMReg4Double Load4ValAligned(const double *ptr)
1493
    {
1494
        XMMReg4Double reg;
1495
        reg.nsLoad4ValAligned(ptr);
1496
        return reg;
1497
    }
1498
1499
    inline void nsLoad4ValAligned(const double *ptr)
1500
    {
1501
        ymm = _mm256_load_pd(ptr);
1502
    }
1503
1504
    static inline XMMReg4Double Load4Val(const float *ptr)
1505
    {
1506
        XMMReg4Double reg;
1507
        reg.nsLoad4Val(ptr);
1508
        return reg;
1509
    }
1510
1511
    inline void nsLoad4Val(const float *ptr)
1512
    {
1513
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
1514
    }
1515
1516
    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
1517
                                XMMReg4Double &high)
1518
    {
1519
        low.nsLoad4Val(ptr);
1520
        high.nsLoad4Val(ptr + 4);
1521
    }
1522
1523
    static inline XMMReg4Double Load4Val(const int *ptr)
1524
    {
1525
        XMMReg4Double reg;
1526
        reg.nsLoad4Val(ptr);
1527
        return reg;
1528
    }
1529
1530
    inline void nsLoad4Val(const int *ptr)
1531
    {
1532
        ymm = _mm256_cvtepi32_pd(
1533
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
1534
    }
1535
1536
    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
1537
                                XMMReg4Double &high)
1538
    {
1539
        low.nsLoad4Val(ptr);
1540
        high.nsLoad4Val(ptr + 4);
1541
    }
1542
1543
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
1544
                                       const XMMReg4Double &expr2)
1545
    {
1546
        XMMReg4Double reg;
1547
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
1548
        return reg;
1549
    }
1550
1551
    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
1552
                                          const XMMReg4Double &expr2)
1553
    {
1554
        XMMReg4Double reg;
1555
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
1556
        return reg;
1557
    }
1558
1559
    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
1560
                                        const XMMReg4Double &expr2)
1561
    {
1562
        XMMReg4Double reg;
1563
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
1564
        return reg;
1565
    }
1566
1567
    static inline XMMReg4Double And(const XMMReg4Double &expr1,
1568
                                    const XMMReg4Double &expr2)
1569
    {
1570
        XMMReg4Double reg;
1571
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
1572
        return reg;
1573
    }
1574
1575
    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
1576
                                        const XMMReg4Double &true_expr,
1577
                                        const XMMReg4Double &false_expr)
1578
    {
1579
        XMMReg4Double reg;
1580
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
1581
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
1582
        return reg;
1583
    }
1584
1585
    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
1586
                                    const XMMReg4Double &expr2)
1587
    {
1588
        XMMReg4Double reg;
1589
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
1590
        return reg;
1591
    }
1592
1593
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
1594
    {
1595
        ymm = other.ymm;
1596
        return *this;
1597
    }
1598
1599
    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
1600
    {
1601
        ymm = _mm256_add_pd(ymm, other.ymm);
1602
        return *this;
1603
    }
1604
1605
    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
1606
    {
1607
        ymm = _mm256_mul_pd(ymm, other.ymm);
1608
        return *this;
1609
    }
1610
1611
    inline XMMReg4Double operator+(const XMMReg4Double &other) const
1612
    {
1613
        XMMReg4Double ret;
1614
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
1615
        return ret;
1616
    }
1617
1618
    inline XMMReg4Double operator-(const XMMReg4Double &other) const
1619
    {
1620
        XMMReg4Double ret;
1621
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
1622
        return ret;
1623
    }
1624
1625
    inline XMMReg4Double operator*(const XMMReg4Double &other) const
1626
    {
1627
        XMMReg4Double ret;
1628
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
1629
        return ret;
1630
    }
1631
1632
    inline XMMReg4Double operator/(const XMMReg4Double &other) const
1633
    {
1634
        XMMReg4Double ret;
1635
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
1636
        return ret;
1637
    }
1638
1639
    void AddToLow(const XMMReg2Double &other)
1640
    {
1641
        __m256d ymm2 = _mm256_setzero_pd();
1642
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
1643
        ymm = _mm256_add_pd(ymm, ymm2);
1644
    }
1645
1646
    inline double GetHorizSum() const
1647
    {
1648
        __m256d ymm_tmp1, ymm_tmp2;
1649
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
1650
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
1651
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
1652
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
1653
    }
1654
1655
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
1656
                                         const XMMReg4Double &half) const
1657
    {
1658
        __m256d reg = ymm;
1659
        __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
1660
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
1661
        reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
1662
        // And perform one step of Newton-Raphson approximation to improve it
1663
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
1664
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
1665
        const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
1666
        reg = _mm256_mul_pd(
1667
            reg,
1668
            _mm256_sub_pd(one_and_a_half,
1669
                          _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
1670
        XMMReg4Double ret;
1671
        ret.ymm = reg;
1672
        return ret;
1673
    }
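approx_inv_sqrt() above seeds the result with the hardware reciprocal square-root estimate and then applies the one Newton-Raphson step spelled out in the comment, y' = y * (1.5 - 0.5 * x * y * y). A scalar model of that refinement, illustrative only (the real code processes four doubles at once and receives the 1.0 and 0.5 constants as parameters):

/* Scalar model of the Newton-Raphson refinement used by approx_inv_sqrt(). */
static inline double RefineInvSqrt(double x, double y /* rough estimate of 1/sqrt(x) */)
{
    return y * (1.5 - 0.5 * x * y * y);  // one Newton-Raphson iteration
}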
1674
1675
    inline XMMReg4Float cast_to_float() const
1676
    {
1677
        XMMReg4Float ret;
1678
        ret.xmm = _mm256_cvtpd_ps(ymm);
1679
        return ret;
1680
    }
1681
1682
    inline void Store4Val(unsigned char *ptr) const
1683
    {
1684
        __m128i xmm_i =
1685
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1686
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
1687
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
1688
        xmm_i =
1689
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
1690
                                                      (12 << 24)));  //  SSSE3
1691
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
1692
    }
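In the AVX Store4Val(unsigned char*) above, the SSSE3 shuffle mask 0 | (4 << 8) | (8 << 16) | (12 << 24) gathers byte 0 of each 32-bit lane, i.e. the low byte of each rounded integer, instead of going through the pack instructions shown in the commented-out lines. A scalar model of that gather, illustrative only (note that unlike the pack-based path it performs no saturation):

/* Scalar model of the SSSE3 byte gather used by the AVX Store4Val(unsigned char*). */
static inline void GatherLowBytes(const int lanes[4], unsigned char out[4])
{
    for (int i = 0; i < 4; ++i)
        out[i] = static_cast<unsigned char>(lanes[i] & 0xFF);  // byte 0 of lane i
}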
1693
1694
    inline void Store4Val(unsigned short *ptr) const
1695
    {
1696
        __m128i xmm_i =
1697
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1698
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
1699
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
1700
    }
1701
1702
    inline void Store4Val(float *ptr) const
1703
    {
1704
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
1705
    }
1706
1707
    inline void Store4Val(double *ptr) const
1708
    {
1709
        _mm256_storeu_pd(ptr, ymm);
1710
    }
1711
1712
    inline void StoreMask(unsigned char *ptr) const
1713
    {
1714
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
1715
                            _mm256_castpd_si256(ymm));
1716
    }
1717
};
1718
1719
inline XMMReg4Double XMMReg4Float::cast_to_double() const
1720
{
1721
    XMMReg4Double ret;
1722
    ret.ymm = _mm256_cvtps_pd(xmm);
1723
    return ret;
1724
}
1725
1726
inline XMMReg4Double XMMReg4Int::cast_to_double() const
1727
{
1728
    XMMReg4Double ret;
1729
    ret.ymm = _mm256_cvtepi32_pd(xmm);
1730
    return ret;
1731
}
1732
1733
#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Set1(double d)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Set1(d);
        reg.high = XMMReg2Double::Set1(d);
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const int *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

#if !defined(USE_SSE2_EMULATION)
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
                                         const XMMReg4Double &half) const
    {
        __m128d reg0 = low.xmm;
        __m128d reg1 = high.xmm;
        __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
        __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
        reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
        // And perform one step of Newton-Raphson approximation to improve it
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
        const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
        reg0 = _mm_mul_pd(
            reg0, _mm_sub_pd(one_and_a_half,
                             _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
        reg1 = _mm_mul_pd(
            reg1, _mm_sub_pd(one_and_a_half,
                             _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
        XMMReg4Double ret;
        ret.low.xmm = reg0;
        ret.high.xmm = reg1;
        return ret;
    }
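    // [Editor's note - illustrative derivation, not part of the original
    // header] The refinement above is one Newton-Raphson step for
    // f(y) = 1/y^2 - x, whose update is y1 = y0 * (1.5 - 0.5 * x * y0 * y0).
    // If the _mm_rsqrt_ps estimate is y0 = (1 + e) / sqrt(x), then
    //     y1 = (1 - 1.5*e^2 - 0.5*e^3) / sqrt(x),
    // so a relative error of |e| <= ~3.7e-4 (the documented rsqrt bound)
    // shrinks to roughly 1.5 * e^2 ~= 2e-7, close to single precision,
    // which is why a single step is used here.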
2014
2015
    inline XMMReg4Float cast_to_float() const
2016
0
    {
2017
0
        XMMReg4Float ret;
2018
0
        ret.xmm = _mm_castsi128_ps(
2019
0
            _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
2020
0
                               _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
2021
0
        return ret;
2022
0
    }
2023
#endif
2024
2025
    inline void Store4Val(unsigned char *ptr) const
2026
0
    {
2027
#ifdef USE_SSE2_EMULATION
2028
        low.Store2Val(ptr);
2029
        high.Store2Val(ptr + 2);
2030
#else
2031
0
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
2032
0
            low.xmm,
2033
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2034
0
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
2035
0
            high.xmm,
2036
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2037
0
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
2038
0
                                                   _mm_castsi128_ps(tmpHigh),
2039
0
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
2040
0
        tmp = _mm_packs_epi32(tmp, tmp);
2041
0
        tmp = _mm_packus_epi16(tmp, tmp);
2042
0
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
2043
0
#endif
2044
0
    }
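    // [Editor's note - not part of the original header] In the non-emulated
    // path above, _mm_packs_epi32 narrows int32 to int16 with signed
    // saturation and _mm_packus_epi16 then narrows int16 to uint8 with
    // unsigned saturation, so out-of-range results are clamped to [0, 255]
    // rather than wrapping; the SSSE3-based AVX variant earlier in this file
    // instead assumes the values already fit in a byte and simply picks
    // byte 0 of each 32-bit lane.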

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }
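    // [Editor's note - not part of the original header] The disabled SSE2
    // fallback above emulates _mm_packus_epi32 by biasing: adding -32768
    // maps an unsigned value in [0, 65535] into the signed int16 range, so
    // the signed pack cannot saturate, and subtracting -32768 as int16
    // restores the original bit pattern. For example 40000 becomes 7232
    // after the bias, and 7232 - (-32768) wraps back to 40000 when the
    // result is reinterpreted as uint16.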

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#if !defined(USE_SSE2_EMULATION)
inline XMMReg4Double XMMReg4Float::cast_to_double() const
{
    XMMReg4Double ret;
    ret.low.xmm = _mm_cvtps_pd(xmm);
    ret.high.xmm = _mm_cvtps_pd(
        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
    return ret;
}

inline XMMReg4Double XMMReg4Int::cast_to_double() const
{
    XMMReg4Double ret;
    ret.low.xmm = _mm_cvtepi32_pd(xmm);
    ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
    return ret;
}
#endif

#endif

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */
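
// [Editor's note - illustrative usage sketch, not part of the original
// header] Both the AVX definition and the split low/high SSE2 definition of
// XMMReg4Double expose the same interface, so callers can be written once
// against it. A minimal sketch, assuming n is a multiple of 4 and the scaled
// values fit in a byte (the function name is hypothetical):
static inline void GDALScaleToByteSketch(const double *padfIn, double dfScale,
                                         unsigned char *pabyOut, int n)
{
    const XMMReg4Double scale = XMMReg4Double::Set1(dfScale);
    for (int i = 0; i < n; i += 4)
    {
        XMMReg4Double v = XMMReg4Double::Load4Val(padfIn + i);
        v *= scale;
        v.Store4Val(pabyOut + i);
    }
}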