Coverage Report

Created: 2025-12-31 06:48

File: /src/gdal/gcore/gdalsse_priv.h

Every executable line of this file has an execution count of 0 in this report, and for each of the GDALCopyInt16ToXMM / GDALCopyInt32ToXMM / GDALCopyInt64ToXMM / GDALCopyXMMToInt16 helpers the report records unexecuted instantiations in overview.cpp, gdalwarpkernel.cpp, pixelfunctions.cpp, gdal_rpc.cpp and gdalpansharpen.cpp. The source follows.
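For context, these wrapper classes are consumed by GDAL's resampling, warping and pan-sharpening kernels (the translation units named above). A minimal usage sketch, assuming the header is on the include path; the function and array names here are illustrative, not part of GDAL:

#include "gdalsse_priv.h"

// Illustrative: add two double arrays, two lanes per iteration.
static void AddPairs(const double *a, const double *b, double *out, int n)
{
    int i = 0;
    for (; i + 1 < n; i += 2)
    {
        const XMMReg2Double va = XMMReg2Double::Load2Val(a + i);
        const XMMReg2Double vb = XMMReg2Double::Load2Val(b + i);
        (va + vb).Store2Val(out + i);
    }
    for (; i < n; ++i)  // scalar tail for odd n
        out[i] = a[i] + b[i];
}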
/******************************************************************************
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
/* Could possibly be used too on 32bit, but we would need to check at runtime */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) &&             \
    !defined(USE_SSE2_EMULATION)

#include <string.h>

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#else
/* Requires SSE2 */
#include <emmintrin.h>

#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
#include <smmintrin.h>
#endif
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

#ifndef GDALCopyXMMToInt16_defined
#define GDALCopyXMMToInt16_defined

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}
#endif
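These helpers deliberately load and store through memcpy instead of casting ptr and dereferencing: a fixed-size memcpy compiles down to a single register move on the targets this header supports, while staying free of strict-aliasing and alignment undefined behaviour. The same idiom in scalar form (ReadU32 is an illustrative name, not from GDAL):

#include <cstring>
#include <cstdint>

// Read a possibly unaligned, possibly differently-typed 32-bit value.
static inline uint32_t ReadU32(const void *p)
{
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));  // folded into one load by optimizers
    return v;
}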

class XMMReg4Int;

class XMMReg4Double;

class XMMReg4Float
{
  public:
    __m128 xmm;

    XMMReg4Float()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_ps())
#endif
    {
    }

    XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Float Zero()
    {
        XMMReg4Float reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Float Set1(float f)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_set1_ps(f);
        return reg;
    }

    static inline XMMReg4Float LoadAllVal(const float *ptr)
    {
        return Load4Val(ptr);
    }

    static inline XMMReg4Float Load4Val(const float *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned char *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const int *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
                                         const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
                                       const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float And(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
                                       const XMMReg4Float &true_expr,
                                       const XMMReg4Float &false_expr)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
                            _mm_andnot_ps(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg4Float Min(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Max(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        xmm = _mm_loadu_ps(ptr);
    }

    static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    inline void nsLoad4Val(const int *ptr)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        xmm = _mm_cvtepi32_ps(xmm_i);
    }

    static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    static inline __m128i cvtepu8_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu8_epi32(x);
#else
        return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
                                  _mm_setzero_si128());
#endif
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
    }

    static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
        r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
        r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
    }

    static inline __m128i cvtepi16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepi16_epi32(x);
#else
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        return _mm_srai_epi32(
            /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
            _mm_unpacklo_epi16(x, x), 16);
#endif
    }

    inline void nsLoad4Val(const short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
    }

    static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    static inline __m128i cvtepu16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu16_epi32(x);
#else
        return _mm_unpacklo_epi16(
            x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_ps();
    }

    inline XMMReg4Float &operator=(const XMMReg4Float &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg4Float &operator+=(const XMMReg4Float &other)
    {
        xmm = _mm_add_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator-=(const XMMReg4Float &other)
    {
        xmm = _mm_sub_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator*=(const XMMReg4Float &other)
    {
        xmm = _mm_mul_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float operator+(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_add_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator-(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_sub_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator*(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_mul_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator/(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float inverse() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
        return ret;
    }

    inline XMMReg4Int truncate_to_int() const;

    inline XMMReg4Float cast_to_float() const
    {
        return *this;
    }

    inline XMMReg4Double cast_to_double() const;

    inline XMMReg4Float approx_inv_sqrt(const XMMReg4Float &one,
                                        const XMMReg4Float &half) const
    {
        __m128 reg = xmm;
        __m128 reg_half = _mm_mul_ps(reg, half.xmm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg = _mm_rsqrt_ps(reg);
        // And perform one step of Newton-Raphson approximation to improve it
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
        const __m128 one_and_a_half = _mm_add_ps(one.xmm, half.xmm);
        reg = _mm_mul_ps(
            reg, _mm_sub_ps(one_and_a_half,
                            _mm_mul_ps(reg_half, _mm_mul_ps(reg, reg))));
        XMMReg4Float ret;
        ret.xmm = reg;
        return ret;
    }

    inline void StoreAllVal(float *ptr) const
    {
        Store4Val(ptr);
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, xmm);
    }

    inline void Store4ValAligned(float *ptr) const
    {
        _mm_store_ps(ptr, xmm);
    }

    inline operator float() const
    {
        return _mm_cvtss_f32(xmm);
    }
};
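A note on approx_inv_sqrt() above: _mm_rsqrt_ps alone only provides roughly 12 bits of precision, so the method refines the estimate with one Newton-Raphson step on f(y) = 1/y^2 - x, which gives y1 = y0 * (1.5 - 0.5 * x * y0 * y0), exactly the intrinsic sequence shown. A scalar model of that refinement (illustrative, not part of the header):

// One Newton-Raphson refinement of an initial estimate y0 of 1/sqrt(x).
static inline float RefineInvSqrt(float x, float y0)
{
    return y0 * (1.5f - 0.5f * x * y0 * y0);
}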

class XMMReg4Int
{
  public:
    __m128i xmm;

    XMMReg4Int()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
    {
    }

    inline XMMReg4Int &operator=(const XMMReg4Int &other)
    {
        xmm = other.xmm;
        return *this;
    }

    static inline XMMReg4Int Zero()
    {
        XMMReg4Int reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg4Int Set1(int i)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_set1_epi32(i);
        return reg;
    }

    static inline XMMReg4Int LoadAllVal(const int *ptr)
    {
        return Load4Val(ptr);
    }

    static inline XMMReg4Int Load4Val(const int *ptr)
    {
        XMMReg4Int reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
    }

    static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
                                    const XMMReg4Int &expr2)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
                                     const XMMReg4Int &true_expr,
                                     const XMMReg4Int &false_expr)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg4Int &operator+=(const XMMReg4Int &other)
    {
        xmm = _mm_add_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int &operator-=(const XMMReg4Int &other)
    {
        xmm = _mm_sub_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int operator+(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_add_epi32(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Int operator-(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_sub_epi32(xmm, other.xmm);
        return ret;
    }

    XMMReg4Double cast_to_double() const;

    XMMReg4Float cast_to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_cvtepi32_ps(xmm);
        return ret;
    }
};

inline XMMReg4Int XMMReg4Float::truncate_to_int() const
{
    XMMReg4Int ret;
    ret.xmm = _mm_cvttps_epi32(xmm);
    return ret;
}
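The Ternary() helpers in XMMReg4Float and XMMReg4Int implement a branchless select: cond is expected to hold all-ones or all-zeros lanes (as produced by Equals and the other comparisons), so (cond & true_expr) | (~cond & false_expr) picks one input per lane without branching. The same trick on a scalar integer (illustrative):

#include <cstdint>

// Branchless select; mask must be 0 or ~0u per the comparison convention.
static inline uint32_t Select(uint32_t mask, uint32_t a, uint32_t b)
{
    return (mask & a) | (~mask & b);
}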

class XMMReg8Byte
{
  public:
    __m128i xmm;

    XMMReg8Byte()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg8Byte Zero()
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg8Byte Set1(char i)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_set1_epi8(i);
        return reg;
    }

    static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
                                     const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
                                 const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
                                      const XMMReg8Byte &true_expr,
                                      const XMMReg8Byte &false_expr)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_add_epi8(xmm, other.xmm);
        return ret;
    }

    inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_sub_epi8(xmm, other.xmm);
        return ret;
    }

    static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
        reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
        return reg;
    }

    inline void Store8Val(unsigned char *ptr) const
    {
        GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
    }
};
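XMMReg8Byte::Pack narrows int32 lanes to bytes with two saturating steps: _mm_packs_epi32 clamps each int32 into the signed 16-bit range, then _mm_packus_epi16 clamps each int16 into 0..255, so out-of-range values are clamped rather than wrapped. A scalar model of that path for one lane (assumes C++17 for std::clamp; illustrative):

#include <algorithm>
#include <cstdint>

// Scalar equivalent of Pack()'s two-step saturation for one lane.
static inline uint8_t PackInt32ToByte(int32_t v)
{
    const int32_t as_i16 = std::clamp(v, -32768, 32767);      // _mm_packs_epi32
    return static_cast<uint8_t>(std::clamp(as_i16, 0, 255));  // _mm_packus_epi16
}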

class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_pd())
#endif
    {
    }

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_set1_pd(d);
        return reg;
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const int *ptr)
    {
        xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i,
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm,
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
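The integer Store2Val overloads above round by adding 0.5 and truncating with _mm_cvttpd_epi32, i.e. round-half-up, which is sound here because the destination types are unsigned and the inputs are expected to be non-negative. Scalar model (illustrative):

// Scalar model of Store2Val's rounding for a non-negative double.
static inline unsigned short RoundHalfUpToUInt16(double d)
{
    return static_cast<unsigned short>(d + 0.5);  // truncates after +0.5
}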

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    // cppcheck-suppress uninitMemberVar
    XMMReg2Double() = default;

    explicit XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.low = d;
        reg.high = d;
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const int *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = static_cast<unsigned char>(low + 0.5);
        ptr[1] = static_cast<unsigned char>(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = static_cast<GUInt16>(low + 0.5);
        ptr[1] = static_cast<GUInt16>(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /*  defined(__x86_64) || defined(_M_X64) */
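In the emulated branch, comparison results are materialized by memset-ing each double to 0xFF bytes, so StoreMask and And observe the same all-ones lane masks the real SSE2 comparisons produce. An all-ones 64-bit pattern reads back as a NaN double, and any NaN compares unequal to 0, which is why Ternary can test cond.low != 0 to distinguish the two mask values. A sketch of that mask constant (illustrative):

#include <cstring>

// All-ones mask read back as double: a NaN, so (mask != 0) is true,
// while the all-zeros mask 0.0 gives false.
static inline double AllOnesMaskAsDouble()
{
    double m;
    std::memset(&m, 0xFF, sizeof(m));
    return m;
}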

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Set1(double d)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_set1_pd(d);
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
        low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
        const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
        high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(
            xmm_i);  // ok to use signed conversion since we are in the ushort
                     // range, so cannot be interpreted as negative int32
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const int *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        ymm = _mm256_cvtepi32_pd(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
    }

    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
                                         const XMMReg4Double &half) const
    {
        __m256d reg = ymm;
        __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
        // And perform one step of Newton-Raphson approximation to improve it
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
        const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
        reg = _mm256_mul_pd(
            reg,
            _mm256_sub_pd(one_and_a_half,
                          _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
        XMMReg4Double ret;
        ret.ymm = reg;
        return ret;
    }

    inline XMMReg4Float cast_to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm256_cvtpd_ps(ymm);
        return ret;
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));  //  SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};
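GetHorizSum() in the AVX class reduces four lanes in two steps: _mm256_hadd_pd forms pairwise sums within each 128-bit half, then _mm256_permute2f128_pd swaps the halves so one final add combines the two partial sums. The value it returns is simply:

// Scalar model of XMMReg4Double::GetHorizSum() over lanes d[0..3].
static inline double HorizSum4(const double d[4])
{
    return (d[0] + d[1]) + (d[2] + d[3]);
}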

inline XMMReg4Double XMMReg4Float::cast_to_double() const
{
    XMMReg4Double ret;
    ret.ymm = _mm256_cvtps_pd(xmm);
    return ret;
}

inline XMMReg4Double XMMReg4Int::cast_to_double() const
{
    XMMReg4Double ret;
    ret.ymm = _mm256_cvtepi32_pd(xmm);
    return ret;
}

class XMMReg8Float
{
  public:
    __m256 ymm;

    XMMReg8Float()
#if !defined(_MSC_VER)
        : ymm(_mm256_undefined_ps())
#endif
    {
    }

    XMMReg8Float(const XMMReg8Float &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg8Float Set1(float f)
    {
        XMMReg8Float reg;
        reg.ymm = _mm256_set1_ps(f);
        return reg;
    }

    static inline XMMReg8Float LoadAllVal(const float *ptr)
    {
        return Load8Val(ptr);
    }

    static inline XMMReg8Float Load8Val(const float *ptr)
    {
        XMMReg8Float reg;
        reg.nsLoad8Val(ptr);
        return reg;
    }

    static inline XMMReg8Float Load8Val(const int *ptr)
    {
        XMMReg8Float reg;
        reg.nsLoad8Val(ptr);
        return reg;
    }

    static inline XMMReg8Float Max(const XMMReg8Float &expr1,
                                   const XMMReg8Float &expr2)
    {
        XMMReg8Float reg;
        reg.ymm = _mm256_max_ps(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline void nsLoad8Val(const float *ptr)
    {
        ymm = _mm256_loadu_ps(ptr);
    }

    inline void nsLoad8Val(const int *ptr)
    {
        const __m256i ymm_i =
            _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
        ymm = _mm256_cvtepi32_ps(ymm_i);
    }

    inline XMMReg8Float &operator=(const XMMReg8Float &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg8Float &operator+=(const XMMReg8Float &other)
    {
        ymm = _mm256_add_ps(ymm, other.ymm);
        return *this;
    }

    inline XMMReg8Float &operator-=(const XMMReg8Float &other)
    {
        ymm = _mm256_sub_ps(ymm, other.ymm);
        return *this;
    }

    inline XMMReg8Float &operator*=(const XMMReg8Float &other)
    {
        ymm = _mm256_mul_ps(ymm, other.ymm);
        return *this;
    }

    inline XMMReg8Float operator+(const XMMReg8Float &other) const
    {
        XMMReg8Float ret;
        ret.ymm = _mm256_add_ps(ymm, other.ymm);
        return ret;
    }

    inline XMMReg8Float operator-(const XMMReg8Float &other) const
    {
        XMMReg8Float ret;
        ret.ymm = _mm256_sub_ps(ymm, other.ymm);
        return ret;
    }

    inline XMMReg8Float operator*(const XMMReg8Float &other) const
    {
        XMMReg8Float ret;
        ret.ymm = _mm256_mul_ps(ymm, other.ymm);
        return ret;
    }

    inline XMMReg8Float operator/(const XMMReg8Float &other) const
    {
        XMMReg8Float ret;
        ret.ymm = _mm256_div_ps(ymm, other.ymm);
        return ret;
    }

    inline XMMReg8Float inverse() const
    {
        XMMReg8Float ret;
        ret.ymm = _mm256_div_ps(_mm256_set1_ps(1.0f), ymm);
        return ret;
    }
1902
1903
    inline XMMReg8Float approx_inv_sqrt(const XMMReg8Float &one,
1904
                                        const XMMReg8Float &half) const
1905
    {
1906
        __m256 reg = ymm;
1907
        __m256 reg_half = _mm256_mul_ps(reg, half.ymm);
1908
        // Compute rough approximation of 1 / sqrt(x) with _mm256_rsqrt_ps
1909
        reg = _mm256_rsqrt_ps(reg);
1910
        // And perform one step of Newton-Raphson approximation to improve it
1911
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
1912
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
1913
        const __m256 one_and_a_half = _mm256_add_ps(one.ymm, half.ymm);
1914
        reg = _mm256_mul_ps(
1915
            reg,
1916
            _mm256_sub_ps(one_and_a_half,
1917
                          _mm256_mul_ps(reg_half, _mm256_mul_ps(reg, reg))));
1918
        XMMReg8Float ret;
1919
        ret.ymm = reg;
1920
        return ret;
1921
    }
1922
1923
    inline void StoreAllVal(float *ptr) const
1924
    {
1925
        Store8Val(ptr);
1926
    }
1927
1928
    inline void Store8Val(float *ptr) const
1929
    {
1930
        _mm256_storeu_ps(ptr, ymm);
1931
    }
1932
1933
    XMMReg8Float cast_to_float() const
1934
    {
1935
        return *this;
1936
    }
1937
};
1938
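// Hypothetical usage sketch (not part of this header): scale eight floats at
// once with the wrapper methods defined above.
static inline void GDALScale8ValsSketch(const float *in, float factor,
                                        float *out)
{
    const XMMReg8Float v = XMMReg8Float::Load8Val(in);
    const XMMReg8Float f = XMMReg8Float::Set1(factor);
    (v * f).Store8Val(out);
}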
1939
#if defined(__AVX2__)
1940
1941
class XMMReg8Int
1942
{
1943
  public:
1944
    __m256i ymm;
1945
1946
    XMMReg8Int()
1947
#if !defined(_MSC_VER)
1948
        : ymm(_mm256_undefined_si256())
1949
#endif
1950
    {
1951
    }
1952
1953
    XMMReg8Int(const XMMReg8Int &other) : ymm(other.ymm)
1954
    {
1955
    }
1956
1957
    inline XMMReg8Int &operator=(const XMMReg8Int &other)
1958
    {
1959
        ymm = other.ymm;
1960
        return *this;
1961
    }
1962
1963
    static inline XMMReg8Int Zero()
1964
    {
1965
        XMMReg8Int reg;
1966
        reg.ymm = _mm256_setzero_si256();
1967
        return reg;
1968
    }
1969
1970
    static inline XMMReg8Int Set1(int i)
1971
    {
1972
        XMMReg8Int reg;
1973
        reg.ymm = _mm256_set1_epi32(i);
1974
        return reg;
1975
    }
1976
1977
    static inline XMMReg8Int LoadAllVal(const int *ptr)
1978
    {
1979
        return Load8Val(ptr);
1980
    }
1981
1982
    static inline XMMReg8Int Load8Val(const int *ptr)
1983
    {
1984
        XMMReg8Int reg;
1985
        reg.nsLoad8Val(ptr);
1986
        return reg;
1987
    }
1988
1989
    inline void nsLoad8Val(const int *ptr)
1990
    {
1991
        ymm = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
1992
    }
1993
1994
    static inline XMMReg8Int Equals(const XMMReg8Int &expr1,
1995
                                    const XMMReg8Int &expr2)
1996
    {
1997
        XMMReg8Int reg;
1998
        reg.ymm = _mm256_cmpeq_epi32(expr1.ymm, expr2.ymm);
1999
        return reg;
2000
    }
2001
2002
    static inline XMMReg8Int Ternary(const XMMReg8Int &cond,
2003
                                     const XMMReg8Int &true_expr,
2004
                                     const XMMReg8Int &false_expr)
2005
    {
2006
        XMMReg8Int reg;
2007
        reg.ymm =
2008
            _mm256_or_si256(_mm256_and_si256(cond.ymm, true_expr.ymm),
2009
                            _mm256_andnot_si256(cond.ymm, false_expr.ymm));
2010
        return reg;
2011
    }
2012
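    // A scalar model of the branchless select above (editorial sketch, not
    // part of the GDAL API): each lane of `cond` is all-ones (true) or
    // all-zeros (false), so (cond & t) | (~cond & f) picks t or f per lane
    // without branching.
    static inline int TernaryScalar(int cond_mask, int t, int f)
    {
        return (cond_mask & t) | (~cond_mask & f);
    }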
2013
    inline XMMReg8Int &operator+=(const XMMReg8Int &other)
2014
    {
2015
        ymm = _mm256_add_epi32(ymm, other.ymm);
2016
        return *this;
2017
    }
2018
2019
    inline XMMReg8Int &operator-=(const XMMReg8Int &other)
2020
    {
2021
        ymm = _mm256_sub_epi32(ymm, other.ymm);
2022
        return *this;
2023
    }
2024
2025
    inline XMMReg8Int operator+(const XMMReg8Int &other) const
2026
    {
2027
        XMMReg8Int ret;
2028
        ret.ymm = _mm256_add_epi32(ymm, other.ymm);
2029
        return ret;
2030
    }
2031
2032
    inline XMMReg8Int operator-(const XMMReg8Int &other) const
2033
    {
2034
        XMMReg8Int ret;
2035
        ret.ymm = _mm256_sub_epi32(ymm, other.ymm);
2036
        return ret;
2037
    }
2038
2039
    XMMReg8Float cast_to_float() const
2040
    {
2041
        XMMReg8Float ret;
2042
        ret.ymm = _mm256_cvtepi32_ps(ymm);
2043
        return ret;
2044
    }
2045
};
2046
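// Hypothetical usage sketch (not part of this header): branchless per-lane
// replacement of a sentinel value, combining Equals() and Ternary() from the
// AVX2 wrapper above.
static inline XMMReg8Int GDALReplaceValSketch(const XMMReg8Int &v, int noData,
                                              int replacement)
{
    const XMMReg8Int mask = XMMReg8Int::Equals(v, XMMReg8Int::Set1(noData));
    return XMMReg8Int::Ternary(mask, XMMReg8Int::Set1(replacement), v);
}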
2047
#endif
2048
2049
#else
2050
2051
class XMMReg4Double
2052
{
2053
  public:
2054
    XMMReg2Double low, high;
2055
2056
0
    XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
2057
0
    {
2058
0
    }
2059
2060
0
    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
2061
0
    {
2062
0
    }
2063
2064
    static inline XMMReg4Double Zero()
2065
0
    {
2066
0
        XMMReg4Double reg;
2067
0
        reg.low.Zeroize();
2068
0
        reg.high.Zeroize();
2069
0
        return reg;
2070
0
    }
2071
2072
    static inline XMMReg4Double Set1(double d)
2073
0
    {
2074
0
        XMMReg4Double reg;
2075
0
        reg.low = XMMReg2Double::Set1(d);
2076
0
        reg.high = XMMReg2Double::Set1(d);
2077
0
        return reg;
2078
0
    }
2079
2080
    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
2081
0
    {
2082
0
        XMMReg4Double reg;
2083
0
        reg.low.nsLoad1ValHighAndLow(ptr);
2084
0
        reg.high = reg.low;
2085
0
        return reg;
2086
0
    }
2087
2088
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
2089
0
    {
2090
0
        XMMReg4Double reg;
2091
0
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
2092
0
        return reg;
2093
0
    }
2094
2095
    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
2096
                                XMMReg4Double &high)
2097
0
    {
2098
0
        low = Load4Val(ptr);
2099
0
        high = Load4Val(ptr + 4);
2100
0
    }
2101
2102
    static inline XMMReg4Double Load4Val(const short *ptr)
2103
0
    {
2104
0
        XMMReg4Double reg;
2105
0
        reg.low.nsLoad2Val(ptr);
2106
0
        reg.high.nsLoad2Val(ptr + 2);
2107
0
        return reg;
2108
0
    }
2109
2110
    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
2111
                                XMMReg4Double &high)
2112
0
    {
2113
0
        low = Load4Val(ptr);
2114
0
        high = Load4Val(ptr + 4);
2115
0
    }
2116
2117
    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
2118
0
    {
2119
0
        XMMReg4Double reg;
2120
0
        reg.low.nsLoad2Val(ptr);
2121
0
        reg.high.nsLoad2Val(ptr + 2);
2122
0
        return reg;
2123
0
    }
2124
2125
    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
2126
                                XMMReg4Double &high)
2127
0
    {
2128
0
        low = Load4Val(ptr);
2129
0
        high = Load4Val(ptr + 4);
2130
0
    }
2131
2132
    static inline XMMReg4Double Load4Val(const int *ptr)
2133
0
    {
2134
0
        XMMReg4Double reg;
2135
0
        reg.low.nsLoad2Val(ptr);
2136
0
        reg.high.nsLoad2Val(ptr + 2);
2137
0
        return reg;
2138
0
    }
2139
2140
    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
2141
                                XMMReg4Double &high)
2142
0
    {
2143
0
        low = Load4Val(ptr);
2144
0
        high = Load4Val(ptr + 4);
2145
0
    }
2146
2147
    static inline XMMReg4Double Load4Val(const double *ptr)
2148
0
    {
2149
0
        XMMReg4Double reg;
2150
0
        reg.low.nsLoad2Val(ptr);
2151
0
        reg.high.nsLoad2Val(ptr + 2);
2152
0
        return reg;
2153
0
    }
2154
2155
    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
2156
                                XMMReg4Double &high)
2157
0
    {
2158
0
        low = Load4Val(ptr);
2159
0
        high = Load4Val(ptr + 4);
2160
0
    }
2161
2162
    static inline XMMReg4Double Load4ValAligned(const double *ptr)
2163
0
    {
2164
0
        XMMReg4Double reg;
2165
0
        reg.low.nsLoad2ValAligned(ptr);
2166
0
        reg.high.nsLoad2ValAligned(ptr + 2);
2167
0
        return reg;
2168
0
    }
2169
2170
    static inline XMMReg4Double Load4Val(const float *ptr)
2171
0
    {
2172
0
        XMMReg4Double reg;
2173
0
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
2174
0
        return reg;
2175
0
    }
2176
2177
    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
2178
                                XMMReg4Double &high)
2179
0
    {
2180
0
        low = Load4Val(ptr);
2181
0
        high = Load4Val(ptr + 4);
2182
0
    }
2183
2184
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
2185
                                       const XMMReg4Double &expr2)
2186
0
    {
2187
0
        XMMReg4Double reg;
2188
0
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
2189
0
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
2190
0
        return reg;
2191
0
    }
2192
2193
    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
2194
                                          const XMMReg4Double &expr2)
2195
0
    {
2196
0
        XMMReg4Double reg;
2197
0
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
2198
0
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
2199
0
        return reg;
2200
0
    }
2201
2202
    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
2203
                                        const XMMReg4Double &expr2)
2204
0
    {
2205
0
        XMMReg4Double reg;
2206
0
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
2207
0
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
2208
0
        return reg;
2209
0
    }
2210
2211
    static inline XMMReg4Double And(const XMMReg4Double &expr1,
2212
                                    const XMMReg4Double &expr2)
2213
0
    {
2214
0
        XMMReg4Double reg;
2215
0
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
2216
0
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
2217
0
        return reg;
2218
0
    }
2219
2220
    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
2221
                                        const XMMReg4Double &true_expr,
2222
                                        const XMMReg4Double &false_expr)
2223
0
    {
2224
0
        XMMReg4Double reg;
2225
0
        reg.low =
2226
0
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
2227
0
        reg.high =
2228
0
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
2229
0
        return reg;
2230
0
    }
2231
2232
    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
2233
                                    const XMMReg4Double &expr2)
2234
0
    {
2235
0
        XMMReg4Double reg;
2236
0
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
2237
0
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
2238
0
        return reg;
2239
0
    }
2240
2241
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
2242
0
    {
2243
0
        low = other.low;
2244
0
        high = other.high;
2245
0
        return *this;
2246
0
    }
2247
2248
    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
2249
0
    {
2250
0
        low += other.low;
2251
0
        high += other.high;
2252
0
        return *this;
2253
0
    }
2254
2255
    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
2256
0
    {
2257
0
        low *= other.low;
2258
0
        high *= other.high;
2259
0
        return *this;
2260
0
    }
2261
2262
    inline XMMReg4Double operator+(const XMMReg4Double &other) const
2263
0
    {
2264
0
        XMMReg4Double ret;
2265
0
        ret.low = low + other.low;
2266
0
        ret.high = high + other.high;
2267
0
        return ret;
2268
0
    }
2269
2270
    inline XMMReg4Double operator-(const XMMReg4Double &other) const
2271
0
    {
2272
0
        XMMReg4Double ret;
2273
0
        ret.low = low - other.low;
2274
0
        ret.high = high - other.high;
2275
0
        return ret;
2276
0
    }
2277
2278
    inline XMMReg4Double operator*(const XMMReg4Double &other) const
2279
0
    {
2280
0
        XMMReg4Double ret;
2281
0
        ret.low = low * other.low;
2282
0
        ret.high = high * other.high;
2283
0
        return ret;
2284
0
    }
2285
2286
    inline XMMReg4Double operator/(const XMMReg4Double &other) const
2287
0
    {
2288
0
        XMMReg4Double ret;
2289
0
        ret.low = low / other.low;
2290
0
        ret.high = high / other.high;
2291
0
        return ret;
2292
0
    }
2293
2294
    void AddToLow(const XMMReg2Double &other)
2295
0
    {
2296
0
        low += other;
2297
0
    }
2298
2299
    inline double GetHorizSum() const
2300
0
    {
2301
0
        return (low + high).GetHorizSum();
2302
0
    }
2303
2304
#if !defined(USE_SSE2_EMULATION)
2305
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
2306
                                         const XMMReg4Double &half) const
2307
0
    {
2308
0
        __m128d reg0 = low.xmm;
2309
0
        __m128d reg1 = high.xmm;
2310
0
        __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
2311
0
        __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
2312
0
        // Compute rough approximation of 1 / sqrt(x) with _mm_rsqrt_ps
2313
0
        reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
2314
0
        reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
2315
0
        // And perform one step of Newton-Raphson approximation to improve it
2316
0
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
2317
0
        //                            0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
2318
0
        const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
2319
0
        reg0 = _mm_mul_pd(
2320
0
            reg0, _mm_sub_pd(one_and_a_half,
2321
0
                             _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
2322
0
        reg1 = _mm_mul_pd(
2323
0
            reg1, _mm_sub_pd(one_and_a_half,
2324
0
                             _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
2325
0
        XMMReg4Double ret;
2326
0
        ret.low.xmm = reg0;
2327
0
        ret.high.xmm = reg1;
2328
0
        return ret;
2329
0
    }
2330
2331
    inline XMMReg4Float cast_to_float() const
2332
0
    {
2333
0
        XMMReg4Float ret;
2334
0
        ret.xmm = _mm_castsi128_ps(
2335
0
            _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
2336
0
                               _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
2337
0
        return ret;
2338
0
    }
2339
#endif
2340
2341
    inline void Store4Val(unsigned char *ptr) const
2342
0
    {
2343
#ifdef USE_SSE2_EMULATION
2344
        low.Store2Val(ptr);
2345
        high.Store2Val(ptr + 2);
2346
#else
2347
0
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
2348
0
            low.xmm,
2349
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2350
0
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
2351
0
            high.xmm,
2352
0
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
2353
0
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
2354
0
                                                   _mm_castsi128_ps(tmpHigh),
2355
0
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
2356
0
        tmp = _mm_packs_epi32(tmp, tmp);
2357
0
        tmp = _mm_packus_epi16(tmp, tmp);
2358
0
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
2359
0
#endif
2360
0
    }
2361
2362
    inline void Store4Val(unsigned short *ptr) const
2363
0
    {
2364
0
#if 1
2365
0
        low.Store2Val(ptr);
2366
0
        high.Store2Val(ptr + 2);
2367
#else
2368
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
2369
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
2370
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
2371
#if __SSE4_1__
2372
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack int32 to uint16 with unsigned saturation
2373
#else
2374
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
2375
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
2376
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
2377
#endif
2378
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
2379
#endif
2380
0
    }
2381
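    // Notes on the disabled branch above (editorial sketch, not part of the
    // GDAL API): plain SSE2 lacks _mm_packus_epi32, so it biases each value
    // by -32768 into the signed 16-bit range, packs with signed saturation,
    // then removes the bias. A scalar model of that trick:
    static inline unsigned short PackU16Sse2Scalar(int v)
    {
        int biased = v - 32768;  // map [0, 65535] onto [-32768, 32767]
        if (biased > 32767)
            biased = 32767;  // signed saturation, as in _mm_packs_epi32
        if (biased < -32768)
            biased = -32768;
        return static_cast<unsigned short>(biased + 32768);  // remove bias
    }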
2382
    inline void Store4Val(float *ptr) const
2383
0
    {
2384
0
        low.Store2Val(ptr);
2385
0
        high.Store2Val(ptr + 2);
2386
0
    }
2387
2388
    inline void Store4Val(double *ptr) const
2389
0
    {
2390
0
        low.Store2Val(ptr);
2391
0
        high.Store2Val(ptr + 2);
2392
0
    }
2393
2394
    inline void StoreMask(unsigned char *ptr) const
2395
0
    {
2396
0
        low.StoreMask(ptr);
2397
0
        high.StoreMask(ptr + 16);
2398
0
    }
2399
};
2400
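// Hypothetical usage sketch (not part of this header): both code paths above
// expose the same XMMReg4Double API, so a 4-wide dot-product step can be
// written portably against it.
static inline double GDALDot4Sketch(const double *a, const double *b)
{
    const XMMReg4Double va = XMMReg4Double::Load4Val(a);
    const XMMReg4Double vb = XMMReg4Double::Load4Val(b);
    return (va * vb).GetHorizSum();
}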
2401
#if !defined(USE_SSE2_EMULATION)
2402
inline XMMReg4Double XMMReg4Float::cast_to_double() const
2403
0
{
2404
0
    XMMReg4Double ret;
2405
0
    ret.low.xmm = _mm_cvtps_pd(xmm);
2406
0
    ret.high.xmm = _mm_cvtps_pd(
2407
0
        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
2408
0
    return ret;
2409
0
}
2410
2411
inline XMMReg4Double XMMReg4Int::cast_to_double() const
2412
0
{
2413
0
    XMMReg4Double ret;
2414
0
    ret.low.xmm = _mm_cvtepi32_pd(xmm);
2415
0
    ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
2416
0
    return ret;
2417
0
}
2418
#endif
2419
2420
#endif
2421
2422
#endif /* #ifndef DOXYGEN_SKIP */
2423
2424
#endif /* GDALSSE_PRIV_H_INCLUDED */