Coverage Report

Created: 2025-06-13 06:29

File: /src/gdal/gcore/rasterio_ssse3.cpp

Summary: every instrumented line of this file reports an execution count of 0, i.e. nothing in rasterio_ssse3.cpp was exercised by the run behind this report. Lines without a count (comments, blank lines, declarations, and code in preprocessor branches that were apparently not compiled in this configuration, such as the GCC/ICC-only variants) are not instrumented. The source listing follows.
/******************************************************************************
 *
 * Project:  GDAL Core
 * Purpose:  SSSE3 specializations
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#include "cpl_port.h"

#include <algorithm>

#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
     (defined(__x86_64) || defined(_M_X64))) ||                                \
    defined(USE_NEON_OPTIMIZATIONS)

#include "rasterio_ssse3.h"

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#else
#include <tmmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
                                      const GByte *CPL_RESTRICT pSrc,
                                      GPtrDiff_t nIters)
{
    decltype(nIters) i;
    const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                              -1, -1, 15, 12, 9, 6, 3, 0);
    const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
                                              2, -1, -1, -1, -1, -1, -1);
    const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
                                              -1, -1, -1, -1, -1, -1, -1);
    // If we were sure that there would always be 2 trailing bytes, we could
    // check against nIters - 15
    for (i = 0; i < nIters - 16; i += 16)
    {
        __m128i xmm0 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
        __m128i xmm1 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
        __m128i xmm2 =
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));

        // From LSB to MSB:
        // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
        xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
        // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
        xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
        // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
        // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
        xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
        xmm0 = _mm_or_si128(xmm0, xmm1);
        xmm0 = _mm_or_si128(xmm0, xmm2);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);

        pSrc += 3 * 16;
    }
    for (; i < nIters; i++)
    {
        pDest[i] = *pSrc;
        pSrc += 3;
    }
}
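
Note: GDALUnrolledCopy_GByte_3_1_SSSE3 extracts the first of three interleaved byte components: each iteration of the main loop reads 48 interleaved source bytes and writes 16 contiguous destination bytes. A hypothetical scalar reference for the same contract (the helper name is ours, not part of GDAL; illustrative only):

#include <cstddef>

// Hypothetical scalar reference: copy nIters bytes, taking every third
// byte from pSrc, like GDALUnrolledCopy_GByte_3_1_SSSE3 above.
static void UnrolledCopy_3_1_Reference(unsigned char *pDest,
                                       const unsigned char *pSrc,
                                       std::ptrdiff_t nIters)
{
    for (std::ptrdiff_t i = 0; i < nIters; ++i)
        pDest[i] = pSrc[3 * i];
}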

/************************************************************************/
/*                  GDALDeinterleave3Byte_SSSE3()                       */
/************************************************************************/

#if defined(__GNUC__) && !defined(__clang__)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#else
void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2, size_t nIters)
{
    size_t i = 0;
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
        auto xmm0_new =
            _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
                                                7, 4, 1, 9, 6, 3, 0));
        auto xmm1_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm1, xmm0, 12),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm2_new = _mm_shuffle_epi8(
            _mm_alignr_epi8(xmm2, xmm1, 8),
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
        auto xmm3_new =
            _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
                                                14, 11, 8, 5, 13, 10, 7, 4));

        __m128i xmm01lo =
            _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
        __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 -  -
        __m128i xmm23lo =
            _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
        __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE -  -
        xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
        xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
        xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
    }
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[3 * i + 0];
        pabyDest1[i] = pabySrc[3 * i + 1];
        pabyDest2[i] = pabySrc[3 * i + 2];
    }
}
#endif
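
Note: both branches above deinterleave pixel-interleaved RGB bytes into three band buffers; the SSSE3 branch builds each output register with _mm_shuffle_epi8 (a mask byte selects a source byte by index, and a mask byte with its high bit set, such as -1, yields 0) plus _mm_alignr_epi8 to stitch adjacent loads. A small standalone demonstration of that _mm_shuffle_epi8 behaviour, assuming an SSSE3-capable build (hypothetical demo program, illustrative only):

#include <tmmintrin.h>
#include <cstdio>

int main()
{
    // Source bytes 0..15; the mask picks bytes 0,3,6,9,12,15 (every third
    // byte, as in the deinterleave masks above) and zero-fills the rest.
    const __m128i src = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
    const __m128i mask = _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1,
                                       -1, -1, -1, -1, -1, -1, -1, -1);
    unsigned char out[16];
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
                     _mm_shuffle_epi8(src, mask));
    for (int i = 0; i < 16; ++i)
        printf("%d ", out[i]);  // prints: 0 3 6 9 12 15 0 0 0 0 0 0 0 0 0 0
    printf("\n");
    return 0;
}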

/************************************************************************/
/*                     GDALTranspose4x4Int32()                          */
/************************************************************************/

// Consider that the input registers for 4x4 words of size 4 bytes each,
// Return the transposition of this 4x4 matrix
// Considering that in0 = (in00, in01, in02, in03)
// Considering that in1 = (in10, in11, in12, in13)
// Considering that in2 = (in20, in21, in22, in23)
// Considering that in3 = (in30, in31, in32, in33)
// Return          out0 = (in00, in10, in20, in30)
// Return          out1 = (in01, in11, in21, in31)
// Return          out2 = (in02, in12, in22, in32)
// Return          out3 = (in03, in13, in23, in33)
inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
                                  __m128i in3, __m128i &out0, __m128i &out1,
                                  __m128i &out2, __m128i &out3)
{
    __m128i tmp0 = _mm_unpacklo_epi32(in0, in1);  // (in00, in10, in01, in11)
    __m128i tmp1 = _mm_unpackhi_epi32(in0, in1);  // (in02, in12, in03, in13)
    __m128i tmp2 = _mm_unpacklo_epi32(in2, in3);  // (in20, in30, in21, in31)
    __m128i tmp3 = _mm_unpackhi_epi32(in2, in3);  // (in22, in32, in23, in33)

    out0 = _mm_unpacklo_epi64(tmp0, tmp2);  // (in00, in10, in20, in30)
    out1 = _mm_unpackhi_epi64(tmp0, tmp2);  // (in01, in11, in21, in31)
    out2 = _mm_unpacklo_epi64(tmp1, tmp3);  // (in02, in12, in22, in32)
    out3 = _mm_unpackhi_epi64(tmp1, tmp3);  // (in03, in13, in23, in33)
}
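
Note: the unpacklo/unpackhi pairs above form the classic 4x4 transpose of 32-bit lanes. A hypothetical check of the documented contract, assuming it is compiled in the same translation unit as GDALTranspose4x4Int32 (the helper name is ours; illustrative only):

#include <cassert>
#include <cstdint>

static void CheckTranspose4x4Int32()
{
    const int32_t in[4][4] = {
        {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
    __m128i r[4], t[4];
    for (int i = 0; i < 4; ++i)
        r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in[i]));
    GDALTranspose4x4Int32(r[0], r[1], r[2], r[3], t[0], t[1], t[2], t[3]);
    int32_t out[4][4];
    for (int i = 0; i < 4; ++i)
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out[i]), t[i]);
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            assert(out[i][j] == in[j][i]);  // out is the transpose of in
}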

/************************************************************************/
/*                  GDALDeinterleave4Byte_SSSE3()                       */
/************************************************************************/

#if !defined(__GNUC__) || defined(__clang__)
void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2,
                                 GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
    size_t i = 0;
    for (; i + 15 < nIters; i += 16)
    {
        __m128i xmm0 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0));
        __m128i xmm1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16));
        __m128i xmm2 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32));
        __m128i xmm3 = _mm_loadu_si128(
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48));
        xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask);  // W0 W1 W2 W3
        xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask);  // W4 W5 W6 W7
        xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask);  // W8 W9 WA WB
        xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask);  // WC WD WE WF

        GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3);
    }
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        pabyDest0[i] = pabySrc[4 * i + 0];
        pabyDest1[i] = pabySrc[4 * i + 1];
        pabyDest2[i] = pabySrc[4 * i + 2];
        pabyDest3[i] = pabySrc[4 * i + 3];
    }
}
#endif
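
Note: because _mm_set_epi8 lists its arguments from the most significant byte down, shuffle_mask above reads, from byte 0 upward, 0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15: it regroups four interleaved RGBA pixels into RRRR GGGG BBBB AAAA inside one register, and GDALTranspose4x4Int32 then distributes those 32-bit groups across the four destination registers. A small standalone demonstration of that regrouping, assuming an SSSE3-capable build (hypothetical demo program, illustrative only):

#include <tmmintrin.h>
#include <cstdio>

int main()
{
    const unsigned char rgba[16] = {'R', 'G', 'B', 'A', 'r', 'g', 'b', 'a',
                                    'R', 'G', 'B', 'A', 'r', 'g', 'b', 'a'};
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
    __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(rgba));
    v = _mm_shuffle_epi8(v, shuffle_mask);
    unsigned char out[17] = {0};
    _mm_storeu_si128(reinterpret_cast<__m128i *>(out), v);
    printf("%s\n", out);  // prints: RrRrGgGgBbBbAaAa
    return 0;
}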

/************************************************************************/
/*                  GDALDeinterleave3UInt16_SSSE3()                     */
/************************************************************************/

#if (defined(__GNUC__) && !defined(__clang__)) ||                              \
    defined(__INTEL_CLANG_COMPILER)
#if !defined(__INTEL_CLANG_COMPILER)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize")))
#endif
void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
                                  GUInt16* CPL_RESTRICT panDest0,
                                  GUInt16* CPL_RESTRICT panDest1,
                                  GUInt16* CPL_RESTRICT panDest2,
                                  size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[3 * i + 0];
        panDest1[i] = panSrc[3 * i + 1];
        panDest2[i] = panSrc[3 * i + 2];
    }
}
#endif

/************************************************************************/
/*                  GDALDeinterleave4UInt16_SSSE3()                     */
/************************************************************************/

#if (defined(__GNUC__) && !defined(__clang__)) ||                              \
    defined(__INTEL_CLANG_COMPILER)
#if !defined(__INTEL_CLANG_COMPILER)
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize")))
#endif
void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc,
                                  GUInt16* CPL_RESTRICT panDest0,
                                  GUInt16* CPL_RESTRICT panDest1,
                                  GUInt16* CPL_RESTRICT panDest2,
                                  GUInt16* CPL_RESTRICT panDest3,
                                  size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
    {
        panDest0[i] = panSrc[4 * i + 0];
        panDest1[i] = panSrc[4 * i + 1];
        panDest2[i] = panSrc[4 * i + 2];
        panDest3[i] = panSrc[4 * i + 3];
    }
}
#endif

/************************************************************************/
/*                               loadu()                                */
/************************************************************************/

inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
{
    return _mm_loadu_si128(
        reinterpret_cast<const __m128i *>(pSrc + i * srcStride));
}

/************************************************************************/
/*                               storeu()                               */
/************************************************************************/

inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
{
    _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
}

/************************************************************************/
/*                      GDALInterleave3Byte_SSSE3()                     */
/************************************************************************/

#if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))

inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2)
{
    return _mm_or_si128(_mm_or_si128(r0, r1), r2);
}

// ICC autovectorizer doesn't do a good job at generating good SSE code,
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
#if defined(__GNUC__)
__attribute__((noinline))
#endif
static void
GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    size_t i = 0;
    constexpr size_t VALS_PER_ITER = 16;

    if (nIters >= VALS_PER_ITER)
    {
        // clang-format off
        constexpr char X = -1;
        // How to dispatch 16 values of row=0 onto 3x16 bytes
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X,
                                                    1, X, X,
                                                    2, X, X,
                                                    3, X, X,
                                                    4, X, X,
                                                    5);
        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X,
                                                    6, X, X,
                                                    7, X, X,
                                                    8, X, X,
                                                    9, X, X,
                                                    10,X);
        const __m128i xmm_shuffle02 = _mm_setr_epi8(       X,
                                                    11, X, X,
                                                    12, X, X,
                                                    13, X, X,
                                                    14, X, X,
                                                    15, X, X);

        // How to dispatch 16 values of row=1 onto 3x16 bytes
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X,
                                                    X, 1, X,
                                                    X, 2, X,
                                                    X, 3, X,
                                                    X, 4, X,
                                                    X);
        const __m128i xmm_shuffle11 = _mm_setr_epi8(   5, X,
                                                    X, 6, X,
                                                    X, 7, X,
                                                    X, 8, X,
                                                    X, 9, X,
                                                    X,10);
        const __m128i xmm_shuffle12 = _mm_setr_epi8(       X,
                                                    X, 11, X,
                                                    X, 12, X,
                                                    X, 13, X,
                                                    X, 14, X,
                                                    X, 15, X);

        // How to dispatch 16 values of row=2 onto 3x16 bytes
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0,
                                                    X, X, 1,
                                                    X, X, 2,
                                                    X, X, 3,
                                                    X, X, 4,
                                                    X);
        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 5,
                                                    X, X, 6,
                                                    X, X, 7,
                                                    X, X, 8,
                                                    X, X, 9,
                                                    X, X);
        const __m128i xmm_shuffle22 = _mm_setr_epi8(      10,
                                                    X, X, 11,
                                                    X, X, 12,
                                                    X, X, 13,
                                                    X, X, 14,
                                                    X, X, 15);
        // clang-format on

        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
        {
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
            LOAD(0);
            LOAD(1);
            LOAD(2);

#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
#define COMBINE_3(x)                                                           \
    GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2))

#define STORE(x)                                                               \
    storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x))
            STORE(0);
            STORE(1);
            STORE(2);
#undef LOAD
#undef COMBINE_3
#undef SHUFFLE
#undef STORE
        }
    }

    for (; i < nIters; ++i)
    {
#define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters]
        INTERLEAVE(0);
        INTERLEAVE(1);
        INTERLEAVE(2);
#undef INTERLEAVE
    }
}

#else

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("tree-vectorize")))
#endif
#if defined(__GNUC__)
__attribute__((noinline))
#endif
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif
static void
GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                          uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang loop vectorize(enable)
#endif
    for (size_t i = 0; i < nIters; ++i)
    {
        pDst[3 * i + 0] = pSrc[i + 0 * nIters];
        pDst[3 * i + 1] = pSrc[i + 1 * nIters];
        pDst[3 * i + 2] = pSrc[i + 2 * nIters];
    }
}
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
#pragma clang diagnostic pop
#endif

#endif
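
Note: both GDALInterleave3Byte_SSSE3 variants implement the same contract: interleave three band-sequential planes of nIters bytes each into pixel-interleaved output. In the shuffle-based variant, loadu(pSrc + i, x, nIters) reads 16 bytes of band x starting at element i, and storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, ...) lands at pDst + 3 * i + 16 * x, so the three stores of one iteration cover the 48 interleaved output bytes for 16 pixels. A hypothetical scalar statement of the contract, matching the #else branch (the helper name is ours; illustrative only):

#include <cstddef>

// Hypothetical scalar reference: planar (band-sequential) to pixel-interleaved.
static void Interleave3Byte_Reference(const unsigned char *pSrc,
                                      unsigned char *pDst, size_t nIters)
{
    for (size_t i = 0; i < nIters; ++i)
        for (size_t iBand = 0; iBand < 3; ++iBand)
            pDst[3 * i + iBand] = pSrc[i + iBand * nIters];
}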

/************************************************************************/
/*                      GDALInterleave5Byte_SSSE3()                     */
/************************************************************************/

inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
                                  __m128i r3, __m128i r4)
{
    return _mm_or_si128(
        _mm_or_si128(_mm_or_si128(r0, r1), _mm_or_si128(r2, r3)), r4);
}

static void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
{
    size_t i = 0;
    constexpr size_t VALS_PER_ITER = 16;

    if (nIters >= VALS_PER_ITER)
    {
        // clang-format off
        constexpr char X = -1;
        // How to dispatch 16 values of row=0 onto 5x16 bytes
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
                                                    1, X, X, X, X,
                                                    2, X, X, X, X,
                                                    3);
        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X, X, X,
                                                    4, X, X, X, X,
                                                    5, X, X, X, X,
                                                    6, X);
        const __m128i xmm_shuffle02 = _mm_setr_epi8(      X, X, X,
                                                    7, X, X, X, X,
                                                    8, X, X, X, X,
                                                    9, X, X);
        const __m128i xmm_shuffle03 = _mm_setr_epi8(          X, X,
                                                    10, X, X, X, X,
                                                    11, X, X, X, X,
                                                    12, X, X, X);
        const __m128i xmm_shuffle04 = _mm_setr_epi8(             X,
                                                    13, X, X, X, X,
                                                    14, X, X, X, X,
                                                    15, X, X, X, X);

        // How to dispatch 16 values of row=1 onto 5x16 bytes
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
                                                    X, 1, X, X, X,
                                                    X, 2, X, X, X,
                                                    X);
        const __m128i xmm_shuffle11 = _mm_setr_epi8(   3, X, X, X,
                                                    X, 4, X, X, X,
                                                    X, 5, X, X, X,
                                                    X, 6);
        const __m128i xmm_shuffle12 = _mm_setr_epi8(      X, X, X,
                                                    X, 7, X, X, X,
                                                    X, 8, X, X, X,
                                                    X, 9, X);
        const __m128i xmm_shuffle13 = _mm_setr_epi8(          X, X,
                                                    X, 10, X, X, X,
                                                    X, 11, X, X, X,
                                                    X, 12, X, X);
        const __m128i xmm_shuffle14 = _mm_setr_epi8(             X,
                                                    X, 13, X, X, X,
                                                    X, 14, X, X, X,
                                                    X, 15, X, X, X);

        // How to dispatch 16 values of row=2 onto 5x16 bytes
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
                                                    X, X, 1, X, X,
                                                    X, X, 2, X, X,
                                                    X);
        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 3, X, X,
                                                    X, X, 4, X, X,
                                                    X, X, 5, X, X,
                                                    X, X);
        const __m128i xmm_shuffle22 = _mm_setr_epi8(      6, X, X,
                                                    X, X, 7, X, X,
                                                    X, X, 8, X, X,
                                                    X, X, 9);
        const __m128i xmm_shuffle23 = _mm_setr_epi8(          X, X,
                                                    X, X, 10, X, X,
                                                    X, X, 11, X, X,
                                                    X, X, 12, X);
        const __m128i xmm_shuffle24 = _mm_setr_epi8(             X,
                                                    X, X, 13, X, X,
                                                    X, X, 14, X, X,
                                                    X, X, 15, X, X);

        // How to dispatch 16 values of row=3 onto 5x16 bytes
        const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
                                                    X, X, X, 1, X,
                                                    X, X, X, 2, X,
                                                    X);
        const __m128i xmm_shuffle31 = _mm_setr_epi8(   X, X, 3, X,
                                                    X, X, X, 4, X,
                                                    X, X, X, 5, X,
                                                    X, X);
        const __m128i xmm_shuffle32 = _mm_setr_epi8(      X, 6, X,
                                                    X, X, X, 7, X,
                                                    X, X, X, 8, X,
                                                    X, X, X);
        const __m128i xmm_shuffle33 = _mm_setr_epi8(          9, X,
                                                    X, X, X, 10, X,
                                                    X, X, X, 11, X,
                                                    X, X, X, 12);
        const __m128i xmm_shuffle34 = _mm_setr_epi8(             X,
                                                    X, X, X, 13, X,
                                                    X, X, X, 14, X,
                                                    X, X, X, 15, X);

        // How to dispatch 16 values of row=4 onto 5x16 bytes
        const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
                                                    X, X, X, X, 1,
                                                    X, X, X, X, 2,
                                                    X);
        const __m128i xmm_shuffle41 = _mm_setr_epi8(   X, X, X, 3,
                                                    X, X, X, X, 4,
                                                    X, X, X, X, 5,
                                                    X, X);
        const __m128i xmm_shuffle42 = _mm_setr_epi8(      X, X, 6,
                                                    X, X, X, X, 7,
                                                    X, X, X, X, 8,
                                                    X, X, X);
        const __m128i xmm_shuffle43 = _mm_setr_epi8(         X,  9,
                                                    X, X, X, X, 10,
                                                    X, X, X, X, 11,
                                                    X, X, X, X);
        const __m128i xmm_shuffle44 = _mm_setr_epi8(            12,
                                                    X, X, X, X, 13,
                                                    X, X, X, X, 14,
                                                    X, X, X, X, 15);
        // clang-format on

        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
        {
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
            LOAD(0);
            LOAD(1);
            LOAD(2);
            LOAD(3);
            LOAD(4);

#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
#define COMBINE_5(x)                                                           \
    GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2),            \
                       SHUFFLE(x, 3), SHUFFLE(x, 4))

#define STORE(x)                                                               \
    storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
            STORE(0);
            STORE(1);
            STORE(2);
            STORE(3);
            STORE(4);
#undef LOAD
#undef COMBINE_5
#undef SHUFFLE
#undef STORE
        }
    }

    for (; i < nIters; ++i)
    {
#define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
        INTERLEAVE(0);
        INTERLEAVE(1);
        INTERLEAVE(2);
        INTERLEAVE(3);
        INTERLEAVE(4);
#undef INTERLEAVE
    }
}

/************************************************************************/
/*                      GDALTranspose2D_Byte_SSSE3()                    */
/************************************************************************/

// Given r = (b00, b01, b02, b03,
//            b10, b11, b12, b13,
//            b20, b21, b22, b23,
//            b30, b31, b32, b33)
// Return    (b00, b10, b20, b30,
//            b01, b11, b21, b31,
//            b02, b12, b22, b32,
//            b03, b13, b22, b33)
inline void GDALReorderForTranspose4x4(__m128i &r)
{
    const __m128i shuffle_mask =
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);

    r = _mm_shuffle_epi8(r, shuffle_mask);
}

// Transpose the 16x16 byte values contained in the 16 SSE registers
inline void GDALTranspose16x16ByteBlock_SSSE3(
    __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
    __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
    __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
    __m128i &r15)
{
    __m128i tmp00, tmp01, tmp02, tmp03;
    __m128i tmp10, tmp11, tmp12, tmp13;
    __m128i tmp20, tmp21, tmp22, tmp23;
    __m128i tmp30, tmp31, tmp32, tmp33;

    GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);
    GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
    GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
    GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);

    GDALReorderForTranspose4x4(tmp00);
    GDALReorderForTranspose4x4(tmp01);
    GDALReorderForTranspose4x4(tmp02);
    GDALReorderForTranspose4x4(tmp03);
    GDALReorderForTranspose4x4(tmp10);
    GDALReorderForTranspose4x4(tmp11);
    GDALReorderForTranspose4x4(tmp12);
    GDALReorderForTranspose4x4(tmp13);
    GDALReorderForTranspose4x4(tmp20);
    GDALReorderForTranspose4x4(tmp21);
    GDALReorderForTranspose4x4(tmp22);
    GDALReorderForTranspose4x4(tmp23);
    GDALReorderForTranspose4x4(tmp30);
    GDALReorderForTranspose4x4(tmp31);
    GDALReorderForTranspose4x4(tmp32);
    GDALReorderForTranspose4x4(tmp33);

    GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);
    GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
    GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
    GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
}
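
Note: the 16x16 block transpose above proceeds in three stages: a 4x4 transpose of 32-bit lanes within each group of four registers (after which each register holds one 4x4-byte sub-block), a byte-level 4x4 transpose inside every register (GDALReorderForTranspose4x4), and a second 4x4 lane transpose that reassembles the sub-blocks into output rows. A hypothetical brute-force check against the scalar definition of a transpose, assuming it is compiled in the same translation unit as the functions above (the helper name is ours; illustrative only):

#include <cassert>
#include <cstdint>

static void CheckTranspose16x16()
{
    uint8_t in[16][16], out[16][16];
    for (int i = 0; i < 16; ++i)
        for (int j = 0; j < 16; ++j)
            in[i][j] = static_cast<uint8_t>(16 * i + j);
    __m128i r[16];
    for (int i = 0; i < 16; ++i)
        r[i] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in[i]));
    GDALTranspose16x16ByteBlock_SSSE3(r[0], r[1], r[2], r[3], r[4], r[5], r[6],
                                      r[7], r[8], r[9], r[10], r[11], r[12],
                                      r[13], r[14], r[15]);
    for (int i = 0; i < 16; ++i)
        _mm_storeu_si128(reinterpret_cast<__m128i *>(out[i]), r[i]);
    for (int i = 0; i < 16; ++i)
        for (int j = 0; j < 16; ++j)
            assert(out[i][j] == in[j][i]);  // rows of out are columns of in
}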

inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                           uint8_t *CPL_RESTRICT pDst,
                                           size_t srcStride, size_t dstStride)
{
#define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
    LOAD(0);
    LOAD(1);
    LOAD(2);
    LOAD(3);
    LOAD(4);
    LOAD(5);
    LOAD(6);
    LOAD(7);
    LOAD(8);
    LOAD(9);
    LOAD(10);
    LOAD(11);
    LOAD(12);
    LOAD(13);
    LOAD(14);
    LOAD(15);
#undef LOAD

    GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
                                      r10, r11, r12, r13, r14, r15);

#define STORE(x) storeu(pDst, x, dstStride, r##x)
    STORE(0);
    STORE(1);
    STORE(2);
    STORE(3);
    STORE(4);
    STORE(5);
    STORE(6);
    STORE(7);
    STORE(8);
    STORE(9);
    STORE(10);
    STORE(11);
    STORE(12);
    STORE(13);
    STORE(14);
    STORE(15);
#undef STORE
}

void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
                                size_t nSrcHeight)
{
    if (nSrcHeight == 3)
    {
        GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
    }
    else if (nSrcHeight == 5)
    {
        GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
    }
    else
    {
        constexpr size_t blocksize = 16;
        for (size_t i = 0; i < nSrcHeight; i += blocksize)
        {
            const size_t max_k = std::min(i + blocksize, nSrcHeight);
            for (size_t j = 0; j < nSrcWidth; j += blocksize)
            {
                // transpose the block beginning at [i,j]
                const size_t max_l = std::min(j + blocksize, nSrcWidth);
                if (max_k - i == blocksize && max_l - j == blocksize)
                {
                    GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
                                                   &pDst[i + j * nSrcHeight],
                                                   nSrcWidth, nSrcHeight);
                }
                else
                {
                    for (size_t k = i; k < max_k; ++k)
                    {
                        for (size_t l = j; l < max_l; ++l)
                        {
                            GDALCopyWord(pSrc[l + k * nSrcWidth],
                                         pDst[k + l * nSrcHeight]);
                        }
                    }
                }
            }
        }
    }
}

#endif  // HAVE_SSSE3_AT_COMPILE_TIME
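
Note: GDALTranspose2D_Byte_SSSE3 transposes an nSrcHeight x nSrcWidth byte matrix into an nSrcWidth x nSrcHeight one; the nSrcHeight == 3 and nSrcHeight == 5 shortcuts reuse the interleave routines, since transposing a planar image with that many rows is exactly band interleaving, and the generic path works on 16x16 blocks with a scalar GDALCopyWord fallback for the edges. A hypothetical usage sketch with a scalar cross-check, assuming the function is linked in and SSSE3 is available at runtime (the declaration is restated here for self-containment, and the helper name is ours; illustrative only):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

void GDALTranspose2D_Byte_SSSE3(const uint8_t *pSrc, uint8_t *pDst,
                                size_t nSrcWidth, size_t nSrcHeight);

static void CheckTranspose2D(size_t nWidth, size_t nHeight)
{
    std::vector<uint8_t> src(nWidth * nHeight), dst(nWidth * nHeight);
    for (size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<uint8_t>(i);
    GDALTranspose2D_Byte_SSSE3(src.data(), dst.data(), nWidth, nHeight);
    for (size_t y = 0; y < nHeight; ++y)     // y indexes source rows
        for (size_t x = 0; x < nWidth; ++x)  // x indexes source columns
            assert(dst[y + x * nHeight] == src[x + y * nWidth]);
}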