Coverage Report

Created: 2026-02-14 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/gdal/gcore/rasterio_ssse3.cpp
Line
Count
Source
1
/******************************************************************************
2
 *
3
 * Project:  GDAL Core
4
 * Purpose:  SSSE3 specializations
5
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
6
 *
7
 ******************************************************************************
8
 * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com>
9
 *
10
 * SPDX-License-Identifier: MIT
11
 ****************************************************************************/
12
13
#include "cpl_port.h"
14
15
#include <algorithm>
16
17
#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) &&                                    \
18
     (defined(__x86_64) || defined(_M_X64))) ||                                \
19
    defined(USE_NEON_OPTIMIZATIONS)
20
21
#include "rasterio_ssse3.h"
22
23
#ifdef USE_NEON_OPTIMIZATIONS
24
#include "include_sse2neon.h"
25
#else
26
#include <tmmintrin.h>
27
#endif
28
29
#include "gdal_priv_templates.hpp"
30
31
void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
32
                                      const GByte *CPL_RESTRICT pSrc,
33
                                      GPtrDiff_t nIters)
34
0
{
35
0
    decltype(nIters) i;
36
0
    const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
37
0
                                              -1, -1, 15, 12, 9, 6, 3, 0);
38
0
    const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5,
39
0
                                              2, -1, -1, -1, -1, -1, -1);
40
0
    const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1,
41
0
                                              -1, -1, -1, -1, -1, -1, -1);
42
    // If we were sure that there would always be 2 trailing bytes, we could
43
    // check against nIters - 15
44
0
    for (i = 0; i < nIters - 16; i += 16)
45
0
    {
46
0
        __m128i xmm0 =
47
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0));
48
0
        __m128i xmm1 =
49
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16));
50
0
        __m128i xmm2 =
51
0
            _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32));
52
53
        // From LSB to MSB:
54
        // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0
55
0
        xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0);
56
        // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0
57
0
        xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1);
58
        // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x -->
59
        // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15
60
0
        xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2);
61
0
        xmm0 = _mm_or_si128(xmm0, xmm1);
62
0
        xmm0 = _mm_or_si128(xmm0, xmm2);
63
64
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0);
65
66
0
        pSrc += 3 * 16;
67
0
    }
68
0
    for (; i < nIters; i++)
69
0
    {
70
0
        pDest[i] = *pSrc;
71
0
        pSrc += 3;
72
0
    }
73
0
}
74
75
/************************************************************************/
76
/*                    GDALDeinterleave3Byte_SSSE3()                     */
77
/************************************************************************/
78
79
#if defined(__GNUC__) && !defined(__clang__)
80
// GCC autovectorizer does an excellent job
81
__attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3(
82
    const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0,
83
    GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters)
84
{
85
    for (size_t i = 0; i < nIters; ++i)
86
    {
87
        pabyDest0[i] = pabySrc[3 * i + 0];
88
        pabyDest1[i] = pabySrc[3 * i + 1];
89
        pabyDest2[i] = pabySrc[3 * i + 2];
90
    }
91
}
92
#else
93
void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
94
                                 GByte *CPL_RESTRICT pabyDest0,
95
                                 GByte *CPL_RESTRICT pabyDest1,
96
                                 GByte *CPL_RESTRICT pabyDest2, size_t nIters)
97
0
{
98
0
    size_t i = 0;
99
0
    for (; i + 15 < nIters; i += 16)
100
0
    {
101
0
        __m128i xmm0 = _mm_loadu_si128(
102
0
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0));
103
0
        __m128i xmm1 = _mm_loadu_si128(
104
0
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16));
105
0
        __m128i xmm2 = _mm_loadu_si128(
106
0
            reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32));
107
0
        auto xmm0_new =
108
0
            _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10,
109
0
                                                7, 4, 1, 9, 6, 3, 0));
110
0
        auto xmm1_new = _mm_shuffle_epi8(
111
0
            _mm_alignr_epi8(xmm1, xmm0, 12),
112
0
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
113
0
        auto xmm2_new = _mm_shuffle_epi8(
114
0
            _mm_alignr_epi8(xmm2, xmm1, 8),
115
0
            _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0));
116
0
        auto xmm3_new =
117
0
            _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6,
118
0
                                                14, 11, 8, 5, 13, 10, 7, 4));
119
120
0
        __m128i xmm01lo =
121
0
            _mm_unpacklo_epi32(xmm0_new, xmm1_new);  // W0 W4 W1 W5
122
0
        __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new);  // W2 W6 -  -
123
0
        __m128i xmm23lo =
124
0
            _mm_unpacklo_epi32(xmm2_new, xmm3_new);  // W8 WC W9 WD
125
0
        __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new);  // WA WE -  -
126
0
        xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo);  // W0 W4 W8 WC
127
0
        xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo);  // W1 W5 W9 WD
128
0
        xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi);  // W2 W6 WA WE
129
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new);
130
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new);
131
0
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new);
132
0
    }
133
0
#if defined(__clang__)
134
0
#pragma clang loop vectorize(disable)
135
0
#endif
136
0
    for (; i < nIters; ++i)
137
0
    {
138
0
        pabyDest0[i] = pabySrc[3 * i + 0];
139
0
        pabyDest1[i] = pabySrc[3 * i + 1];
140
0
        pabyDest2[i] = pabySrc[3 * i + 2];
141
0
    }
142
0
}
143
#endif
144
145
/************************************************************************/
146
/*                       GDALTranspose4x4Int32()                        */
147
/************************************************************************/
148
149
// Consider that the input registers for 4x4 words of size 4 bytes each,
150
// Return the transposition of this 4x4 matrix
151
// Considering that in0 = (in00, in01, in02, in03)
152
// Considering that in1 = (in10, in11, in12, in13)
153
// Considering that in2 = (in20, in21, in22, in23)
154
// Considering that in3 = (in30, in31, in32, in33)
155
// Return          out0 = (in00, in10, in20, in30)
156
// Return          out1 = (in01, in11, in21, in31)
157
// Return          out2 = (in02, in12, in22, in32)
158
// Return          out3 = (in03, in13, in23, in33)
159
inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2,
160
                                  __m128i in3, __m128i &out0, __m128i &out1,
161
                                  __m128i &out2, __m128i &out3)
162
0
{
163
0
    __m128i tmp0 = _mm_unpacklo_epi32(in0, in1);  // (in00, in10, in01, in11)
164
0
    __m128i tmp1 = _mm_unpackhi_epi32(in0, in1);  // (in02, in12, in03, in13)
165
0
    __m128i tmp2 = _mm_unpacklo_epi32(in2, in3);  // (in20, in30, in21, in31)
166
0
    __m128i tmp3 = _mm_unpackhi_epi32(in2, in3);  // (in22, in32, in23, in33)
167
168
0
    out0 = _mm_unpacklo_epi64(tmp0, tmp2);  // (in00, in10, in20, in30)
169
0
    out1 = _mm_unpackhi_epi64(tmp0, tmp2);  // (in01, in11, in21, in31)
170
0
    out2 = _mm_unpacklo_epi64(tmp1, tmp3);  // (in02, in12, in22, in32)
171
0
    out3 = _mm_unpackhi_epi64(tmp1, tmp3);  // (in03, in13, in23, in33)
172
0
}
173
174
/************************************************************************/
175
/*                    GDALDeinterleave4Byte_SSSE3()                     */
176
/************************************************************************/
177
178
#if !defined(__GNUC__) || defined(__clang__)
// Split a buffer of 4-byte interleaved values into four separate buffers:
// pabyDestK[i] = pabySrc[4 * i + K] for K in {0, 1, 2, 3}.
// Values are processed 16 at a time with SSSE3 shuffles followed by a 4x4
// transposition of 32-bit words; the remainder uses a scalar loop.
void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc,
                                 GByte *CPL_RESTRICT pabyDest0,
                                 GByte *CPL_RESTRICT pabyDest1,
                                 GByte *CPL_RESTRICT pabyDest2,
                                 GByte *CPL_RESTRICT pabyDest3, size_t nIters)
{
    // Listed from the lowest to the highest output byte: gathers, for the 4
    // quadruplets of a register, the 4 bytes of each component into one
    // 32-bit word.
    const __m128i maskGroupByComponent = _mm_setr_epi8(
        0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);

    size_t i = 0;
    for (; i + 15 < nIters; i += 16)
    {
        const __m128i *pIn =
            reinterpret_cast<__m128i const *>(pabySrc + 4 * i);
        const __m128i grp0 = _mm_shuffle_epi8(_mm_loadu_si128(pIn + 0),
                                              maskGroupByComponent);  // W0 W1 W2 W3
        const __m128i grp1 = _mm_shuffle_epi8(_mm_loadu_si128(pIn + 1),
                                              maskGroupByComponent);  // W4 W5 W6 W7
        const __m128i grp2 = _mm_shuffle_epi8(_mm_loadu_si128(pIn + 2),
                                              maskGroupByComponent);  // W8 W9 WA WB
        const __m128i grp3 = _mm_shuffle_epi8(_mm_loadu_si128(pIn + 3),
                                              maskGroupByComponent);  // WC WD WE WF

        // After transposition, compK holds component K of the 16 values
        __m128i comp0, comp1, comp2, comp3;
        GDALTranspose4x4Int32(grp0, grp1, grp2, grp3, comp0, comp1, comp2,
                              comp3);

        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), comp0);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), comp1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), comp2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), comp3);
    }
    // Scalar loop for the remaining (at most 15) values
#if defined(__clang__)
#pragma clang loop vectorize(disable)
#endif
    for (; i < nIters; ++i)
    {
        const size_t iSrc = 4 * i;
        pabyDest0[i] = pabySrc[iSrc + 0];
        pabyDest1[i] = pabySrc[iSrc + 1];
        pabyDest2[i] = pabySrc[iSrc + 2];
        pabyDest3[i] = pabySrc[iSrc + 3];
    }
}
#endif
222
223
/************************************************************************/
224
/*                   GDALDeinterleave3UInt16_SSSE3()                    */
225
/************************************************************************/
226
227
#if (defined(__GNUC__) && !defined(__clang__)) ||                              \
228
    defined(__INTEL_CLANG_COMPILER)
229
#if !defined(__INTEL_CLANG_COMPILER)
230
// GCC autovectorizer does an excellent job
231
__attribute__((optimize("tree-vectorize")))
232
#endif
233
void GDALDeinterleave3UInt16_SSSE3(const GUInt16 *CPL_RESTRICT panSrc,
234
                                   GUInt16 *CPL_RESTRICT panDest0,
235
                                   GUInt16 *CPL_RESTRICT panDest1,
236
                                   GUInt16 *CPL_RESTRICT panDest2,
237
                                   size_t nIters)
238
{
239
    for (size_t i = 0; i < nIters; ++i)
240
    {
241
        panDest0[i] = panSrc[3 * i + 0];
242
        panDest1[i] = panSrc[3 * i + 1];
243
        panDest2[i] = panSrc[3 * i + 2];
244
    }
245
}
246
#endif
247
248
/************************************************************************/
249
/*                   GDALDeinterleave4UInt16_SSSE3()                    */
250
/************************************************************************/
251
252
#if (defined(__GNUC__) && !defined(__clang__)) ||                              \
253
    defined(__INTEL_CLANG_COMPILER)
254
#if !defined(__INTEL_CLANG_COMPILER)
255
// GCC autovectorizer does an excellent job
256
__attribute__((optimize("tree-vectorize")))
257
#endif
258
void GDALDeinterleave4UInt16_SSSE3(const GUInt16 *CPL_RESTRICT panSrc,
259
                                   GUInt16 *CPL_RESTRICT panDest0,
260
                                   GUInt16 *CPL_RESTRICT panDest1,
261
                                   GUInt16 *CPL_RESTRICT panDest2,
262
                                   GUInt16 *CPL_RESTRICT panDest3,
263
                                   size_t nIters)
264
{
265
    for (size_t i = 0; i < nIters; ++i)
266
    {
267
        panDest0[i] = panSrc[4 * i + 0];
268
        panDest1[i] = panSrc[4 * i + 1];
269
        panDest2[i] = panSrc[4 * i + 2];
270
        panDest3[i] = panSrc[4 * i + 3];
271
    }
272
}
273
#endif
274
275
/************************************************************************/
276
/*                               loadu()                                */
277
/************************************************************************/
278
279
// Unaligned load of the 16 bytes located at pSrc + i * srcStride.
inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride)
{
    const uint8_t *const pFrom = pSrc + i * srcStride;
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(pFrom));
}
284
285
/************************************************************************/
286
/*                               storeu()                               */
287
/************************************************************************/
288
289
inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg)
290
0
{
291
0
    _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg);
292
0
}
293
294
/************************************************************************/
295
/*                     GDALInterleave3Byte_SSSE3()                      */
296
/************************************************************************/
297
298
#if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER))
299
300
inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2)
301
{
302
    return _mm_or_si128(_mm_or_si128(r0, r1), r2);
303
}
304
305
// ICC autovectorizer doesn't do a good job at generating good SSE code,
306
// at least with icx 2024.0.2.20231213, but it nicely unrolls the below loop.
307
#if defined(__GNUC__)
308
__attribute__((noinline))
309
#endif
310
static void GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
311
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
312
{
313
    size_t i = 0;
314
    constexpr size_t VALS_PER_ITER = 16;
315
316
    if (nIters >= VALS_PER_ITER)
317
    {
318
        // clang-format off
319
        constexpr char X = -1;
320
        // How to dispatch 16 values of row=0 onto 3x16 bytes
321
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X,
322
                                                    1, X, X,
323
                                                    2, X, X,
324
                                                    3, X, X,
325
                                                    4, X, X,
326
                                                    5);
327
        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X,
328
                                                    6, X, X,
329
                                                    7, X, X,
330
                                                    8, X, X,
331
                                                    9, X, X,
332
                                                    10,X);
333
        const __m128i xmm_shuffle02 = _mm_setr_epi8(       X,
334
                                                    11, X, X,
335
                                                    12, X, X,
336
                                                    13, X, X,
337
                                                    14, X, X,
338
                                                    15, X, X);
339
340
        // How to dispatch 16 values of row=1 onto 3x16 bytes
341
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X,
342
                                                    X, 1, X,
343
                                                    X, 2, X,
344
                                                    X, 3, X,
345
                                                    X, 4, X,
346
                                                    X);
347
        const __m128i xmm_shuffle11 = _mm_setr_epi8(   5, X,
348
                                                    X, 6, X,
349
                                                    X, 7, X,
350
                                                    X, 8, X,
351
                                                    X, 9, X,
352
                                                    X,10);
353
        const __m128i xmm_shuffle12 = _mm_setr_epi8(       X,
354
                                                    X, 11, X,
355
                                                    X, 12, X,
356
                                                    X, 13, X,
357
                                                    X, 14, X,
358
                                                    X, 15, X);
359
360
        // How to dispatch 16 values of row=2 onto 3x16 bytes
361
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0,
362
                                                    X, X, 1,
363
                                                    X, X, 2,
364
                                                    X, X, 3,
365
                                                    X, X, 4,
366
                                                    X);
367
        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 5,
368
                                                    X, X, 6,
369
                                                    X, X, 7,
370
                                                    X, X, 8,
371
                                                    X, X, 9,
372
                                                    X, X);
373
        const __m128i xmm_shuffle22 = _mm_setr_epi8(      10,
374
                                                    X, X, 11,
375
                                                    X, X, 12,
376
                                                    X, X, 13,
377
                                                    X, X, 14,
378
                                                    X, X, 15);
379
        // clang-format on
380
381
        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
382
        {
383
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
384
            LOAD(0);
385
            LOAD(1);
386
            LOAD(2);
387
388
#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
389
#define COMBINE_3(x)                                                           \
390
    GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2))
391
392
#define STORE(x)                                                               \
393
    storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x))
394
            STORE(0);
395
            STORE(1);
396
            STORE(2);
397
#undef LOAD
398
#undef COMBINE_3
399
#undef SHUFFLE
400
#undef STORE
401
        }
402
    }
403
404
    for (; i < nIters; ++i)
405
    {
406
#define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters]
407
        INTERLEAVE(0);
408
        INTERLEAVE(1);
409
        INTERLEAVE(2);
410
#undef INTERLEAVE
411
    }
412
}
413
414
#else
415
416
#if defined(__GNUC__) && !defined(__clang__)
417
__attribute__((optimize("tree-vectorize")))
418
#endif
419
#if defined(__GNUC__)
420
__attribute__((noinline))
421
#endif
422
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
423
// clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning
424
#pragma clang diagnostic push
425
#pragma clang diagnostic ignored "-Wpass-failed"
426
#endif
427
static void GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
428
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
429
0
{
430
0
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
431
0
#pragma clang loop vectorize(enable)
432
0
#endif
433
0
    for (size_t i = 0; i < nIters; ++i)
434
0
    {
435
0
        pDst[3 * i + 0] = pSrc[i + 0 * nIters];
436
0
        pDst[3 * i + 1] = pSrc[i + 1 * nIters];
437
0
        pDst[3 * i + 2] = pSrc[i + 2 * nIters];
438
0
    }
439
0
}
440
#if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER)
441
#pragma clang diagnostic pop
442
#endif
443
444
#endif
445
446
/************************************************************************/
447
/*                     GDALInterleave5Byte_SSSE3()                      */
448
/************************************************************************/
449
450
// Bitwise OR of five registers.
inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2,
                                  __m128i r3, __m128i r4)
{
    const __m128i r01 = _mm_or_si128(r0, r1);
    const __m128i r23 = _mm_or_si128(r2, r3);
    const __m128i r0123 = _mm_or_si128(r01, r23);
    return _mm_or_si128(r0123, r4);
}
456
457
static void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
458
                                      uint8_t *CPL_RESTRICT pDst, size_t nIters)
459
0
{
460
0
    size_t i = 0;
461
0
    constexpr size_t VALS_PER_ITER = 16;
462
463
0
    if (nIters >= VALS_PER_ITER)
464
0
    {
465
        // clang-format off
466
0
        constexpr char X = -1;
467
        // How to dispatch 16 values of row=0 onto 5x16 bytes
468
0
        const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X,
469
0
                                                    1, X, X, X, X,
470
0
                                                    2, X, X, X, X,
471
0
                                                    3);
472
0
        const __m128i xmm_shuffle01 = _mm_setr_epi8(   X, X, X, X,
473
0
                                                    4, X, X, X, X,
474
0
                                                    5, X, X, X, X,
475
0
                                                    6, X);
476
0
        const __m128i xmm_shuffle02 = _mm_setr_epi8(      X, X, X,
477
0
                                                    7, X, X, X, X,
478
0
                                                    8, X, X, X, X,
479
0
                                                    9, X, X);
480
0
        const __m128i xmm_shuffle03 = _mm_setr_epi8(          X, X,
481
0
                                                    10, X, X, X, X,
482
0
                                                    11, X, X, X, X,
483
0
                                                    12, X, X, X);
484
0
        const __m128i xmm_shuffle04 = _mm_setr_epi8(             X,
485
0
                                                    13, X, X, X, X,
486
0
                                                    14, X, X, X, X,
487
0
                                                    15, X, X, X, X);
488
489
        // How to dispatch 16 values of row=1 onto 5x16 bytes
490
0
        const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X,
491
0
                                                    X, 1, X, X, X,
492
0
                                                    X, 2, X, X, X,
493
0
                                                    X);
494
0
        const __m128i xmm_shuffle11 = _mm_setr_epi8(   3, X, X, X,
495
0
                                                    X, 4, X, X, X,
496
0
                                                    X, 5, X, X, X,
497
0
                                                    X, 6);
498
0
        const __m128i xmm_shuffle12 = _mm_setr_epi8(      X, X, X,
499
0
                                                    X, 7, X, X, X,
500
0
                                                    X, 8, X, X, X,
501
0
                                                    X, 9, X);
502
0
        const __m128i xmm_shuffle13 = _mm_setr_epi8(          X, X,
503
0
                                                    X, 10, X, X, X,
504
0
                                                    X, 11, X, X, X,
505
0
                                                    X, 12, X, X);
506
0
        const __m128i xmm_shuffle14 = _mm_setr_epi8(             X,
507
0
                                                    X, 13, X, X, X,
508
0
                                                    X, 14, X, X, X,
509
0
                                                    X, 15, X, X, X);
510
511
        // How to dispatch 16 values of row=2 onto 5x16 bytes
512
0
        const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X,
513
0
                                                    X, X, 1, X, X,
514
0
                                                    X, X, 2, X, X,
515
0
                                                    X);
516
0
        const __m128i xmm_shuffle21 = _mm_setr_epi8(   X, 3, X, X,
517
0
                                                    X, X, 4, X, X,
518
0
                                                    X, X, 5, X, X,
519
0
                                                    X, X);
520
0
        const __m128i xmm_shuffle22 = _mm_setr_epi8(      6, X, X,
521
0
                                                    X, X, 7, X, X,
522
0
                                                    X, X, 8, X, X,
523
0
                                                    X, X, 9);
524
0
        const __m128i xmm_shuffle23 = _mm_setr_epi8(          X, X,
525
0
                                                    X, X, 10, X, X,
526
0
                                                    X, X, 11, X, X,
527
0
                                                    X, X, 12, X);
528
0
        const __m128i xmm_shuffle24 = _mm_setr_epi8(             X,
529
0
                                                    X, X, 13, X, X,
530
0
                                                    X, X, 14, X, X,
531
0
                                                    X, X, 15, X, X);
532
533
        // How to dispatch 16 values of row=3 onto 5x16 bytes
534
0
        const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X,
535
0
                                                    X, X, X, 1, X,
536
0
                                                    X, X, X, 2, X,
537
0
                                                    X);
538
0
        const __m128i xmm_shuffle31 = _mm_setr_epi8(   X, X, 3, X,
539
0
                                                    X, X, X, 4, X,
540
0
                                                    X, X, X, 5, X,
541
0
                                                    X, X);
542
0
        const __m128i xmm_shuffle32 = _mm_setr_epi8(      X, 6, X,
543
0
                                                    X, X, X, 7, X,
544
0
                                                    X, X, X, 8, X,
545
0
                                                    X, X, X);
546
0
        const __m128i xmm_shuffle33 = _mm_setr_epi8(          9, X,
547
0
                                                    X, X, X, 10, X,
548
0
                                                    X, X, X, 11, X,
549
0
                                                    X, X, X, 12);
550
0
        const __m128i xmm_shuffle34 = _mm_setr_epi8(             X,
551
0
                                                    X, X, X, 13, X,
552
0
                                                    X, X, X, 14, X,
553
0
                                                    X, X, X, 15, X);
554
555
        // How to dispatch 16 values of row=4 onto 5x16 bytes
556
0
        const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0,
557
0
                                                    X, X, X, X, 1,
558
0
                                                    X, X, X, X, 2,
559
0
                                                    X);
560
0
        const __m128i xmm_shuffle41 = _mm_setr_epi8(   X, X, X, 3,
561
0
                                                    X, X, X, X, 4,
562
0
                                                    X, X, X, X, 5,
563
0
                                                    X, X);
564
0
        const __m128i xmm_shuffle42 = _mm_setr_epi8(      X, X, 6,
565
0
                                                    X, X, X, X, 7,
566
0
                                                    X, X, X, X, 8,
567
0
                                                    X, X, X);
568
0
        const __m128i xmm_shuffle43 = _mm_setr_epi8(         X,  9,
569
0
                                                    X, X, X, X, 10,
570
0
                                                    X, X, X, X, 11,
571
0
                                                    X, X, X, X);
572
0
        const __m128i xmm_shuffle44 = _mm_setr_epi8(            12,
573
0
                                                    X, X, X, X, 13,
574
0
                                                    X, X, X, X, 14,
575
0
                                                    X, X, X, X, 15);
576
        // clang-format on
577
578
0
        for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER)
579
0
        {
580
0
#define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters)
581
0
            LOAD(0);
582
0
            LOAD(1);
583
0
            LOAD(2);
584
0
            LOAD(3);
585
0
            LOAD(4);
586
587
0
#define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x)
588
0
#define COMBINE_5(x)                                                           \
589
0
    GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2),            \
590
0
                       SHUFFLE(x, 3), SHUFFLE(x, 4))
591
592
0
#define STORE(x)                                                               \
593
0
    storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x))
594
0
            STORE(0);
595
0
            STORE(1);
596
0
            STORE(2);
597
0
            STORE(3);
598
0
            STORE(4);
599
0
#undef LOAD
600
0
#undef COMBINE_5
601
0
#undef SHUFFLE
602
0
#undef STORE
603
0
        }
604
0
    }
605
606
0
    for (; i < nIters; ++i)
607
0
    {
608
0
#define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters]
609
0
        INTERLEAVE(0);
610
0
        INTERLEAVE(1);
611
0
        INTERLEAVE(2);
612
0
        INTERLEAVE(3);
613
0
        INTERLEAVE(4);
614
0
#undef INTERLEAVE
615
0
    }
616
0
}
617
618
/************************************************************************/
619
/*                     GDALTranspose2D_Byte_SSSE3()                     */
620
/************************************************************************/
621
622
// Given r = (b00, b01, b02, b03,
623
//            b10, b11, b12, b13,
624
//            b20, b21, b22, b23,
625
//            b30, b31, b32, b33)
626
// Return    (b00, b10, b20, b30,
627
//            b01, b11, b21, b31,
628
//            b02, b12, b22, b32,
629
//            b03, b13, b23, b33)
630
inline void GDALReorderForTranspose4x4(__m128i &r)
631
0
{
632
0
    const __m128i shuffle_mask =
633
0
        _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
634
635
0
    r = _mm_shuffle_epi8(r, shuffle_mask);
636
0
}
637
638
// Transpose the 16x16 byte values contained in the 16 SSE registers
639
inline void GDALTranspose16x16ByteBlock_SSSE3(
640
    __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04,
641
    __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09,
642
    __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14,
643
    __m128i &r15)
644
0
{
645
0
    __m128i tmp00, tmp01, tmp02, tmp03;
646
0
    __m128i tmp10, tmp11, tmp12, tmp13;
647
0
    __m128i tmp20, tmp21, tmp22, tmp23;
648
0
    __m128i tmp30, tmp31, tmp32, tmp33;
649
650
0
    GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03);
651
0
    GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13);
652
0
    GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23);
653
0
    GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33);
654
655
0
    GDALReorderForTranspose4x4(tmp00);
656
0
    GDALReorderForTranspose4x4(tmp01);
657
0
    GDALReorderForTranspose4x4(tmp02);
658
0
    GDALReorderForTranspose4x4(tmp03);
659
0
    GDALReorderForTranspose4x4(tmp10);
660
0
    GDALReorderForTranspose4x4(tmp11);
661
0
    GDALReorderForTranspose4x4(tmp12);
662
0
    GDALReorderForTranspose4x4(tmp13);
663
0
    GDALReorderForTranspose4x4(tmp20);
664
0
    GDALReorderForTranspose4x4(tmp21);
665
0
    GDALReorderForTranspose4x4(tmp22);
666
0
    GDALReorderForTranspose4x4(tmp23);
667
0
    GDALReorderForTranspose4x4(tmp30);
668
0
    GDALReorderForTranspose4x4(tmp31);
669
0
    GDALReorderForTranspose4x4(tmp32);
670
0
    GDALReorderForTranspose4x4(tmp33);
671
672
0
    GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03);
673
0
    GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07);
674
0
    GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11);
675
0
    GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15);
676
0
}
677
678
inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
679
                                           uint8_t *CPL_RESTRICT pDst,
680
                                           size_t srcStride, size_t dstStride)
681
0
{
682
0
#define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride)
683
0
    LOAD(0);
684
0
    LOAD(1);
685
0
    LOAD(2);
686
0
    LOAD(3);
687
0
    LOAD(4);
688
0
    LOAD(5);
689
0
    LOAD(6);
690
0
    LOAD(7);
691
0
    LOAD(8);
692
0
    LOAD(9);
693
0
    LOAD(10);
694
0
    LOAD(11);
695
0
    LOAD(12);
696
0
    LOAD(13);
697
0
    LOAD(14);
698
0
    LOAD(15);
699
0
#undef LOAD
700
701
0
    GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9,
702
0
                                      r10, r11, r12, r13, r14, r15);
703
704
0
#define STORE(x) storeu(pDst, x, dstStride, r##x)
705
0
    STORE(0);
706
0
    STORE(1);
707
0
    STORE(2);
708
0
    STORE(3);
709
0
    STORE(4);
710
0
    STORE(5);
711
0
    STORE(6);
712
0
    STORE(7);
713
0
    STORE(8);
714
0
    STORE(9);
715
0
    STORE(10);
716
0
    STORE(11);
717
0
    STORE(12);
718
0
    STORE(13);
719
0
    STORE(14);
720
0
    STORE(15);
721
0
#undef STORE
722
0
}
723
724
void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc,
725
                                uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth,
726
                                size_t nSrcHeight)
727
0
{
728
0
    if (nSrcHeight == 3)
729
0
    {
730
0
        GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth);
731
0
    }
732
0
    else if (nSrcHeight == 5)
733
0
    {
734
0
        GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth);
735
0
    }
736
0
    else
737
0
    {
738
0
        constexpr size_t blocksize = 16;
739
0
        for (size_t i = 0; i < nSrcHeight; i += blocksize)
740
0
        {
741
0
            const size_t max_k = std::min(i + blocksize, nSrcHeight);
742
0
            for (size_t j = 0; j < nSrcWidth; j += blocksize)
743
0
            {
744
                // transpose the block beginning at [i,j]
745
0
                const size_t max_l = std::min(j + blocksize, nSrcWidth);
746
0
                if (max_k - i == blocksize && max_l - j == blocksize)
747
0
                {
748
0
                    GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth],
749
0
                                                   &pDst[i + j * nSrcHeight],
750
0
                                                   nSrcWidth, nSrcHeight);
751
0
                }
752
0
                else
753
0
                {
754
0
                    for (size_t k = i; k < max_k; ++k)
755
0
                    {
756
0
                        for (size_t l = j; l < max_l; ++l)
757
0
                        {
758
0
                            GDALCopyWord(pSrc[l + k * nSrcWidth],
759
0
                                         pDst[k + l * nSrcHeight]);
760
0
                        }
761
0
                    }
762
0
                }
763
0
            }
764
0
        }
765
0
    }
766
0
}
767
768
#endif  // HAVE_SSSE3_AT_COMPILE_TIME