/src/gdal/gcore/rasterio_ssse3.cpp
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Project: GDAL Core |
4 | | * Purpose: SSSE3 specializations |
5 | | * Author: Even Rouault <even dot rouault at spatialys dot com> |
6 | | * |
7 | | ****************************************************************************** |
8 | | * Copyright (c) 2016, Even Rouault <even dot rouault at spatialys dot com> |
9 | | * |
10 | | * SPDX-License-Identifier: MIT |
11 | | ****************************************************************************/ |
12 | | |
13 | | #include "cpl_port.h" |
14 | | |
15 | | #include <algorithm> |
16 | | |
17 | | #if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ |
18 | | (defined(__x86_64) || defined(_M_X64))) || \ |
19 | | defined(USE_NEON_OPTIMIZATIONS) |
20 | | |
21 | | #include "rasterio_ssse3.h" |
22 | | |
23 | | #ifdef USE_NEON_OPTIMIZATIONS |
24 | | #include "include_sse2neon.h" |
25 | | #else |
26 | | #include <tmmintrin.h> |
27 | | #endif |
28 | | |
29 | | #include "gdal_priv_templates.hpp" |
30 | | |
31 | | void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, |
32 | | const GByte *CPL_RESTRICT pSrc, |
33 | | GPtrDiff_t nIters) |
34 | 0 | { |
35 | 0 | decltype(nIters) i; |
36 | 0 | const __m128i xmm_shuffle0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, |
37 | 0 | -1, -1, 15, 12, 9, 6, 3, 0); |
38 | 0 | const __m128i xmm_shuffle1 = _mm_set_epi8(-1, -1, -1, -1, -1, 14, 11, 8, 5, |
39 | 0 | 2, -1, -1, -1, -1, -1, -1); |
40 | 0 | const __m128i xmm_shuffle2 = _mm_set_epi8(13, 10, 7, 4, 1, -1, -1, -1, -1, |
41 | 0 | -1, -1, -1, -1, -1, -1, -1); |
42 | | // If we were sure that there would always be 2 trailing bytes, we could |
43 | | // check against nIters - 15 |
44 | 0 | for (i = 0; i < nIters - 16; i += 16) |
45 | 0 | { |
46 | 0 | __m128i xmm0 = |
47 | 0 | _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 0)); |
48 | 0 | __m128i xmm1 = |
49 | 0 | _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 16)); |
50 | 0 | __m128i xmm2 = |
51 | 0 | _mm_loadu_si128(reinterpret_cast<__m128i const *>(pSrc + 32)); |
52 | | |
53 | | // From LSB to MSB: |
54 | | // 0,x,x,1,x,x,2,x,x,3,x,x,4,x,x,5 --> 0,1,2,3,4,5,0,0,0,0,0,0,0,0,0,0 |
55 | 0 | xmm0 = _mm_shuffle_epi8(xmm0, xmm_shuffle0); |
56 | | // x,x,6,x,x,7,x,x,8,x,x,9,x,x,10,x --> 0,0,0,0,0,0,6,7,8,9,10,0,0,0,0,0 |
57 | 0 | xmm1 = _mm_shuffle_epi8(xmm1, xmm_shuffle1); |
58 | | // x,11,x,x,12,x,x,13,x,x,14,x,x,15,x,x --> |
59 | | // 0,0,0,0,0,0,0,0,0,0,0,11,12,13,14,15 |
60 | 0 | xmm2 = _mm_shuffle_epi8(xmm2, xmm_shuffle2); |
61 | 0 | xmm0 = _mm_or_si128(xmm0, xmm1); |
62 | 0 | xmm0 = _mm_or_si128(xmm0, xmm2); |
63 | |
64 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + i), xmm0); |
65 | |
66 | 0 | pSrc += 3 * 16; |
67 | 0 | } |
68 | 0 | for (; i < nIters; i++) |
69 | 0 | { |
70 | 0 | pDest[i] = *pSrc; |
71 | 0 | pSrc += 3; |
72 | 0 | } |
73 | 0 | } |
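
For reference, a scalar sketch of what the routine above computes: it gathers every third byte, e.g. extracting one band from pixel-interleaved RGB data (the function name below is hypothetical, not part of GDAL):

    static void UnrolledCopy_GByte_3_1_Scalar(GByte *CPL_RESTRICT pDest,
                                              const GByte *CPL_RESTRICT pSrc,
                                              GPtrDiff_t nIters)
    {
        // Same contract as the SSSE3 version above: pDest[i] = pSrc[3 * i]
        for (GPtrDiff_t i = 0; i < nIters; ++i)
            pDest[i] = pSrc[3 * i];
    }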
74 | | |
75 | | /************************************************************************/ |
76 | | /* GDALDeinterleave3Byte_SSSE3() */ |
77 | | /************************************************************************/ |
78 | | |
79 | | #if defined(__GNUC__) && !defined(__clang__) |
80 | | // GCC autovectorizer does an excellent job |
81 | | __attribute__((optimize("tree-vectorize"))) void GDALDeinterleave3Byte_SSSE3( |
82 | | const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0, |
83 | | GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters) |
84 | | { |
85 | | for (size_t i = 0; i < nIters; ++i) |
86 | | { |
87 | | pabyDest0[i] = pabySrc[3 * i + 0]; |
88 | | pabyDest1[i] = pabySrc[3 * i + 1]; |
89 | | pabyDest2[i] = pabySrc[3 * i + 2]; |
90 | | } |
91 | | } |
92 | | #else |
93 | | void GDALDeinterleave3Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc, |
94 | | GByte *CPL_RESTRICT pabyDest0, |
95 | | GByte *CPL_RESTRICT pabyDest1, |
96 | | GByte *CPL_RESTRICT pabyDest2, size_t nIters) |
97 | 0 | { |
98 | 0 | size_t i = 0; |
99 | 0 | for (; i + 15 < nIters; i += 16) |
100 | 0 | { |
101 | 0 | __m128i xmm0 = _mm_loadu_si128( |
102 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 0)); |
103 | 0 | __m128i xmm1 = _mm_loadu_si128( |
104 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 16)); |
105 | 0 | __m128i xmm2 = _mm_loadu_si128( |
106 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 3 * i + 32)); |
107 | 0 | auto xmm0_new = |
108 | 0 | _mm_shuffle_epi8(xmm0, _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, |
109 | 0 | 7, 4, 1, 9, 6, 3, 0)); |
110 | 0 | auto xmm1_new = _mm_shuffle_epi8( |
111 | 0 | _mm_alignr_epi8(xmm1, xmm0, 12), |
112 | 0 | _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0)); |
113 | 0 | auto xmm2_new = _mm_shuffle_epi8( |
114 | 0 | _mm_alignr_epi8(xmm2, xmm1, 8), |
115 | 0 | _mm_set_epi8(-1, -1, -1, -1, 11, 8, 5, 2, 10, 7, 4, 1, 9, 6, 3, 0)); |
116 | 0 | auto xmm3_new = |
117 | 0 | _mm_shuffle_epi8(xmm2, _mm_set_epi8(-1, -1, -1, -1, 15, 12, 9, 6, |
118 | 0 | 14, 11, 8, 5, 13, 10, 7, 4)); |
119 | |
120 | 0 | __m128i xmm01lo = |
121 | 0 | _mm_unpacklo_epi32(xmm0_new, xmm1_new); // W0 W4 W1 W5 |
122 | 0 | __m128i xmm01hi = _mm_unpackhi_epi32(xmm0_new, xmm1_new); // W2 W6 - - |
123 | 0 | __m128i xmm23lo = |
124 | 0 | _mm_unpacklo_epi32(xmm2_new, xmm3_new); // W8 WC W9 WD |
125 | 0 | __m128i xmm23hi = _mm_unpackhi_epi32(xmm2_new, xmm3_new); // WA WE - - |
126 | 0 | xmm0_new = _mm_unpacklo_epi64(xmm01lo, xmm23lo); // W0 W4 W8 WC |
127 | 0 | xmm1_new = _mm_unpackhi_epi64(xmm01lo, xmm23lo); // W1 W5 W9 WD |
128 | 0 | xmm2_new = _mm_unpacklo_epi64(xmm01hi, xmm23hi); // W2 W6 WA WE |
129 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0_new); |
130 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1_new); |
131 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2_new); |
132 | 0 | } |
133 | 0 | #if defined(__clang__) |
134 | 0 | #pragma clang loop vectorize(disable) |
135 | 0 | #endif |
136 | 0 | for (; i < nIters; ++i) |
137 | 0 | { |
138 | 0 | pabyDest0[i] = pabySrc[3 * i + 0]; |
139 | 0 | pabyDest1[i] = pabySrc[3 * i + 1]; |
140 | 0 | pabyDest2[i] = pabySrc[3 * i + 2]; |
141 | 0 | } |
142 | 0 | } |
143 | | #endif |
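
A minimal usage sketch (buffer names hypothetical): splitting a pixel-interleaved RGB scanline into three planar buffers with the routine above:

    #include <vector>

    static void DemoDeinterleaveRGB(const GByte *pabyRGB, size_t nPixels)
    {
        // pabyRGB holds nPixels triplets: R0 G0 B0 R1 G1 B1 ...
        std::vector<GByte> abyR(nPixels), abyG(nPixels), abyB(nPixels);
        GDALDeinterleave3Byte_SSSE3(pabyRGB, abyR.data(), abyG.data(),
                                    abyB.data(), nPixels);
    }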
144 | | |
145 | | /************************************************************************/ |
146 | | /* GDALTranspose4x4Int32() */ |
147 | | /************************************************************************/ |
148 | | |
149 | | // Treat the input registers as a 4x4 matrix of words of size 4 bytes |
150 | | // each, and return the transposition of this 4x4 matrix. |
151 | | // Considering that in0 = (in00, in01, in02, in03) |
152 | | // Considering that in1 = (in10, in11, in12, in13) |
153 | | // Considering that in2 = (in20, in21, in22, in23) |
154 | | // Considering that in3 = (in30, in31, in32, in33) |
155 | | // Return out0 = (in00, in10, in20, in30) |
156 | | // Return out1 = (in01, in11, in21, in31) |
157 | | // Return out2 = (in02, in12, in22, in32) |
158 | | // Return out3 = (in03, in13, in23, in33) |
159 | | inline void GDALTranspose4x4Int32(__m128i in0, __m128i in1, __m128i in2, |
160 | | __m128i in3, __m128i &out0, __m128i &out1, |
161 | | __m128i &out2, __m128i &out3) |
162 | 0 | { |
163 | 0 | __m128i tmp0 = _mm_unpacklo_epi32(in0, in1); // (in00, in10, in01, in11) |
164 | 0 | __m128i tmp1 = _mm_unpackhi_epi32(in0, in1); // (in02, in12, in03, in13) |
165 | 0 | __m128i tmp2 = _mm_unpacklo_epi32(in2, in3); // (in20, in30, in21, in31) |
166 | 0 | __m128i tmp3 = _mm_unpackhi_epi32(in2, in3); // (in22, in32, in23, in33) |
167 | |
168 | 0 | out0 = _mm_unpacklo_epi64(tmp0, tmp2); // (in00, in10, in20, in30) |
169 | 0 | out1 = _mm_unpackhi_epi64(tmp0, tmp2); // (in01, in11, in21, in31) |
170 | 0 | out2 = _mm_unpacklo_epi64(tmp1, tmp3); // (in02, in12, in22, in32) |
171 | 0 | out3 = _mm_unpackhi_epi64(tmp1, tmp3); // (in03, in13, in23, in33) |
172 | 0 | } |
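
A minimal self-check sketch for the transpose above (helper name hypothetical): with the row-major values 0..15 as input, the first output register must hold the first input column (0, 4, 8, 12):

    #include <cassert>
    #include <cstdint>

    static void CheckTranspose4x4Int32()
    {
        const int32_t anIn[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                                  8, 9, 10, 11, 12, 13, 14, 15};
        const __m128i in0 =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(anIn + 0));
        const __m128i in1 =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(anIn + 4));
        const __m128i in2 =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(anIn + 8));
        const __m128i in3 =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(anIn + 12));
        __m128i out0, out1, out2, out3;
        GDALTranspose4x4Int32(in0, in1, in2, in3, out0, out1, out2, out3);
        int32_t anOut[4];
        _mm_storeu_si128(reinterpret_cast<__m128i *>(anOut), out0);
        // The first output row is the first input column
        assert(anOut[0] == 0 && anOut[1] == 4 && anOut[2] == 8 &&
               anOut[3] == 12);
    }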
173 | | |
174 | | /************************************************************************/ |
175 | | /* GDALDeinterleave4Byte_SSSE3() */ |
176 | | /************************************************************************/ |
177 | | |
178 | | #if !defined(__GNUC__) || defined(__clang__) |
179 | | void GDALDeinterleave4Byte_SSSE3(const GByte *CPL_RESTRICT pabySrc, |
180 | | GByte *CPL_RESTRICT pabyDest0, |
181 | | GByte *CPL_RESTRICT pabyDest1, |
182 | | GByte *CPL_RESTRICT pabyDest2, |
183 | | GByte *CPL_RESTRICT pabyDest3, size_t nIters) |
184 | 0 | { |
185 | 0 | const __m128i shuffle_mask = |
186 | 0 | _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); |
187 | 0 | size_t i = 0; |
188 | 0 | for (; i + 15 < nIters; i += 16) |
189 | 0 | { |
190 | 0 | __m128i xmm0 = _mm_loadu_si128( |
191 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 0)); |
192 | 0 | __m128i xmm1 = _mm_loadu_si128( |
193 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 16)); |
194 | 0 | __m128i xmm2 = _mm_loadu_si128( |
195 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 32)); |
196 | 0 | __m128i xmm3 = _mm_loadu_si128( |
197 | 0 | reinterpret_cast<__m128i const *>(pabySrc + 4 * i + 48)); |
198 | 0 | xmm0 = _mm_shuffle_epi8(xmm0, shuffle_mask); // W0 W1 W2 W3 |
199 | 0 | xmm1 = _mm_shuffle_epi8(xmm1, shuffle_mask); // W4 W5 W6 W7 |
200 | 0 | xmm2 = _mm_shuffle_epi8(xmm2, shuffle_mask); // W8 W9 WA WB |
201 | 0 | xmm3 = _mm_shuffle_epi8(xmm3, shuffle_mask); // WC WD WE WF |
202 | |
203 | 0 | GDALTranspose4x4Int32(xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3); |
204 | |
205 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest0 + i), xmm0); |
206 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest1 + i), xmm1); |
207 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest2 + i), xmm2); |
208 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pabyDest3 + i), xmm3); |
209 | 0 | } |
210 | 0 | #if defined(__clang__) |
211 | 0 | #pragma clang loop vectorize(disable) |
212 | 0 | #endif |
213 | 0 | for (; i < nIters; ++i) |
214 | 0 | { |
215 | 0 | pabyDest0[i] = pabySrc[4 * i + 0]; |
216 | 0 | pabyDest1[i] = pabySrc[4 * i + 1]; |
217 | 0 | pabyDest2[i] = pabySrc[4 * i + 2]; |
218 | 0 | pabyDest3[i] = pabySrc[4 * i + 3]; |
219 | 0 | } |
220 | 0 | } |
221 | | #endif |
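
The shuffle_mask used above gathers the four bytes of each component into a single 32-bit lane (all R bytes into word 0, G into word 1, and so on), which is what lets GDALTranspose4x4Int32 complete the deinterleave. A sketch of its effect on one register (function name hypothetical):

    // Input : R0 G0 B0 A0 R1 G1 B1 A1 R2 G2 B2 A2 R3 G3 B3 A3
    // Output: R0 R1 R2 R3 G0 G1 G2 G3 B0 B1 B2 B3 A0 A1 A2 A3
    static __m128i GatherComponents(__m128i fourPixels)
    {
        const __m128i mask =
            _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
        return _mm_shuffle_epi8(fourPixels, mask);
    }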
222 | | |
223 | | /************************************************************************/ |
224 | | /* GDALDeinterleave3UInt16_SSSE3() */ |
225 | | /************************************************************************/ |
226 | | |
227 | | #if (defined(__GNUC__) && !defined(__clang__)) || \ |
228 | | defined(__INTEL_CLANG_COMPILER) |
229 | | #if !defined(__INTEL_CLANG_COMPILER) |
230 | | // GCC autovectorizer does an excellent job |
231 | | __attribute__((optimize("tree-vectorize"))) |
232 | | #endif |
233 | | void GDALDeinterleave3UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc, |
234 | | GUInt16* CPL_RESTRICT panDest0, |
235 | | GUInt16* CPL_RESTRICT panDest1, |
236 | | GUInt16* CPL_RESTRICT panDest2, |
237 | | size_t nIters) |
238 | | { |
239 | | for (size_t i = 0; i < nIters; ++i) |
240 | | { |
241 | | panDest0[i] = panSrc[3 * i + 0]; |
242 | | panDest1[i] = panSrc[3 * i + 1]; |
243 | | panDest2[i] = panSrc[3 * i + 2]; |
244 | | } |
245 | | } |
246 | | #endif |
247 | | |
248 | | /************************************************************************/ |
249 | | /* GDALDeinterleave4UInt16_SSSE3() */ |
250 | | /************************************************************************/ |
251 | | |
252 | | #if (defined(__GNUC__) && !defined(__clang__)) || \ |
253 | | defined(__INTEL_CLANG_COMPILER) |
254 | | #if !defined(__INTEL_CLANG_COMPILER) |
255 | | // GCC autovectorizer does an excellent job |
256 | | __attribute__((optimize("tree-vectorize"))) |
257 | | #endif |
258 | | void GDALDeinterleave4UInt16_SSSE3(const GUInt16* CPL_RESTRICT panSrc, |
259 | | GUInt16* CPL_RESTRICT panDest0, |
260 | | GUInt16* CPL_RESTRICT panDest1, |
261 | | GUInt16* CPL_RESTRICT panDest2, |
262 | | GUInt16* CPL_RESTRICT panDest3, |
263 | | size_t nIters) |
264 | | { |
265 | | for (size_t i = 0; i < nIters; ++i) |
266 | | { |
267 | | panDest0[i] = panSrc[4 * i + 0]; |
268 | | panDest1[i] = panSrc[4 * i + 1]; |
269 | | panDest2[i] = panSrc[4 * i + 2]; |
270 | | panDest3[i] = panSrc[4 * i + 3]; |
271 | | } |
272 | | } |
273 | | #endif |
274 | | |
275 | | /************************************************************************/ |
276 | | /* loadu() */ |
277 | | /************************************************************************/ |
278 | | |
279 | | inline __m128i loadu(const uint8_t *pSrc, size_t i, size_t srcStride) |
280 | 0 | { |
281 | 0 | return _mm_loadu_si128( |
282 | 0 | reinterpret_cast<const __m128i *>(pSrc + i * srcStride)); |
283 | 0 | } |
284 | | |
285 | | /************************************************************************/ |
286 | | /* storeu() */ |
287 | | /************************************************************************/ |
288 | | |
289 | | inline void storeu(uint8_t *pDst, size_t i, size_t dstStride, __m128i reg) |
290 | 0 | { |
291 | 0 | _mm_storeu_si128(reinterpret_cast<__m128i *>(pDst + i * dstStride), reg); |
292 | 0 | } |
293 | | |
294 | | /************************************************************************/ |
295 | | /* GDALInterleave3Byte_SSSE3() */ |
296 | | /************************************************************************/ |
297 | | |
298 | | #if (!defined(__GNUC__) || defined(__INTEL_CLANG_COMPILER)) |
299 | | |
300 | | inline __m128i GDAL_mm_or_3_si128(__m128i r0, __m128i r1, __m128i r2) |
301 | | { |
302 | | return _mm_or_si128(_mm_or_si128(r0, r1), r2); |
303 | | } |
304 | | |
305 | | // The ICC autovectorizer doesn't generate good SSE code, at least |
306 | | // with icx 2024.0.2.20231213, but it nicely unrolls the loop below. |
307 | | #if defined(__GNUC__) |
308 | | __attribute__((noinline)) |
309 | | #endif |
310 | | static void |
311 | | GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc, |
312 | | uint8_t *CPL_RESTRICT pDst, size_t nIters) |
313 | | { |
314 | | size_t i = 0; |
315 | | constexpr size_t VALS_PER_ITER = 16; |
316 | | |
317 | | if (nIters >= VALS_PER_ITER) |
318 | | { |
319 | | // clang-format off |
320 | | constexpr char X = -1; |
321 | | // How to dispatch 16 values of row=0 onto 3x16 bytes |
322 | | const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, |
323 | | 1, X, X, |
324 | | 2, X, X, |
325 | | 3, X, X, |
326 | | 4, X, X, |
327 | | 5); |
328 | | const __m128i xmm_shuffle01 = _mm_setr_epi8( X, X, |
329 | | 6, X, X, |
330 | | 7, X, X, |
331 | | 8, X, X, |
332 | | 9, X, X, |
333 | | 10,X); |
334 | | const __m128i xmm_shuffle02 = _mm_setr_epi8( X, |
335 | | 11, X, X, |
336 | | 12, X, X, |
337 | | 13, X, X, |
338 | | 14, X, X, |
339 | | 15, X, X); |
340 | | |
341 | | // How to dispatch 16 values of row=1 onto 3x16 bytes |
342 | | const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, |
343 | | X, 1, X, |
344 | | X, 2, X, |
345 | | X, 3, X, |
346 | | X, 4, X, |
347 | | X); |
348 | | const __m128i xmm_shuffle11 = _mm_setr_epi8( 5, X, |
349 | | X, 6, X, |
350 | | X, 7, X, |
351 | | X, 8, X, |
352 | | X, 9, X, |
353 | | X,10); |
354 | | const __m128i xmm_shuffle12 = _mm_setr_epi8( X, |
355 | | X, 11, X, |
356 | | X, 12, X, |
357 | | X, 13, X, |
358 | | X, 14, X, |
359 | | X, 15, X); |
360 | | |
361 | | // How to dispatch 16 values of row=2 onto 3x16 bytes |
362 | | const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, |
363 | | X, X, 1, |
364 | | X, X, 2, |
365 | | X, X, 3, |
366 | | X, X, 4, |
367 | | X); |
368 | | const __m128i xmm_shuffle21 = _mm_setr_epi8( X, 5, |
369 | | X, X, 6, |
370 | | X, X, 7, |
371 | | X, X, 8, |
372 | | X, X, 9, |
373 | | X, X); |
374 | | const __m128i xmm_shuffle22 = _mm_setr_epi8( 10, |
375 | | X, X, 11, |
376 | | X, X, 12, |
377 | | X, X, 13, |
378 | | X, X, 14, |
379 | | X, X, 15); |
380 | | // clang-format on |
381 | | |
382 | | for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER) |
383 | | { |
384 | | #define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters) |
385 | | LOAD(0); |
386 | | LOAD(1); |
387 | | LOAD(2); |
388 | | |
389 | | #define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x) |
390 | | #define COMBINE_3(x) \ |
391 | | GDAL_mm_or_3_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2)) |
392 | | |
393 | | #define STORE(x) \ |
394 | | storeu(pDst, 3 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_3(x)) |
395 | | STORE(0); |
396 | | STORE(1); |
397 | | STORE(2); |
398 | | #undef LOAD |
399 | | #undef COMBINE_3 |
400 | | #undef SHUFFLE |
401 | | #undef STORE |
402 | | } |
403 | | } |
404 | | |
405 | | for (; i < nIters; ++i) |
406 | | { |
407 | | #define INTERLEAVE(x) pDst[3 * i + x] = pSrc[i + x * nIters] |
408 | | INTERLEAVE(0); |
409 | | INTERLEAVE(1); |
410 | | INTERLEAVE(2); |
411 | | #undef INTERLEAVE |
412 | | } |
413 | | } |
414 | | |
415 | | #else |
416 | | |
417 | | #if defined(__GNUC__) && !defined(__clang__) |
418 | | __attribute__((optimize("tree-vectorize"))) |
419 | | #endif |
420 | | #if defined(__GNUC__) |
421 | | __attribute__((noinline)) |
422 | | #endif |
423 | | #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER) |
424 | | // clang++ -O2 -fsanitize=undefined fails to vectorize, ignore that warning |
425 | | #pragma clang diagnostic push |
426 | | #pragma clang diagnostic ignored "-Wpass-failed" |
427 | | #endif |
428 | | static void |
429 | | GDALInterleave3Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc, |
430 | | uint8_t *CPL_RESTRICT pDst, size_t nIters) |
431 | 0 | { |
432 | 0 | #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER) |
433 | 0 | #pragma clang loop vectorize(enable) |
434 | 0 | #endif |
435 | 0 | for (size_t i = 0; i < nIters; ++i) |
436 | 0 | { |
437 | 0 | pDst[3 * i + 0] = pSrc[i + 0 * nIters]; |
438 | 0 | pDst[3 * i + 1] = pSrc[i + 1 * nIters]; |
439 | 0 | pDst[3 * i + 2] = pSrc[i + 2 * nIters]; |
440 | 0 | } |
441 | 0 | } |
442 | | #if defined(__clang__) && !defined(__INTEL_CLANG_COMPILER) |
443 | | #pragma clang diagnostic pop |
444 | | #endif |
445 | | |
446 | | #endif |
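
The idea behind the shuffle-based variant above, distilled to the first output register (function name hypothetical; the masks are the same as xmm_shuffle00/10/20): each source row contributes its bytes through one _mm_shuffle_epi8 whose -1 entries zero the lanes owned by the other two rows, and the three partial results are OR'd together:

    // Returns r0[0] r1[0] r2[0] r0[1] r1[1] r2[1] ... r0[5],
    // i.e. the first 16 bytes of the interleaved output.
    static __m128i InterleaveFirst16(__m128i r0, __m128i r1, __m128i r2)
    {
        constexpr char X = -1;
        const __m128i m0 =
            _mm_setr_epi8(0, X, X, 1, X, X, 2, X, X, 3, X, X, 4, X, X, 5);
        const __m128i m1 =
            _mm_setr_epi8(X, 0, X, X, 1, X, X, 2, X, X, 3, X, X, 4, X, X);
        const __m128i m2 =
            _mm_setr_epi8(X, X, 0, X, X, 1, X, X, 2, X, X, 3, X, X, 4, X);
        return _mm_or_si128(_mm_or_si128(_mm_shuffle_epi8(r0, m0),
                                         _mm_shuffle_epi8(r1, m1)),
                            _mm_shuffle_epi8(r2, m2));
    }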
447 | | |
448 | | /************************************************************************/ |
449 | | /* GDALInterleave5Byte_SSSE3() */ |
450 | | /************************************************************************/ |
451 | | |
452 | | inline __m128i GDAL_mm_or_5_si128(__m128i r0, __m128i r1, __m128i r2, |
453 | | __m128i r3, __m128i r4) |
454 | 0 | { |
455 | 0 | return _mm_or_si128( |
456 | 0 | _mm_or_si128(_mm_or_si128(r0, r1), _mm_or_si128(r2, r3)), r4); |
457 | 0 | } |
458 | | |
459 | | static void GDALInterleave5Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc, |
460 | | uint8_t *CPL_RESTRICT pDst, size_t nIters) |
461 | 0 | { |
462 | 0 | size_t i = 0; |
463 | 0 | constexpr size_t VALS_PER_ITER = 16; |
464 | |
465 | 0 | if (nIters >= VALS_PER_ITER) |
466 | 0 | { |
467 | | // clang-format off |
468 | 0 | constexpr char X = -1; |
469 | | // How to dispatch 16 values of row=0 onto 5x16 bytes |
470 | 0 | const __m128i xmm_shuffle00 = _mm_setr_epi8(0, X, X, X, X, |
471 | 0 | 1, X, X, X, X, |
472 | 0 | 2, X, X, X, X, |
473 | 0 | 3); |
474 | 0 | const __m128i xmm_shuffle01 = _mm_setr_epi8( X, X, X, X, |
475 | 0 | 4, X, X, X, X, |
476 | 0 | 5, X, X, X, X, |
477 | 0 | 6, X); |
478 | 0 | const __m128i xmm_shuffle02 = _mm_setr_epi8( X, X, X, |
479 | 0 | 7, X, X, X, X, |
480 | 0 | 8, X, X, X, X, |
481 | 0 | 9, X, X); |
482 | 0 | const __m128i xmm_shuffle03 = _mm_setr_epi8( X, X, |
483 | 0 | 10, X, X, X, X, |
484 | 0 | 11, X, X, X, X, |
485 | 0 | 12, X, X, X); |
486 | 0 | const __m128i xmm_shuffle04 = _mm_setr_epi8( X, |
487 | 0 | 13, X, X, X, X, |
488 | 0 | 14, X, X, X, X, |
489 | 0 | 15, X, X, X, X); |
490 | | |
491 | | // How to dispatch 16 values of row=1 onto 5x16 bytes |
492 | 0 | const __m128i xmm_shuffle10 = _mm_setr_epi8(X, 0, X, X, X, |
493 | 0 | X, 1, X, X, X, |
494 | 0 | X, 2, X, X, X, |
495 | 0 | X); |
496 | 0 | const __m128i xmm_shuffle11 = _mm_setr_epi8( 3, X, X, X, |
497 | 0 | X, 4, X, X, X, |
498 | 0 | X, 5, X, X, X, |
499 | 0 | X, 6); |
500 | 0 | const __m128i xmm_shuffle12 = _mm_setr_epi8( X, X, X, |
501 | 0 | X, 7, X, X, X, |
502 | 0 | X, 8, X, X, X, |
503 | 0 | X, 9, X); |
504 | 0 | const __m128i xmm_shuffle13 = _mm_setr_epi8( X, X, |
505 | 0 | X, 10, X, X, X, |
506 | 0 | X, 11, X, X, X, |
507 | 0 | X, 12, X, X); |
508 | 0 | const __m128i xmm_shuffle14 = _mm_setr_epi8( X, |
509 | 0 | X, 13, X, X, X, |
510 | 0 | X, 14, X, X, X, |
511 | 0 | X, 15, X, X, X); |
512 | | |
513 | | // How to dispatch 16 values of row=2 onto 5x16 bytes |
514 | 0 | const __m128i xmm_shuffle20 = _mm_setr_epi8(X, X, 0, X, X, |
515 | 0 | X, X, 1, X, X, |
516 | 0 | X, X, 2, X, X, |
517 | 0 | X); |
518 | 0 | const __m128i xmm_shuffle21 = _mm_setr_epi8( X, 3, X, X, |
519 | 0 | X, X, 4, X, X, |
520 | 0 | X, X, 5, X, X, |
521 | 0 | X, X); |
522 | 0 | const __m128i xmm_shuffle22 = _mm_setr_epi8( 6, X, X, |
523 | 0 | X, X, 7, X, X, |
524 | 0 | X, X, 8, X, X, |
525 | 0 | X, X, 9); |
526 | 0 | const __m128i xmm_shuffle23 = _mm_setr_epi8( X, X, |
527 | 0 | X, X, 10, X, X, |
528 | 0 | X, X, 11, X, X, |
529 | 0 | X, X, 12, X); |
530 | 0 | const __m128i xmm_shuffle24 = _mm_setr_epi8( X, |
531 | 0 | X, X, 13, X, X, |
532 | 0 | X, X, 14, X, X, |
533 | 0 | X, X, 15, X, X); |
534 | | |
535 | | // How to dispatch 16 values of row=3 onto 5x16 bytes |
536 | 0 | const __m128i xmm_shuffle30 = _mm_setr_epi8(X, X, X, 0, X, |
537 | 0 | X, X, X, 1, X, |
538 | 0 | X, X, X, 2, X, |
539 | 0 | X); |
540 | 0 | const __m128i xmm_shuffle31 = _mm_setr_epi8( X, X, 3, X, |
541 | 0 | X, X, X, 4, X, |
542 | 0 | X, X, X, 5, X, |
543 | 0 | X, X); |
544 | 0 | const __m128i xmm_shuffle32 = _mm_setr_epi8( X, 6, X, |
545 | 0 | X, X, X, 7, X, |
546 | 0 | X, X, X, 8, X, |
547 | 0 | X, X, X); |
548 | 0 | const __m128i xmm_shuffle33 = _mm_setr_epi8( 9, X, |
549 | 0 | X, X, X, 10, X, |
550 | 0 | X, X, X, 11, X, |
551 | 0 | X, X, X, 12); |
552 | 0 | const __m128i xmm_shuffle34 = _mm_setr_epi8( X, |
553 | 0 | X, X, X, 13, X, |
554 | 0 | X, X, X, 14, X, |
555 | 0 | X, X, X, 15, X); |
556 | | |
557 | | // How to dispatch 16 values of row=4 onto 5x16 bytes |
558 | 0 | const __m128i xmm_shuffle40 = _mm_setr_epi8(X, X, X, X, 0, |
559 | 0 | X, X, X, X, 1, |
560 | 0 | X, X, X, X, 2, |
561 | 0 | X); |
562 | 0 | const __m128i xmm_shuffle41 = _mm_setr_epi8( X, X, X, 3, |
563 | 0 | X, X, X, X, 4, |
564 | 0 | X, X, X, X, 5, |
565 | 0 | X, X); |
566 | 0 | const __m128i xmm_shuffle42 = _mm_setr_epi8( X, X, 6, |
567 | 0 | X, X, X, X, 7, |
568 | 0 | X, X, X, X, 8, |
569 | 0 | X, X, X); |
570 | 0 | const __m128i xmm_shuffle43 = _mm_setr_epi8( X, 9, |
571 | 0 | X, X, X, X, 10, |
572 | 0 | X, X, X, X, 11, |
573 | 0 | X, X, X, X); |
574 | 0 | const __m128i xmm_shuffle44 = _mm_setr_epi8( 12, |
575 | 0 | X, X, X, X, 13, |
576 | 0 | X, X, X, X, 14, |
577 | 0 | X, X, X, X, 15); |
578 | | // clang-format on |
579 | |
580 | 0 | for (; i + VALS_PER_ITER <= nIters; i += VALS_PER_ITER) |
581 | 0 | { |
582 | 0 | #define LOAD(x) __m128i xmm##x = loadu(pSrc + i, x, nIters) |
583 | 0 | LOAD(0); |
584 | 0 | LOAD(1); |
585 | 0 | LOAD(2); |
586 | 0 | LOAD(3); |
587 | 0 | LOAD(4); |
588 | |
589 | 0 | #define SHUFFLE(x, y) _mm_shuffle_epi8(xmm##y, xmm_shuffle##y##x) |
590 | 0 | #define COMBINE_5(x) \ |
591 | 0 | GDAL_mm_or_5_si128(SHUFFLE(x, 0), SHUFFLE(x, 1), SHUFFLE(x, 2), \ |
592 | 0 | SHUFFLE(x, 3), SHUFFLE(x, 4)) |
593 | |
594 | 0 | #define STORE(x) \ |
595 | 0 | storeu(pDst, 5 * (i / VALS_PER_ITER) + x, VALS_PER_ITER, COMBINE_5(x)) |
596 | 0 | STORE(0); |
597 | 0 | STORE(1); |
598 | 0 | STORE(2); |
599 | 0 | STORE(3); |
600 | 0 | STORE(4); |
601 | 0 | #undef LOAD |
602 | 0 | #undef COMBINE_5 |
603 | 0 | #undef SHUFFLE |
604 | 0 | #undef STORE |
605 | 0 | } |
606 | 0 | } |
607 | |
608 | 0 | for (; i < nIters; ++i) |
609 | 0 | { |
610 | 0 | #define INTERLEAVE(x) pDst[5 * i + x] = pSrc[i + x * nIters] |
611 | 0 | INTERLEAVE(0); |
612 | 0 | INTERLEAVE(1); |
613 | 0 | INTERLEAVE(2); |
614 | 0 | INTERLEAVE(3); |
615 | 0 | INTERLEAVE(4); |
616 | 0 | #undef INTERLEAVE |
617 | 0 | } |
618 | 0 | } |
619 | | |
620 | | /************************************************************************/ |
621 | | /* GDALTranspose2D_Byte_SSSE3() */ |
622 | | /************************************************************************/ |
623 | | |
624 | | // Given r = (b00, b01, b02, b03, |
625 | | // b10, b11, b12, b13, |
626 | | // b20, b21, b22, b23, |
627 | | // b30, b31, b32, b33) |
628 | | // Return (b00, b10, b20, b30, |
629 | | // b01, b11, b21, b31, |
630 | | // b02, b12, b22, b32, |
631 | | // b03, b13, b23, b33) |
632 | | inline void GDALReorderForTranspose4x4(__m128i &r) |
633 | 0 | { |
634 | 0 | const __m128i shuffle_mask = |
635 | 0 | _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); |
636 | |
637 | 0 | r = _mm_shuffle_epi8(r, shuffle_mask); |
638 | 0 | } |
639 | | |
640 | | // Transpose the 16x16 byte values contained in the 16 SSE registers |
641 | | inline void GDALTranspose16x16ByteBlock_SSSE3( |
642 | | __m128i &r00, __m128i &r01, __m128i &r02, __m128i &r03, __m128i &r04, |
643 | | __m128i &r05, __m128i &r06, __m128i &r07, __m128i &r08, __m128i &r09, |
644 | | __m128i &r10, __m128i &r11, __m128i &r12, __m128i &r13, __m128i &r14, |
645 | | __m128i &r15) |
646 | 0 | { |
647 | 0 | __m128i tmp00, tmp01, tmp02, tmp03; |
648 | 0 | __m128i tmp10, tmp11, tmp12, tmp13; |
649 | 0 | __m128i tmp20, tmp21, tmp22, tmp23; |
650 | 0 | __m128i tmp30, tmp31, tmp32, tmp33; |
651 | |
652 | 0 | GDALTranspose4x4Int32(r00, r01, r02, r03, tmp00, tmp01, tmp02, tmp03); |
653 | 0 | GDALTranspose4x4Int32(r04, r05, r06, r07, tmp10, tmp11, tmp12, tmp13); |
654 | 0 | GDALTranspose4x4Int32(r08, r09, r10, r11, tmp20, tmp21, tmp22, tmp23); |
655 | 0 | GDALTranspose4x4Int32(r12, r13, r14, r15, tmp30, tmp31, tmp32, tmp33); |
656 | |
657 | 0 | GDALReorderForTranspose4x4(tmp00); |
658 | 0 | GDALReorderForTranspose4x4(tmp01); |
659 | 0 | GDALReorderForTranspose4x4(tmp02); |
660 | 0 | GDALReorderForTranspose4x4(tmp03); |
661 | 0 | GDALReorderForTranspose4x4(tmp10); |
662 | 0 | GDALReorderForTranspose4x4(tmp11); |
663 | 0 | GDALReorderForTranspose4x4(tmp12); |
664 | 0 | GDALReorderForTranspose4x4(tmp13); |
665 | 0 | GDALReorderForTranspose4x4(tmp20); |
666 | 0 | GDALReorderForTranspose4x4(tmp21); |
667 | 0 | GDALReorderForTranspose4x4(tmp22); |
668 | 0 | GDALReorderForTranspose4x4(tmp23); |
669 | 0 | GDALReorderForTranspose4x4(tmp30); |
670 | 0 | GDALReorderForTranspose4x4(tmp31); |
671 | 0 | GDALReorderForTranspose4x4(tmp32); |
672 | 0 | GDALReorderForTranspose4x4(tmp33); |
673 | |
674 | 0 | GDALTranspose4x4Int32(tmp00, tmp10, tmp20, tmp30, r00, r01, r02, r03); |
675 | 0 | GDALTranspose4x4Int32(tmp01, tmp11, tmp21, tmp31, r04, r05, r06, r07); |
676 | 0 | GDALTranspose4x4Int32(tmp02, tmp12, tmp22, tmp32, r08, r09, r10, r11); |
677 | 0 | GDALTranspose4x4Int32(tmp03, tmp13, tmp23, tmp33, r12, r13, r14, r15); |
678 | 0 | } |
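
The net effect of the routine above is a plain 16x16 byte transpose, assembled from 4x4 transposes of 32-bit lanes plus the in-register byte reorder. A scalar reference of that net effect, for comparison (function name hypothetical):

    static void Transpose16x16Byte_Scalar(const uint8_t src[16][16],
                                          uint8_t dst[16][16])
    {
        for (int y = 0; y < 16; ++y)
        {
            for (int x = 0; x < 16; ++x)
                dst[x][y] = src[y][x];
        }
    }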
679 | | |
680 | | inline void GDALTranspose2D16x16Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc, |
681 | | uint8_t *CPL_RESTRICT pDst, |
682 | | size_t srcStride, size_t dstStride) |
683 | 0 | { |
684 | 0 | #define LOAD(x) __m128i r##x = loadu(pSrc, x, srcStride) |
685 | 0 | LOAD(0); |
686 | 0 | LOAD(1); |
687 | 0 | LOAD(2); |
688 | 0 | LOAD(3); |
689 | 0 | LOAD(4); |
690 | 0 | LOAD(5); |
691 | 0 | LOAD(6); |
692 | 0 | LOAD(7); |
693 | 0 | LOAD(8); |
694 | 0 | LOAD(9); |
695 | 0 | LOAD(10); |
696 | 0 | LOAD(11); |
697 | 0 | LOAD(12); |
698 | 0 | LOAD(13); |
699 | 0 | LOAD(14); |
700 | 0 | LOAD(15); |
701 | 0 | #undef LOAD |
702 | |
703 | 0 | GDALTranspose16x16ByteBlock_SSSE3(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, |
704 | 0 | r10, r11, r12, r13, r14, r15); |
705 | |
706 | 0 | #define STORE(x) storeu(pDst, x, dstStride, r##x) |
707 | 0 | STORE(0); |
708 | 0 | STORE(1); |
709 | 0 | STORE(2); |
710 | 0 | STORE(3); |
711 | 0 | STORE(4); |
712 | 0 | STORE(5); |
713 | 0 | STORE(6); |
714 | 0 | STORE(7); |
715 | 0 | STORE(8); |
716 | 0 | STORE(9); |
717 | 0 | STORE(10); |
718 | 0 | STORE(11); |
719 | 0 | STORE(12); |
720 | 0 | STORE(13); |
721 | 0 | STORE(14); |
722 | 0 | STORE(15); |
723 | 0 | #undef STORE |
724 | 0 | } |
725 | | |
726 | | void GDALTranspose2D_Byte_SSSE3(const uint8_t *CPL_RESTRICT pSrc, |
727 | | uint8_t *CPL_RESTRICT pDst, size_t nSrcWidth, |
728 | | size_t nSrcHeight) |
729 | 0 | { |
730 | 0 | if (nSrcHeight == 3) |
731 | 0 | { |
732 | 0 | GDALInterleave3Byte_SSSE3(pSrc, pDst, nSrcWidth); |
733 | 0 | } |
734 | 0 | else if (nSrcHeight == 5) |
735 | 0 | { |
736 | 0 | GDALInterleave5Byte_SSSE3(pSrc, pDst, nSrcWidth); |
737 | 0 | } |
738 | 0 | else |
739 | 0 | { |
740 | 0 | constexpr size_t blocksize = 16; |
741 | 0 | for (size_t i = 0; i < nSrcHeight; i += blocksize) |
742 | 0 | { |
743 | 0 | const size_t max_k = std::min(i + blocksize, nSrcHeight); |
744 | 0 | for (size_t j = 0; j < nSrcWidth; j += blocksize) |
745 | 0 | { |
746 | | // transpose the block beginning at [i,j] |
747 | 0 | const size_t max_l = std::min(j + blocksize, nSrcWidth); |
748 | 0 | if (max_k - i == blocksize && max_l - j == blocksize) |
749 | 0 | { |
750 | 0 | GDALTranspose2D16x16Byte_SSSE3(&pSrc[j + i * nSrcWidth], |
751 | 0 | &pDst[i + j * nSrcHeight], |
752 | 0 | nSrcWidth, nSrcHeight); |
753 | 0 | } |
754 | 0 | else |
755 | 0 | { |
756 | 0 | for (size_t k = i; k < max_k; ++k) |
757 | 0 | { |
758 | 0 | for (size_t l = j; l < max_l; ++l) |
759 | 0 | { |
760 | 0 | GDALCopyWord(pSrc[l + k * nSrcWidth], |
761 | 0 | pDst[k + l * nSrcHeight]); |
762 | 0 | } |
763 | 0 | } |
764 | 0 | } |
765 | 0 | } |
766 | 0 | } |
767 | 0 | } |
768 | 0 | } |
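
A minimal usage sketch (sizes hypothetical): full 16x16 tiles take the SIMD path above, while partial tiles at the right and bottom edges fall back to the scalar GDALCopyWord loop:

    #include <vector>

    static void DemoTranspose2DByte()
    {
        const size_t nSrcWidth = 100, nSrcHeight = 37;
        std::vector<uint8_t> src(nSrcWidth * nSrcHeight, 0);
        std::vector<uint8_t> dst(nSrcWidth * nSrcHeight);
        // After the call: dst[i + j * nSrcHeight] == src[j + i * nSrcWidth]
        // for i in [0, nSrcHeight) and j in [0, nSrcWidth)
        GDALTranspose2D_Byte_SSSE3(src.data(), dst.data(), nSrcWidth,
                                   nSrcHeight);
    }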
769 | | |
770 | | #endif // (HAVE_SSSE3_AT_COMPILE_TIME && x86_64) || USE_NEON_OPTIMIZATIONS