Coverage Report

Created: 2025-11-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx512bwTransform.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2022 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdDefs.h"
25
#include "Simd/SimdMemory.h"
26
#include "Simd/SimdStore.h"
27
#include "Simd/SimdTransform.h"
28
29
namespace Simd
30
{
31
#ifdef SIMD_AVX512BW_ENABLE    
32
    namespace Avx512bw
33
    {
34
        template<size_t N> void TransformImageRotate90(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride);
35
36
        // Rotate90 for 1-byte pixels.
        // The source pixel at (row r, column c) is written to destination row (width - 1 - c),
        // byte offset r. Source rows are consumed in bands of 64, 16, 8 and finally 1 row;
        // inside each band, groups of 16 columns go to a transpose kernel (called with a
        // negated destination stride) and the remaining columns are copied pixel by pixel.
        template<> void TransformImageRotate90<1>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW16 = AlignLo(width, 16);
            const size_t alignedH64 = AlignLo(height, 64);
            const size_t alignedH16 = AlignLo(height, 16);
            const size_t alignedH8 = AlignLo(height, 8);
            // Source column 0 maps to the last destination row.
            dst += (width - 1) * dstStride;
            size_t y = 0;
            // Bands of 64 source rows.
            while (y < alignedH64)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx512bw::TransformImageTranspose_1x64x16(src + x, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 64; ++k)
                    {
                        Base::CopyPixel<1>(from, to);
                        from += srcStride;
                        to += 1;
                    }
                }
                src += 64 * srcStride;
                dst += 64;
                y += 64;
            }
            // Bands of 16 source rows.
            while (y < alignedH16)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx2::TransformImageTranspose_1x16x16(src + x, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 16; ++k)
                    {
                        Base::CopyPixel<1>(from, to);
                        from += srcStride;
                        to += 1;
                    }
                }
                src += 16 * srcStride;
                dst += 16;
                y += 16;
            }
            // Bands of 8 source rows.
            while (y < alignedH8)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Sse41::TransformImageTranspose_1x8x16(src + x, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 8; ++k)
                    {
                        Base::CopyPixel<1>(from, to);
                        from += srcStride;
                        to += 1;
                    }
                }
                src += 8 * srcStride;
                dst += 8;
                y += 8;
            }
            // Leftover source rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<1>(src + x, dst - x * dstStride);
                src += srcStride;
                dst += 1;
                ++y;
            }
        }
85
86
        // Rotate90 for 2-byte pixels.
        // The source pixel at (row r, column c) is written to destination row (width - 1 - c),
        // byte offset 2 * r. Source rows are consumed in bands of 32, 16, 8 and finally 1 row;
        // column groups go to transpose kernels (called with a negated destination stride)
        // and the remaining columns are copied pixel by pixel.
        template<> void TransformImageRotate90<2>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW16 = AlignLo(width, 16);
            const size_t alignedW8 = AlignLo(width, 8);
            const size_t alignedH32 = AlignLo(height, 32);
            const size_t alignedH16 = AlignLo(height, 16);
            const size_t alignedH8 = AlignLo(height, 8);
            // Source column 0 maps to the last destination row.
            dst += (width - 1) * dstStride;
            size_t y = 0;
            // Bands of 32 source rows.
            while (y < alignedH32)
            {
                size_t x = 0;
                for (; x < alignedW8; x += 8)
                    Avx512bw::TransformImageTranspose_2x32x8(src + x * 2, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 2;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 32; ++k)
                    {
                        Base::CopyPixel<2>(from, to);
                        from += srcStride;
                        to += 2;
                    }
                }
                src += 32 * srcStride;
                dst += 64;
                y += 32;
            }
            // Bands of 16 source rows.
            while (y < alignedH16)
            {
                size_t x = 0;
                for (; x < alignedW8; x += 8)
                    Avx2::TransformImageTranspose_2x16x8(src + x * 2, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 2;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 16; ++k)
                    {
                        Base::CopyPixel<2>(from, to);
                        from += srcStride;
                        to += 2;
                    }
                }
                src += 16 * srcStride;
                dst += 32;
                y += 16;
            }
            // Bands of 8 source rows: 16-column kernel first, then the 8-column one.
            while (y < alignedH8)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx2::TransformImageTranspose_2x8x16(src + x * 2, srcStride, dst - x * dstStride, -dstStride);
                for (; x < alignedW8; x += 8)
                    Sse41::TransformImageTranspose_2x8x8(src + x * 2, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 2;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 8; ++k)
                    {
                        Base::CopyPixel<2>(from, to);
                        from += srcStride;
                        to += 2;
                    }
                }
                src += 8 * srcStride;
                dst += 16;
                y += 8;
            }
            // Leftover source rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<2>(src + x * 2, dst - x * dstStride);
                src += srcStride;
                dst += 2;
                ++y;
            }
        }
138
139
        // Rotate90 for 3-byte pixels.
        // The source pixel at (row r, column c) is written to destination row (width - 1 - c),
        // byte offset 3 * r. Source rows are consumed in bands of 16, 8, 4 and finally 1 row;
        // column groups go to transpose kernels (called with a negated destination stride)
        // and the remaining columns are copied pixel by pixel.
        //
        // The 4- and 8-wide bounds keep a margin of 5 / 9 pixels below the real dimension.
        // NOTE(review): presumably this margin exists because the narrow 3-byte kernels touch
        // memory past the pixels they process - confirm against the kernel implementations.
        // The margin subtraction is done on size_t, so for dimensions smaller than the margin
        // it would wrap around to a huge value; such dimensions are clamped to a bound of 0
        // here so that they fall through to the scalar path.
        template<> void TransformImageRotate90<3>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Source column 0 maps to the last destination row.
            dst += (width - 1) * dstStride;
            size_t width4 = width < 5 ? 0 : AlignLo(width - 5, 4);
            size_t width8 = width < 9 ? 0 : AlignLo(width - 9, 8);
            size_t width16 = AlignLo(width, 16);
            size_t height4 = height < 5 ? 0 : AlignLo(height - 5, 4);
            size_t height8 = height < 9 ? 0 : AlignLo(height - 9, 8);
            size_t height16 = AlignLo(height, 16);
            size_t row = 0;
            // Bands of 16 source rows.
            for (; row < height16; row += 16)
            {
                size_t col = 0;
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_3x16x16(src + col * 3, srcStride, dst - col * dstStride, -dstStride);
                for (; col < width4; col += 4)
                    Avx512bw::TransformImageTranspose_3x16x4(src + col * 3, srcStride, dst - col * dstStride, -dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 16; ++i)
                        Base::CopyPixel<3>(src + col * 3 + i * srcStride, dst - col * dstStride + i * 3);
                src += 16 * srcStride;
                dst += 48;
            }
            // Bands of 8 source rows.
            for (; row < height8; row += 8)
            {
                size_t col = 0;
                for (; col < width4; col += 4)
                    Avx2::TransformImageTranspose_3x8x4(src + col * 3, srcStride, dst - col * dstStride, -dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 8; ++i)
                        Base::CopyPixel<3>(src + col * 3 + i * srcStride, dst - col * dstStride + i * 3);
                src += 8 * srcStride;
                dst += 24;
            }
            // Bands of 4 source rows.
            for (; row < height4; row += 4)
            {
                size_t col = 0;
                for (; col < width8; col += 8)
                    Avx2::TransformImageTranspose_3x4x8(src + col * 3, srcStride, dst - col * dstStride, -dstStride);
                for (; col < width4; col += 4)
                    Sse41::TransformImageTranspose_3x4x4(src + col * 3, srcStride, dst - col * dstStride, -dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 4; ++i)
                        Base::CopyPixel<3>(src + col * 3 + i * srcStride, dst - col * dstStride + i * 3);
                src += 4 * srcStride;
                dst += 12;
            }
            // Leftover source rows, one at a time.
            for (; row < height; ++row)
            {
                for (size_t col = 0; col < width; ++col)
                    Base::CopyPixel<3>(src + col * 3, dst - col * dstStride);
                src += srcStride;
                dst += 3;
            }
        }
194
195
        // Rotate90 for 4-byte pixels.
        // The source pixel at (row r, column c) is written to destination row (width - 1 - c),
        // byte offset 4 * r. Source rows are consumed in bands of 16, 8, 4 and finally 1 row;
        // column groups go to transpose kernels (called with a negated destination stride),
        // widest kernel first, and the remaining columns are copied pixel by pixel.
        template<> void TransformImageRotate90<4>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW16 = AlignLo(width, 16);
            const size_t alignedW8 = AlignLo(width, 8);
            const size_t alignedW4 = AlignLo(width, 4);
            const size_t alignedH16 = AlignLo(height, 16);
            const size_t alignedH8 = AlignLo(height, 8);
            const size_t alignedH4 = AlignLo(height, 4);
            // Source column 0 maps to the last destination row.
            dst += (width - 1) * dstStride;
            size_t y = 0;
            // Bands of 16 source rows.
            while (y < alignedH16)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx512bw::TransformImageTranspose_4x16x16(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < alignedW8; x += 8)
                    Avx512bw::TransformImageTranspose_4x16x8(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 4;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 16; ++k)
                    {
                        Base::CopyPixel<4>(from, to);
                        from += srcStride;
                        to += 4;
                    }
                }
                src += 16 * srcStride;
                dst += 64;
                y += 16;
            }
            // Bands of 8 source rows.
            while (y < alignedH8)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx512bw::TransformImageTranspose_4x8x16(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < alignedW8; x += 8)
                    Avx2::TransformImageTranspose_4x8x8(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < alignedW4; x += 4)
                    Avx2::TransformImageTranspose_4x8x4(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 4;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 8; ++k)
                    {
                        Base::CopyPixel<4>(from, to);
                        from += srcStride;
                        to += 4;
                    }
                }
                src += 8 * srcStride;
                dst += 32;
                y += 8;
            }
            // Bands of 4 source rows.
            while (y < alignedH4)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx512bw::TransformImageTranspose_4x4x16(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < alignedW8; x += 8)
                    Avx2::TransformImageTranspose_4x4x8(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < alignedW4; x += 4)
                    Sse41::TransformImageTranspose_4x4x4(src + x * 4, srcStride, dst - x * dstStride, -dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 4;
                    uint8_t* to = dst - x * dstStride;
                    for (size_t k = 0; k < 4; ++k)
                    {
                        Base::CopyPixel<4>(from, to);
                        from += srcStride;
                        to += 4;
                    }
                }
                src += 4 * srcStride;
                dst += 16;
                y += 4;
            }
            // Leftover source rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<4>(src + x * 4, dst - x * dstStride);
                src += srcStride;
                dst += 4;
                ++y;
            }
        }
256
257
        //-----------------------------------------------------------------------------------------
258
259
        template<size_t N> void TransformImageRotate180(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride);
260
261
        // Rotate180 for 1-byte pixels.
        // Source rows are read top to bottom while destination rows are written from the last
        // row upwards; within a row the source offset grows while the destination offset
        // shrinks. Each row is handled by 256-byte mirror steps, then 64-byte steps, then one
        // masked 64-byte step for whatever is left.
        template<> void TransformImageRotate180<1>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW = AlignLo(width, 64);
            const size_t rowBytes = width * 1;
            const size_t bytes64 = alignedW * 1;
            const size_t bytes256 = AlignLo(rowBytes, 256);
            // Masks for the partial step covering the last (width - alignedW) pixels.
            __mmask64 tail = TailMask64(width - alignedW);
            __mmask64 nose = NoseMask64(width - alignedW);
            // Point at the last 64-byte chunk of the last destination row.
            dst += (height - 1) * dstStride + (width - 64) * 1;
            for (size_t y = 0; y != height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes256)
                {
                    Avx512bw::TransformImageMirror1x256(src + pos, dst - pos);
                    pos += 256;
                }
                while (pos < bytes64)
                {
                    Avx512bw::TransformImageMirror1x64(src + pos, dst - pos);
                    pos += 64;
                }
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror1x64(src + pos, dst - pos, tail, nose);
                src += srcStride;
                dst -= dstStride;
            }
        }
280
281
        // Rotate180 for 2-byte pixels.
        // Source rows are read top to bottom while destination rows are written from the last
        // row upwards; within a row the source offset grows while the destination offset
        // shrinks. Each row is handled by 256-byte mirror steps, then 64-byte steps, then one
        // masked 64-byte step for whatever is left.
        template<> void TransformImageRotate180<2>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW = AlignLo(width, 32);
            const size_t rowBytes = width * 2;
            const size_t bytes64 = alignedW * 2;
            const size_t bytes256 = AlignLo(rowBytes, 256);
            // Masks for the partial step covering the last (width - alignedW) pixels.
            __mmask32 tail = TailMask32(width - alignedW);
            __mmask32 nose = NoseMask32(width - alignedW);
            // Point at the last 32-pixel chunk of the last destination row.
            dst += (height - 1) * dstStride + (width - 32) * 2;
            for (size_t y = 0; y != height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes256)
                {
                    Avx512bw::TransformImageMirror2x128(src + pos, dst - pos);
                    pos += 256;
                }
                while (pos < bytes64)
                {
                    Avx512bw::TransformImageMirror2x32(src + pos, dst - pos);
                    pos += 64;
                }
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror2x32(src + pos, dst - pos, tail, nose);
                src += srcStride;
                dst -= dstStride;
            }
        }
300
301
        // Rotate180 for 3-byte pixels.
        // Source rows are read top to bottom while destination rows are written from the last
        // row upwards; within a row the source offset grows while the destination offset
        // shrinks. Each row is handled by 192-byte (64-pixel) mirror steps, then 48-byte
        // (16-pixel) steps, then one masked 16-pixel step for whatever is left.
        template<> void TransformImageRotate180<3>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW = AlignLo(width, 16);
            const size_t rowBytes = width * 3;
            const size_t bytes48 = alignedW * 3;
            const size_t bytes192 = AlignLo(width, 64) * 3;
            // Masks for the partial step; the nose mask is limited to its low 48 bits.
            __mmask64 tail = TailMask64(rowBytes - bytes48);
            __mmask64 nose = 0x0000FFFFFFFFFFFF & NoseMask64(rowBytes - bytes48 + 16);
            // Point 48 bytes before the end of the last destination row.
            dst += (height - 1) * dstStride + width * 3 - 48;
            for (size_t y = 0; y != height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes192)
                {
                    // The 64-pixel kernel is given a destination 16 bytes lower than the 16-pixel one.
                    Avx512bw::TransformImageMirror3x64(src + pos, dst - pos - 16);
                    pos += 192;
                }
                while (pos < bytes48)
                {
                    Avx512bw::TransformImageMirror3x16(src + pos, dst - pos);
                    pos += 48;
                }
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror3x16(src + pos, dst - pos, tail, nose);
                src += srcStride;
                dst -= dstStride;
            }
        }
320
321
        // Rotate180 for 4-byte pixels.
        // Source rows are read top to bottom while destination rows are written from the last
        // row upwards; within a row the source offset grows while the destination offset
        // shrinks. Each row is handled by 256-byte mirror steps, then 64-byte steps, then one
        // masked 64-byte step for whatever is left.
        template<> void TransformImageRotate180<4>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW = AlignLo(width, 16);
            const size_t rowBytes = width * 4;
            const size_t bytes64 = alignedW * 4;
            const size_t bytes256 = AlignLo(rowBytes, 256);
            // Masks for the partial step covering the last (width - alignedW) pixels.
            __mmask16 tail = TailMask16(width - alignedW);
            __mmask16 nose = NoseMask16(width - alignedW);
            // Point at the last 16-pixel chunk of the last destination row.
            dst += (height - 1) * dstStride + (width - 16) * 4;
            for (size_t y = 0; y != height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes256)
                {
                    Avx512bw::TransformImageMirror4x64(src + pos, dst - pos);
                    pos += 256;
                }
                while (pos < bytes64)
                {
                    Avx512bw::TransformImageMirror4x16(src + pos, dst - pos);
                    pos += 64;
                }
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror4x16(src + pos, dst - pos, tail, nose);
                src += srcStride;
                dst -= dstStride;
            }
        }
340
341
        //-----------------------------------------------------------------------------------------
342
343
        template<size_t N> void TransformImageRotate270(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride);
344
345
        // Rotate270 for 1-byte pixels.
        // The source pixel at (row r, column c) is written to destination row c, byte offset
        // (height - 1 - r). Source rows are consumed in bands of 64, 16, 8 and finally 1 row;
        // groups of 16 columns go to a transpose kernel that is started on the band's last
        // source row with a negated source stride, and the remaining columns are copied
        // pixel by pixel.
        template<> void TransformImageRotate270<1>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW16 = AlignLo(width, 16);
            const size_t alignedH64 = AlignLo(height, 64);
            const size_t alignedH16 = AlignLo(height, 16);
            const size_t alignedH8 = AlignLo(height, 8);
            // Source row 0 maps to the last pixel of every destination row.
            dst += (height - 1) * 1;
            size_t y = 0;
            // Bands of 64 source rows.
            while (y < alignedH64)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx512bw::TransformImageTranspose_1x64x16(src + x + 63 * srcStride, -srcStride, dst + x * dstStride - 63, dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x;
                    uint8_t* to = dst + x * dstStride;
                    for (size_t k = 0; k < 64; ++k)
                    {
                        Base::CopyPixel<1>(from, to);
                        from += srcStride;
                        to -= 1;
                    }
                }
                src += 64 * srcStride;
                dst -= 64;
                y += 64;
            }
            // Bands of 16 source rows.
            while (y < alignedH16)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx2::TransformImageTranspose_1x16x16(src + x + 15 * srcStride, -srcStride, dst + x * dstStride - 15, dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x;
                    uint8_t* to = dst + x * dstStride;
                    for (size_t k = 0; k < 16; ++k)
                    {
                        Base::CopyPixel<1>(from, to);
                        from += srcStride;
                        to -= 1;
                    }
                }
                src += 16 * srcStride;
                dst -= 16;
                y += 16;
            }
            // Bands of 8 source rows.
            while (y < alignedH8)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Sse41::TransformImageTranspose_1x8x16(src + x + 7 * srcStride, -srcStride, dst + x * dstStride - 7, dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x;
                    uint8_t* to = dst + x * dstStride;
                    for (size_t k = 0; k < 8; ++k)
                    {
                        Base::CopyPixel<1>(from, to);
                        from += srcStride;
                        to -= 1;
                    }
                }
                src += 8 * srcStride;
                dst -= 8;
                y += 8;
            }
            // Leftover source rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<1>(src + x, dst + x * dstStride);
                src += srcStride;
                dst -= 1;
                ++y;
            }
        }
394
395
        // Rotate270 for 2-byte pixels.
        // The source pixel at (row r, column c) is written to destination row c, byte offset
        // 2 * (height - 1 - r). Source rows are consumed in bands of 32, 16, 8 and finally
        // 1 row; column groups go to transpose kernels that are started on the band's last
        // source row with a negated source stride, and the remaining columns are copied
        // pixel by pixel.
        template<> void TransformImageRotate270<2>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t alignedW16 = AlignLo(width, 16);
            const size_t alignedW8 = AlignLo(width, 8);
            const size_t alignedH32 = AlignLo(height, 32);
            const size_t alignedH16 = AlignLo(height, 16);
            const size_t alignedH8 = AlignLo(height, 8);
            // Source row 0 maps to the last pixel of every destination row.
            dst += (height - 1) * 2;
            size_t y = 0;
            // Bands of 32 source rows.
            while (y < alignedH32)
            {
                size_t x = 0;
                for (; x < alignedW8; x += 8)
                    Avx512bw::TransformImageTranspose_2x32x8(src + x * 2 + 31 * srcStride, -srcStride, dst + x * dstStride - 62, dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 2;
                    uint8_t* to = dst + x * dstStride;
                    for (size_t k = 0; k < 32; ++k)
                    {
                        Base::CopyPixel<2>(from, to);
                        from += srcStride;
                        to -= 2;
                    }
                }
                src += 32 * srcStride;
                dst -= 64;
                y += 32;
            }
            // Bands of 16 source rows.
            while (y < alignedH16)
            {
                size_t x = 0;
                for (; x < alignedW8; x += 8)
                    Avx2::TransformImageTranspose_2x16x8(src + x * 2 + 15 * srcStride, -srcStride, dst + x * dstStride - 30, dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 2;
                    uint8_t* to = dst + x * dstStride;
                    for (size_t k = 0; k < 16; ++k)
                    {
                        Base::CopyPixel<2>(from, to);
                        from += srcStride;
                        to -= 2;
                    }
                }
                src += 16 * srcStride;
                dst -= 32;
                y += 16;
            }
            // Bands of 8 source rows: 16-column kernel first, then the 8-column one.
            while (y < alignedH8)
            {
                size_t x = 0;
                for (; x < alignedW16; x += 16)
                    Avx2::TransformImageTranspose_2x8x16(src + x * 2 + 7 * srcStride, -srcStride, dst + x * dstStride - 14, dstStride);
                for (; x < alignedW8; x += 8)
                    Sse41::TransformImageTranspose_2x8x8(src + x * 2 + 7 * srcStride, -srcStride, dst + x * dstStride - 14, dstStride);
                for (; x < width; ++x)
                {
                    const uint8_t* from = src + x * 2;
                    uint8_t* to = dst + x * dstStride;
                    for (size_t k = 0; k < 8; ++k)
                    {
                        Base::CopyPixel<2>(from, to);
                        from += srcStride;
                        to -= 2;
                    }
                }
                src += 8 * srcStride;
                dst -= 16;
                y += 8;
            }
            // Leftover source rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<2>(src + x * 2, dst + x * dstStride);
                src += srcStride;
                dst -= 2;
                ++y;
            }
        }
447
448
        // Rotate270 for 3-byte pixels.
        // Unlike the other Rotate270 specializations in this file, this one walks the source
        // from its last row upwards (negated source stride) and fills each destination row
        // from its start: the source pixel at (row r, column c) is written to destination
        // row c, byte offset 3 * (height - 1 - r). Source rows are consumed in bands of 16,
        // 8, 4 and finally 1 row; column groups go to transpose kernels and the remaining
        // columns are copied pixel by pixel.
        //
        // The 4- and 8-wide bounds keep a margin of 5 / 9 pixels below the real dimension.
        // NOTE(review): presumably this margin exists because the narrow 3-byte kernels touch
        // memory past the pixels they process - confirm against the kernel implementations.
        // The margin subtraction is done on size_t, so for dimensions smaller than the margin
        // it would wrap around to a huge value; such dimensions are clamped to a bound of 0
        // here so that they fall through to the scalar path.
        template<> void TransformImageRotate270<3>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Start at the last source row.
            src += (height - 1) * srcStride;
            size_t width4 = width < 5 ? 0 : AlignLo(width - 5, 4);
            size_t width8 = width < 9 ? 0 : AlignLo(width - 9, 8);
            size_t width16 = AlignLo(width, 16);
            size_t height4 = height < 5 ? 0 : AlignLo(height - 5, 4);
            size_t height8 = height < 9 ? 0 : AlignLo(height - 9, 8);
            size_t height16 = AlignLo(height, 16);
            size_t row = 0;
            // Bands of 16 source rows.
            for (; row < height16; row += 16)
            {
                size_t col = 0;
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_3x16x16(src + col * 3, -srcStride, dst + col * dstStride, dstStride);
                for (; col < width4; col += 4)
                    Avx512bw::TransformImageTranspose_3x16x4(src + col * 3, -srcStride, dst + col * dstStride, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 16; ++i)
                        Base::CopyPixel<3>(src + col * 3 - i * srcStride, dst + col * dstStride + i * 3);
                src -= 16 * srcStride;
                dst += 48;
            }
            // Bands of 8 source rows.
            for (; row < height8; row += 8)
            {
                size_t col = 0;
                for (; col < width4; col += 4)
                    Avx2::TransformImageTranspose_3x8x4(src + col * 3, -srcStride, dst + col * dstStride, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 8; ++i)
                        Base::CopyPixel<3>(src + col * 3 - i * srcStride, dst + col * dstStride + i * 3);
                src -= 8 * srcStride;
                dst += 24;
            }
            // Bands of 4 source rows.
            for (; row < height4; row += 4)
            {
                size_t col = 0;
                for (; col < width8; col += 8)
                    Avx2::TransformImageTranspose_3x4x8(src + col * 3, -srcStride, dst + col * dstStride, dstStride);
                for (; col < width4; col += 4)
                    Sse41::TransformImageTranspose_3x4x4(src + col * 3, -srcStride, dst + col * dstStride, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 4; ++i)
                        Base::CopyPixel<3>(src + col * 3 - i * srcStride, dst + col * dstStride + i * 3);
                src -= 4 * srcStride;
                dst += 12;
            }
            // Leftover source rows, one at a time.
            for (; row < height; ++row)
            {
                for (size_t col = 0; col < width; ++col)
                    Base::CopyPixel<3>(src + col * 3, dst + col * dstStride);
                src -= srcStride;
                dst += 3;
            }
        }
503
504
        // Rotate270 for 4-byte pixels.
        // The source pixel at (row r, column c) is written to destination row c, byte offset
        // 4 * (height - 1 - r). Source rows are consumed in bands of 16, 8, 4 and finally
        // 1 row; column groups go to transpose kernels that are started on the band's last
        // source row with a negated source stride, widest kernel first, and the remaining
        // columns are copied pixel by pixel.
        template<> void TransformImageRotate270<4>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Source row 0 maps to the last pixel of every destination row.
            dst += (height - 1) * 4;
            size_t width4 = AlignLo(width, 4);
            size_t width8 = AlignLo(width, 8);
            size_t width16 = AlignLo(width, 16);
            size_t height4 = AlignLo(height, 4);
            size_t height8 = AlignLo(height, 8);
            size_t height16 = AlignLo(height, 16);
            size_t row = 0;
            // Bands of 16 source rows.
            for (; row < height16; row += 16)
            {
                size_t col = 0;
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_4x16x16(src + col * 4 + 15 * srcStride, -srcStride, dst + col * dstStride - 60, dstStride);
                for (; col < width8; col += 8)
                    Avx512bw::TransformImageTranspose_4x16x8(src + col * 4 + 15 * srcStride, -srcStride, dst + col * dstStride - 60, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 16; ++i)
                        Base::CopyPixel<4>(src + col * 4 + i * srcStride, dst + col * dstStride - i * 4);
                src += 16 * srcStride;
                dst -= 64;
            }
            // Bands of 8 source rows.
            for (; row < height8; row += 8)
            {
                size_t col = 0;
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_4x8x16(src + col * 4 + 7 * srcStride, -srcStride, dst + col * dstStride - 28, dstStride);
                for (; col < width8; col += 8)
                    Avx2::TransformImageTranspose_4x8x8(src + col * 4 + 7 * srcStride, -srcStride, dst + col * dstStride - 28, dstStride);
                for (; col < width4; col += 4)
                    Avx2::TransformImageTranspose_4x8x4(src + col * 4 + 7 * srcStride, -srcStride, dst + col * dstStride - 28, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 8; ++i)
                        Base::CopyPixel<4>(src + col * 4 + i * srcStride, dst + col * dstStride - i * 4);
                src += 8 * srcStride;
                dst -= 32;
            }
            // Bands of 4 source rows.
            for (; row < height4; row += 4)
            {
                size_t col = 0;
                // The 16-column kernel must advance by 16 columns per call, like every other
                // 16-column loop in this file. A step of 8 would make consecutive calls
                // overlap and let the last call start at width16 - 8, i.e. cover columns up
                // to width16 + 7, which can lie beyond the image width.
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_4x4x16(src + col * 4 + 3 * srcStride, -srcStride, dst + col * dstStride - 12, dstStride);
                for (; col < width8; col += 8)
                    Avx2::TransformImageTranspose_4x4x8(src + col * 4 + 3 * srcStride, -srcStride, dst + col * dstStride - 12, dstStride);
                for (; col < width4; col += 4)
                    Sse41::TransformImageTranspose_4x4x4(src + col * 4 + 3 * srcStride, -srcStride, dst + col * dstStride - 12, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 4; ++i)
                        Base::CopyPixel<4>(src + col * 4 + i * srcStride, dst + col * dstStride - i * 4);
                src += 4 * srcStride;
                dst -= 16;
            }
            // Leftover source rows, one at a time.
            for (; row < height; ++row)
            {
                for (size_t col = 0; col < width; ++col)
                    Base::CopyPixel<4>(src + col * 4, dst + col * dstStride);
                src += srcStride;
                dst -= 4;
            }
        }
565
566
        //-----------------------------------------------------------------------------------------
567
568
        template<size_t N> void TransformImageTransposeRotate0(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride);
569
570
        // Transposes an image of 1-byte pixels: the source pixel at (row, col) is written to the
        // destination at (col, row).
        // Source rows are consumed in bands of 64, 16 and 8; inside a band, groups of 16 columns go to the
        // kernels from the Avx512bw, Avx2 and Sse41 namespaces respectively. Columns that do not fill a
        // 16-column tile, and rows that do not fill an 8-row band, are copied pixel by pixel.
        template<> void TransformImageTransposeRotate0<1>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t simdW16 = AlignLo(width, 16);
            const size_t simdH64 = AlignLo(height, 64);
            const size_t simdH16 = AlignLo(height, 16);
            const size_t simdH8 = AlignLo(height, 8);
            size_t y = 0;

            // Bands of 64 source rows.
            while (y < simdH64)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_1x64x16(src + x, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 64; ++k)
                        Base::CopyPixel<1>(src + x + k * srcStride, dst + x * dstStride + k);
                    ++x;
                }
                src += 64 * srcStride;
                dst += 64;
                y += 64;
            }

            // Bands of 16 source rows.
            while (y < simdH16)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx2::TransformImageTranspose_1x16x16(src + x, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 16; ++k)
                        Base::CopyPixel<1>(src + x + k * srcStride, dst + x * dstStride + k);
                    ++x;
                }
                src += 16 * srcStride;
                dst += 16;
                y += 16;
            }

            // Bands of 8 source rows.
            while (y < simdH8)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Sse41::TransformImageTranspose_1x8x16(src + x, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 8; ++k)
                        Base::CopyPixel<1>(src + x + k * srcStride, dst + x * dstStride + k);
                    ++x;
                }
                src += 8 * srcStride;
                dst += 8;
                y += 8;
            }

            // Remaining rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<1>(src + x, dst + x * dstStride);
                src += srcStride;
                dst += 1;
                ++y;
            }
        }
618
619
        // Transposes an image of 2-byte pixels: the source pixel at (row, col) is written to the
        // destination at (col, row).
        // Source rows are consumed in bands of 32, 16 and 8; each band picks the widest column kernel
        // that still fits (16 or 8 columns) and copies the leftover columns pixel by pixel. Rows left
        // after the last 8-row band are copied one pixel at a time.
        template<> void TransformImageTransposeRotate0<2>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t simdW16 = AlignLo(width, 16);
            const size_t simdW8 = AlignLo(width, 8);
            const size_t simdH32 = AlignLo(height, 32);
            const size_t simdH16 = AlignLo(height, 16);
            const size_t simdH8 = AlignLo(height, 8);
            size_t y = 0;

            // Bands of 32 source rows (64 destination bytes per destination row).
            while (y < simdH32)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_2x32x16(src + x * 2, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx512bw::TransformImageTranspose_2x32x8(src + x * 2, srcStride, dst + x * dstStride, dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 32; ++k)
                        Base::CopyPixel<2>(src + x * 2 + k * srcStride, dst + x * dstStride + k * 2);
                    ++x;
                }
                src += 32 * srcStride;
                dst += 64;
                y += 32;
            }

            // Bands of 16 source rows.
            while (y < simdH16)
            {
                size_t x = 0;
                while (x < simdW8)
                {
                    Avx2::TransformImageTranspose_2x16x8(src + x * 2, srcStride, dst + x * dstStride, dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 16; ++k)
                        Base::CopyPixel<2>(src + x * 2 + k * srcStride, dst + x * dstStride + k * 2);
                    ++x;
                }
                src += 16 * srcStride;
                dst += 32;
                y += 16;
            }

            // Bands of 8 source rows.
            while (y < simdH8)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx2::TransformImageTranspose_2x8x16(src + x * 2, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Sse41::TransformImageTranspose_2x8x8(src + x * 2, srcStride, dst + x * dstStride, dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 8; ++k)
                        Base::CopyPixel<2>(src + x * 2 + k * srcStride, dst + x * dstStride + k * 2);
                    ++x;
                }
                src += 8 * srcStride;
                dst += 16;
                y += 8;
            }

            // Remaining rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<2>(src + x * 2, dst + x * dstStride);
                src += srcStride;
                dst += 2;
                ++y;
            }
        }
672
673
        // Transposes an image of 3-byte pixels: the source pixel at (row, col) is written to the
        // destination at (col, row).
        // Source rows are consumed in bands of 16, 8 and 4; inside a band the widest fitting column
        // kernel is used and the leftover columns are copied pixel by pixel. Rows left after the last
        // band are copied one pixel at a time.
        template<> void TransformImageTransposeRotate0<3>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // The 4- and 8-wide bounds keep a margin of 5 / 9 pixels before the image edge
            // (presumably because those kernels touch more bytes than the 3-byte pixels they produce -
            // TODO confirm against the kernel implementations).
            // The subtraction is unsigned: for images smaller than the margin it would wrap around and
            // yield a huge bound, sending the SIMD loops far outside the buffers. Clamp to 0 so that
            // small images fall through to the scalar copy.
            size_t width4 = width > 5 ? AlignLo(width - 5, 4) : 0;
            size_t width8 = width > 9 ? AlignLo(width - 9, 8) : 0;
            size_t width16 = AlignLo(width, 16);
            size_t height4 = height > 5 ? AlignLo(height - 5, 4) : 0;
            size_t height8 = height > 9 ? AlignLo(height - 9, 8) : 0;
            size_t height16 = AlignLo(height, 16);
            size_t row = 0;
            // Bands of 16 source rows (48 destination bytes per destination row).
            for (; row < height16; row += 16)
            {
                size_t col = 0;
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_3x16x16(src + col * 3, srcStride, dst + col * dstStride, dstStride);
                for (; col < width4; col += 4)
                    Avx512bw::TransformImageTranspose_3x16x4(src + col * 3, srcStride, dst + col * dstStride, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 16; ++i)
                        Base::CopyPixel<3>(src + col * 3 + i * srcStride, dst + col * dstStride + i * 3);
                src += 16 * srcStride;
                dst += 48;
            }
            // Bands of 8 source rows.
            for (; row < height8; row += 8)
            {
                size_t col = 0;
                for (; col < width4; col += 4)
                    Avx2::TransformImageTranspose_3x8x4(src + col * 3, srcStride, dst + col * dstStride, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 8; ++i)
                        Base::CopyPixel<3>(src + col * 3 + i * srcStride, dst + col * dstStride + i * 3);
                src += 8 * srcStride;
                dst += 24;
            }
            // Bands of 4 source rows.
            for (; row < height4; row += 4)
            {
                size_t col = 0;
                for (; col < width8; col += 8)
                    Avx2::TransformImageTranspose_3x4x8(src + col * 3, srcStride, dst + col * dstStride, dstStride);
                for (; col < width4; col += 4)
                    Sse41::TransformImageTranspose_3x4x4(src + col * 3, srcStride, dst + col * dstStride, dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 4; ++i)
                        Base::CopyPixel<3>(src + col * 3 + i * srcStride, dst + col * dstStride + i * 3);
                src += 4 * srcStride;
                dst += 12;
            }
            // Remaining rows, one pixel at a time.
            for (; row < height; ++row)
            {
                for (size_t col = 0; col < width; ++col)
                    Base::CopyPixel<3>(src + col * 3, dst + col * dstStride);
                src += srcStride;
                dst += 3;
            }
        }
727
728
        // Transposes an image of 4-byte pixels: the source pixel at (row, col) is written to the
        // destination at (col, row).
        // Source rows are consumed in bands of 16, 8 and 4; inside a band the columns go to the widest
        // fitting kernel (16, 8 or 4 columns) and the leftover columns are copied pixel by pixel. Rows
        // left after the last 4-row band are copied one pixel at a time.
        template<> void TransformImageTransposeRotate0<4>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            const size_t simdW16 = AlignLo(width, 16);
            const size_t simdW8 = AlignLo(width, 8);
            const size_t simdW4 = AlignLo(width, 4);
            const size_t simdH16 = AlignLo(height, 16);
            const size_t simdH8 = AlignLo(height, 8);
            const size_t simdH4 = AlignLo(height, 4);
            size_t y = 0;

            // Bands of 16 source rows (64 destination bytes per destination row).
            while (y < simdH16)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_4x16x16(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx512bw::TransformImageTranspose_4x16x8(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 16; ++k)
                        Base::CopyPixel<4>(src + x * 4 + k * srcStride, dst + x * dstStride + k * 4);
                    ++x;
                }
                src += 16 * srcStride;
                dst += 64;
                y += 16;
            }

            // Bands of 8 source rows.
            while (y < simdH8)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_4x8x16(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx2::TransformImageTranspose_4x8x8(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 8;
                }
                while (x < simdW4)
                {
                    Avx2::TransformImageTranspose_4x8x4(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 4;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 8; ++k)
                        Base::CopyPixel<4>(src + x * 4 + k * srcStride, dst + x * dstStride + k * 4);
                    ++x;
                }
                src += 8 * srcStride;
                dst += 32;
                y += 8;
            }

            // Bands of 4 source rows.
            while (y < simdH4)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_4x4x16(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx2::TransformImageTranspose_4x4x8(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 8;
                }
                while (x < simdW4)
                {
                    Sse41::TransformImageTranspose_4x4x4(src + x * 4, srcStride, dst + x * dstStride, dstStride);
                    x += 4;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 4; ++k)
                        Base::CopyPixel<4>(src + x * 4 + k * srcStride, dst + x * dstStride + k * 4);
                    ++x;
                }
                src += 4 * srcStride;
                dst += 16;
                y += 4;
            }

            // Remaining rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<4>(src + x * 4, dst + x * dstStride);
                src += srcStride;
                dst += 4;
                ++y;
            }
        }
789
790
        //-----------------------------------------------------------------------------------------
791
792
        // Primary template, declared only: the explicit specializations for N = 1, 2, 3 and 4 below
        // provide the implementations. N is the pixel size in bytes.
        template<size_t N> void TransformImageTransposeRotate90(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride);
793
794
        // Row-by-row transform for 1-byte pixels: every source row is read front to back while the
        // destination row is filled back to front, in chunks of 256 and 64 bytes plus one masked chunk
        // for the remainder. The pixel reordering inside a chunk is done by the TransformImageMirror1x*
        // kernels (presumably a horizontal mirror - the kernels are defined elsewhere).
        template<> void TransformImageTransposeRotate90<1>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Start at the last 64-byte chunk of the destination row; chunks are addressed as dst - offset.
            dst += width - 64;

            const size_t alignedWidth = AlignLo(width, 64);
            const size_t rest = width - alignedWidth;
            const __mmask64 tailMask = TailMask64(rest);
            const __mmask64 noseMask = NoseMask64(rest);
            const size_t rowBytes = width;
            const size_t bytes64 = alignedWidth;
            const size_t bytes256 = AlignLo(rowBytes, 256);

            for (size_t y = 0; y < height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes256)
                {
                    Avx512bw::TransformImageMirror1x256(src + pos, dst - pos);
                    pos += 256;
                }
                while (pos < bytes64)
                {
                    Avx512bw::TransformImageMirror1x64(src + pos, dst - pos);
                    pos += 64;
                }
                // Partial last chunk: handled with the tail / nose masks.
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror1x64(src + pos, dst - pos, tailMask, noseMask);
                src += srcStride;
                dst += dstStride;
            }
        }
813
814
        // Row-by-row transform for 2-byte pixels: every source row is read front to back while the
        // destination row is filled back to front, in chunks of 256 and 64 bytes plus one masked chunk
        // for the remainder. The pixel reordering inside a chunk is done by the TransformImageMirror2x*
        // kernels (presumably a horizontal mirror - the kernels are defined elsewhere).
        template<> void TransformImageTransposeRotate90<2>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Start at the last 32-pixel (64-byte) chunk of the destination row.
            dst += (width - 32) * 2;

            const size_t alignedWidth = AlignLo(width, 32);
            const size_t rest = width - alignedWidth;
            const __mmask32 tailMask = TailMask32(rest);
            const __mmask32 noseMask = NoseMask32(rest);
            const size_t rowBytes = width * 2;
            const size_t bytes64 = alignedWidth * 2;
            const size_t bytes256 = AlignLo(rowBytes, 256);

            for (size_t y = 0; y < height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes256)
                {
                    Avx512bw::TransformImageMirror2x128(src + pos, dst - pos);
                    pos += 256;
                }
                while (pos < bytes64)
                {
                    Avx512bw::TransformImageMirror2x32(src + pos, dst - pos);
                    pos += 64;
                }
                // Partial last chunk: handled with the tail / nose masks.
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror2x32(src + pos, dst - pos, tailMask, noseMask);
                src += srcStride;
                dst += dstStride;
            }
        }
833
834
        // Row-by-row transform for 3-byte pixels: every source row is read front to back while the
        // destination row is filled back to front, in chunks of 64 pixels (192 bytes) and 16 pixels
        // (48 bytes) plus one masked chunk for the remainder. The pixel reordering inside a chunk is done
        // by the TransformImageMirror3x* kernels (presumably a horizontal mirror - defined elsewhere).
        template<> void TransformImageTransposeRotate90<3>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Start at the last 16-pixel (48-byte) chunk of the destination row.
            dst += (width - 16) * 3;

            const size_t alignedWidth = AlignLo(width, 16);
            const size_t rowBytes = width * 3;
            const size_t bytes48 = alignedWidth * 3;
            const size_t bytes192 = AlignLo(width, 64) * 3;
            const size_t restBytes = rowBytes - bytes48;
            const __mmask64 tailMask = TailMask64(restBytes);
            // The nose mask is shifted by 16 and limited to the low 48 bits (one 48-byte chunk).
            const __mmask64 noseMask = NoseMask64(restBytes + 16) & 0x0000FFFFFFFFFFFF;

            for (size_t y = 0; y < height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes192)
                {
                    // The 64-pixel kernel is given a destination 16 bytes lower than the 16-pixel one.
                    Avx512bw::TransformImageMirror3x64(src + pos, dst - pos - 16);
                    pos += 192;
                }
                while (pos < bytes48)
                {
                    Avx512bw::TransformImageMirror3x16(src + pos, dst - pos);
                    pos += 48;
                }
                // Partial last chunk: handled with the tail / nose masks.
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror3x16(src + pos, dst - pos, tailMask, noseMask);
                src += srcStride;
                dst += dstStride;
            }
        }
853
854
        // Row-by-row transform for 4-byte pixels: every source row is read front to back while the
        // destination row is filled back to front, in chunks of 256 and 64 bytes plus one masked chunk
        // for the remainder. The pixel reordering inside a chunk is done by the TransformImageMirror4x*
        // kernels (presumably a horizontal mirror - the kernels are defined elsewhere).
        template<> void TransformImageTransposeRotate90<4>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Start at the last 16-pixel (64-byte) chunk of the destination row.
            dst += (width - 16) * 4;

            const size_t alignedWidth = AlignLo(width, 16);
            const size_t rest = width - alignedWidth;
            const __mmask16 tailMask = TailMask16(rest);
            const __mmask16 noseMask = NoseMask16(rest);
            const size_t rowBytes = width * 4;
            const size_t bytes64 = alignedWidth * 4;
            const size_t bytes256 = AlignLo(rowBytes, 256);

            for (size_t y = 0; y < height; ++y)
            {
                size_t pos = 0;
                while (pos < bytes256)
                {
                    Avx512bw::TransformImageMirror4x64(src + pos, dst - pos);
                    pos += 256;
                }
                while (pos < bytes64)
                {
                    Avx512bw::TransformImageMirror4x16(src + pos, dst - pos);
                    pos += 64;
                }
                // Partial last chunk: handled with the tail / nose masks.
                if (pos < rowBytes)
                    Avx512bw::TransformImageMirror4x16(src + pos, dst - pos, tailMask, noseMask);
                src += srcStride;
                dst += dstStride;
            }
        }
873
874
        //-----------------------------------------------------------------------------------------
875
876
        // Primary template, declared only: the explicit specializations for N = 1, 2, 3 and 4 below
        // provide the implementations. N is the pixel size in bytes.
        template<size_t N> void TransformImageTransposeRotate180(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride);
877
878
        // Anti-transposes an image of 1-byte pixels: the source pixel at (row, col) is written to the
        // destination at (width - 1 - col, height - 1 - row).
        // Source rows are consumed in bands of 64, 16 and 8. The transpose kernels are started on the
        // last row of a band with negated source and destination strides, which yields the reversed
        // order. Leftover columns and rows are copied pixel by pixel.
        template<> void TransformImageTransposeRotate180<1>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Move to the last pixel of the last destination row; from here on dst only walks backwards.
            dst += (width - 1) * dstStride + (height - 1);

            const size_t simdW16 = AlignLo(width, 16);
            const size_t simdH64 = AlignLo(height, 64);
            const size_t simdH16 = AlignLo(height, 16);
            const size_t simdH8 = AlignLo(height, 8);
            size_t y = 0;

            // Bands of 64 source rows.
            while (y < simdH64)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_1x64x16(src + x + 63 * srcStride, -srcStride, dst - x * dstStride - 63, -dstStride);
                    x += 16;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 64; ++k)
                        Base::CopyPixel<1>(src + x + k * srcStride, dst - x * dstStride - k);
                    ++x;
                }
                src += 64 * srcStride;
                dst -= 64;
                y += 64;
            }

            // Bands of 16 source rows.
            while (y < simdH16)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx2::TransformImageTranspose_1x16x16(src + x + 15 * srcStride, -srcStride, dst - x * dstStride - 15, -dstStride);
                    x += 16;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 16; ++k)
                        Base::CopyPixel<1>(src + x + k * srcStride, dst - x * dstStride - k);
                    ++x;
                }
                src += 16 * srcStride;
                dst -= 16;
                y += 16;
            }

            // Bands of 8 source rows.
            while (y < simdH8)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Sse41::TransformImageTranspose_1x8x16(src + x + 7 * srcStride, -srcStride, dst - x * dstStride - 7, -dstStride);
                    x += 16;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 8; ++k)
                        Base::CopyPixel<1>(src + x + k * srcStride, dst - x * dstStride - k);
                    ++x;
                }
                src += 8 * srcStride;
                dst -= 8;
                y += 8;
            }

            // Remaining rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<1>(src + x, dst - x * dstStride);
                src += srcStride;
                dst -= 1;
                ++y;
            }
        }
927
928
        // Anti-transposes an image of 2-byte pixels: the source pixel at (row, col) is written to the
        // destination at (width - 1 - col, height - 1 - row).
        // Source rows are consumed in bands of 32, 16 and 8. The transpose kernels are started on the
        // last row of a band with negated source and destination strides, which yields the reversed
        // order. Leftover columns and rows are copied pixel by pixel.
        template<> void TransformImageTransposeRotate180<2>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Move to the last pixel of the last destination row; from here on dst only walks backwards.
            dst += (width - 1) * dstStride + (height - 1) * 2;

            const size_t simdW16 = AlignLo(width, 16);
            const size_t simdW8 = AlignLo(width, 8);
            const size_t simdH32 = AlignLo(height, 32);
            const size_t simdH16 = AlignLo(height, 16);
            const size_t simdH8 = AlignLo(height, 8);
            size_t y = 0;

            // Bands of 32 source rows (only the 8-column kernel is used here).
            while (y < simdH32)
            {
                size_t x = 0;
                while (x < simdW8)
                {
                    Avx512bw::TransformImageTranspose_2x32x8(src + x * 2 + 31 * srcStride, -srcStride, dst - x * dstStride - 62, -dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 32; ++k)
                        Base::CopyPixel<2>(src + x * 2 + k * srcStride, dst - x * dstStride - k * 2);
                    ++x;
                }
                src += 32 * srcStride;
                dst -= 64;
                y += 32;
            }

            // Bands of 16 source rows.
            while (y < simdH16)
            {
                size_t x = 0;
                while (x < simdW8)
                {
                    Avx2::TransformImageTranspose_2x16x8(src + x * 2 + 15 * srcStride, -srcStride, dst - x * dstStride - 30, -dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 16; ++k)
                        Base::CopyPixel<2>(src + x * 2 + k * srcStride, dst - x * dstStride - k * 2);
                    ++x;
                }
                src += 16 * srcStride;
                dst -= 32;
                y += 16;
            }

            // Bands of 8 source rows.
            while (y < simdH8)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx2::TransformImageTranspose_2x8x16(src + x * 2 + 7 * srcStride, -srcStride, dst - x * dstStride - 14, -dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Sse41::TransformImageTranspose_2x8x8(src + x * 2 + 7 * srcStride, -srcStride, dst - x * dstStride - 14, -dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 8; ++k)
                        Base::CopyPixel<2>(src + x * 2 + k * srcStride, dst - x * dstStride - k * 2);
                    ++x;
                }
                src += 8 * srcStride;
                dst -= 16;
                y += 8;
            }

            // Remaining rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<2>(src + x * 2, dst - x * dstStride);
                src += srcStride;
                dst -= 2;
                ++y;
            }
        }
980
981
        // Anti-transposes an image of 3-byte pixels: the source pixel at (row, col) is written to the
        // destination at (width - 1 - col, height - 1 - row).
        // Unlike the 1-, 2- and 4-byte variants, this one walks the source backwards (starting from its
        // last pixel) and the destination forwards. Source rows are consumed in bands of 16, 8 and 4;
        // leftover columns and rows are copied pixel by pixel.
        template<> void TransformImageTransposeRotate180<3>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Move to the last pixel of the last source row.
            src += (height - 1) * srcStride + (width - 1) * 3;
            // The 4- and 8-wide bounds keep a margin of 5 / 9 pixels before the image edge
            // (presumably because those kernels touch more bytes than the 3-byte pixels they produce -
            // TODO confirm against the kernel implementations).
            // The subtraction is unsigned: for images smaller than the margin it would wrap around and
            // yield a huge bound, sending the SIMD loops far outside the buffers. Clamp to 0 so that
            // small images fall through to the scalar copy.
            size_t width4 = width > 5 ? AlignLo(width - 5, 4) : 0;
            size_t width8 = width > 9 ? AlignLo(width - 9, 8) : 0;
            size_t width16 = AlignLo(width, 16);
            size_t height4 = height > 5 ? AlignLo(height - 5, 4) : 0;
            size_t height8 = height > 9 ? AlignLo(height - 9, 8) : 0;
            size_t height16 = AlignLo(height, 16);
            size_t row = 0;
            // Bands of 16 source rows. Each kernel call starts at the lowest-addressed pixel of its
            // column group (src - col * 3 - (groupWidth - 1) * 3) and at the last destination row of the
            // group, with both strides negated.
            for (; row < height16; row += 16)
            {
                size_t col = 0;
                for (; col < width16; col += 16)
                    Avx512bw::TransformImageTranspose_3x16x16(src - col * 3 - 45, -srcStride, dst + (col + 15) * dstStride, -dstStride);
                for (; col < width4; col += 4)
                    Avx512bw::TransformImageTranspose_3x16x4(src - col * 3 - 9, -srcStride, dst + (col + 3) * dstStride, -dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 16; ++i)
                        Base::CopyPixel<3>(src - col * 3 - i * srcStride, dst + col * dstStride + i * 3);
                src -= 16 * srcStride;
                dst += 48;
            }
            // Bands of 8 source rows.
            for (; row < height8; row += 8)
            {
                size_t col = 0;
                for (; col < width4; col += 4)
                    Avx2::TransformImageTranspose_3x8x4(src - col * 3 - 9, -srcStride, dst + (col + 3) * dstStride, -dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 8; ++i)
                        Base::CopyPixel<3>(src - col * 3 - i * srcStride, dst + col * dstStride + i * 3);
                src -= 8 * srcStride;
                dst += 24;
            }
            // Bands of 4 source rows.
            for (; row < height4; row += 4)
            {
                size_t col = 0;
                for (; col < width8; col += 8)
                    Avx2::TransformImageTranspose_3x4x8(src - col * 3 - 21, -srcStride, dst + (col + 7) * dstStride, -dstStride);
                for (; col < width4; col += 4)
                    Sse41::TransformImageTranspose_3x4x4(src - col * 3 - 9, -srcStride, dst + (col + 3) * dstStride, -dstStride);
                for (; col < width; ++col)
                    for (size_t i = 0; i < 4; ++i)
                        Base::CopyPixel<3>(src - col * 3 - i * srcStride, dst + col * dstStride + i * 3);
                src -= 4 * srcStride;
                dst += 12;
            }
            // Remaining rows, one pixel at a time.
            for (; row < height; ++row)
            {
                for (size_t col = 0; col < width; ++col)
                    Base::CopyPixel<3>(src - col * 3, dst + col * dstStride);
                src -= srcStride;
                dst += 3;
            }
        }
1036
1037
        // Anti-transposes an image of 4-byte pixels: the source pixel at (row, col) is written to the
        // destination at (width - 1 - col, height - 1 - row).
        // Source rows are consumed in bands of 16, 8 and 4. The transpose kernels are started on the
        // last row of a band with negated source and destination strides, which yields the reversed
        // order. Leftover columns and rows are copied pixel by pixel.
        template<> void TransformImageTransposeRotate180<4>(const uint8_t* src, ptrdiff_t srcStride, size_t width, size_t height, uint8_t* dst, ptrdiff_t dstStride)
        {
            // Move to the last pixel of the last destination row; from here on dst only walks backwards.
            dst += (width - 1) * dstStride + (height - 1) * 4;

            const size_t simdW16 = AlignLo(width, 16);
            const size_t simdW8 = AlignLo(width, 8);
            const size_t simdW4 = AlignLo(width, 4);
            const size_t simdH16 = AlignLo(height, 16);
            const size_t simdH8 = AlignLo(height, 8);
            const size_t simdH4 = AlignLo(height, 4);
            size_t y = 0;

            // Bands of 16 source rows.
            while (y < simdH16)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_4x16x16(src + x * 4 + 15 * srcStride, -srcStride, dst - x * dstStride - 60, -dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx512bw::TransformImageTranspose_4x16x8(src + x * 4 + 15 * srcStride, -srcStride, dst - x * dstStride - 60, -dstStride);
                    x += 8;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 16; ++k)
                        Base::CopyPixel<4>(src + x * 4 + k * srcStride, dst - x * dstStride - k * 4);
                    ++x;
                }
                src += 16 * srcStride;
                dst -= 64;
                y += 16;
            }

            // Bands of 8 source rows.
            while (y < simdH8)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_4x8x16(src + x * 4 + 7 * srcStride, -srcStride, dst - x * dstStride - 28, -dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx2::TransformImageTranspose_4x8x8(src + x * 4 + 7 * srcStride, -srcStride, dst - x * dstStride - 28, -dstStride);
                    x += 8;
                }
                while (x < simdW4)
                {
                    Avx2::TransformImageTranspose_4x8x4(src + x * 4 + 7 * srcStride, -srcStride, dst - x * dstStride - 28, -dstStride);
                    x += 4;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 8; ++k)
                        Base::CopyPixel<4>(src + x * 4 + k * srcStride, dst - x * dstStride - k * 4);
                    ++x;
                }
                src += 8 * srcStride;
                dst -= 32;
                y += 8;
            }

            // Bands of 4 source rows.
            while (y < simdH4)
            {
                size_t x = 0;
                while (x < simdW16)
                {
                    Avx512bw::TransformImageTranspose_4x4x16(src + x * 4 + 3 * srcStride, -srcStride, dst - x * dstStride - 12, -dstStride);
                    x += 16;
                }
                while (x < simdW8)
                {
                    Avx2::TransformImageTranspose_4x4x8(src + x * 4 + 3 * srcStride, -srcStride, dst - x * dstStride - 12, -dstStride);
                    x += 8;
                }
                while (x < simdW4)
                {
                    Sse41::TransformImageTranspose_4x4x4(src + x * 4 + 3 * srcStride, -srcStride, dst - x * dstStride - 12, -dstStride);
                    x += 4;
                }
                while (x < width)
                {
                    for (size_t k = 0; k < 4; ++k)
                        Base::CopyPixel<4>(src + x * 4 + k * srcStride, dst - x * dstStride - k * 4);
                    ++x;
                }
                src += 4 * srcStride;
                dst -= 16;
                y += 4;
            }

            // Remaining rows, one at a time.
            while (y < height)
            {
                for (size_t x = 0; x < width; ++x)
                    Base::CopyPixel<4>(src + x * 4, dst - x * dstStride);
                src += srcStride;
                dst -= 4;
                ++y;
            }
        }
1098
1099
        //-----------------------------------------------------------------------------------------
1100
1101
        // Installs the AVX-512BW implementations for pixel size N into a transform table that is
        // indexed by the SimdTransform* constants. Only the six slots assigned below are written;
        // the other slots of the 8-entry table are left untouched.
        template<size_t N> void Init(ImageTransforms::TransformPtr transforms[8])
        {
            // Transposed variants.
            transforms[SimdTransformTransposeRotate0] = &TransformImageTransposeRotate0<N>;
            transforms[SimdTransformTransposeRotate90] = &TransformImageTransposeRotate90<N>;
            transforms[SimdTransformTransposeRotate180] = &TransformImageTransposeRotate180<N>;

            // Plain rotations.
            transforms[SimdTransformRotate90] = &TransformImageRotate90<N>;
            transforms[SimdTransformRotate180] = &TransformImageRotate180<N>;
            transforms[SimdTransformRotate270] = &TransformImageRotate270<N>;
        }
Unexecuted instantiation: void Simd::Avx512bw::Init<1ul>(void (**)(unsigned char const*, long, unsigned long, unsigned long, unsigned char*, long))
Unexecuted instantiation: void Simd::Avx512bw::Init<2ul>(void (**)(unsigned char const*, long, unsigned long, unsigned long, unsigned char*, long))
Unexecuted instantiation: void Simd::Avx512bw::Init<3ul>(void (**)(unsigned char const*, long, unsigned long, unsigned long, unsigned char*, long))
Unexecuted instantiation: void Simd::Avx512bw::Init<4ul>(void (**)(unsigned char const*, long, unsigned long, unsigned long, unsigned char*, long))
1110
1111
        // Builds the dispatch tables: the Avx2 base constructor runs first, then the entries that have
        // an AVX-512BW implementation are overwritten. The table at index k serves pixel size k + 1.
        ImageTransforms::ImageTransforms()
            : Avx2::ImageTransforms()
        {
            Init<1>(transforms[0]); // 1-byte pixels
            Init<2>(transforms[1]); // 2-byte pixels
            Init<3>(transforms[2]); // 3-byte pixels
            Init<4>(transforms[3]); // 4-byte pixels
        }
1119
1120
        //-----------------------------------------------------------------------------------------
1121
1122
        void TransformImage(const uint8_t * src, size_t srcStride, size_t width, size_t height, size_t pixelSize, SimdTransformType transform, uint8_t * dst, size_t dstStride)
1123
0
        {
1124
0
            static ImageTransforms transforms = ImageTransforms();
1125
1126
0
            transforms.TransformImage(src, srcStride, width, height, pixelSize, transform, dst, dstStride);
1127
0
        }
1128
    }
1129
#endif
1130
}