Coverage Report

Created: 2026-04-09 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdSse41ResizerBicubic.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdMemory.h"
25
#include "Simd/SimdStore.h"
26
#include "Simd/SimdResizer.h"
27
#include "Simd/SimdResizerCommon.h"
28
#include "Simd/SimdCopy.h"
29
30
namespace Simd
31
{
32
#ifdef SIMD_SSE41_ENABLE
33
    namespace Sse41
34
    {
35
        // SSE4.1 byte bicubic resizer: construction only forwards the parameters to the Base implementation.
        ResizerByteBicubic::ResizerByteBicubic(const ResParam& param) : Base::ResizerByteBicubic(param) {}
39
40
        void ResizerByteBicubic::EstimateIndexAlphaY()
41
0
        {
42
0
            size_t sizeD = _param.dstH, sizeS = _param.srcH;
43
0
            _iy.Resize(sizeD);
44
0
            _ay.Resize(sizeD * 4);
45
0
            float scale = float(sizeS) / float(sizeD);
46
0
            size_t i = 0, sizeDF = AlignLo(sizeD, F);
47
0
            int32_t* ay = _ay.data;
48
0
            if (sizeDF)
49
0
            {
50
0
                __m128i _i = _mm_setr_epi32(0, 1, 2, 3);
51
0
                __m128 _scale = _mm_set1_ps(scale);
52
0
                __m128 _0 = _mm_set1_ps(0.0f);
53
0
                __m128 _05 = _mm_set1_ps(0.5f);
54
0
                __m128 _1 = _mm_set1_ps(1.0f);
55
0
                __m128 _2 = _mm_set1_ps(2.0f);
56
0
                __m128 _1_6 = _mm_set1_ps(1.0f / 6.0f);
57
0
                __m128 _max = _mm_set1_ps(float(sizeS - 2));
58
0
                __m128 _range = _mm_set1_ps(float(Base::BICUBIC_RANGE));
59
60
0
                for (; i < sizeDF; i += F, ay += 4 * F)
61
0
                {
62
0
                    __m128 _pos = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_i), _05), _scale), _05);
63
0
                    __m128 idx = _mm_round_ps(_pos, _MM_FROUND_FLOOR);
64
0
                    __m128 d = _mm_sub_ps(_pos, idx);
65
66
0
                    __m128 minMask = _mm_cmplt_ps(idx, _0);
67
0
                    idx = _mm_blendv_ps(idx, _0, minMask);
68
0
                    d = _mm_blendv_ps(d, _0, minMask);
69
70
0
                    __m128 maxMask = _mm_cmpgt_ps(idx, _max);
71
0
                    idx = _mm_blendv_ps(idx, _max, maxMask);
72
0
                    d = _mm_blendv_ps(d, _1, maxMask);
73
74
0
                    _mm_storeu_si128((__m128i*)(_iy.data + i), _mm_cvtps_epi32(idx));
75
76
0
                    __m128i a0 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_2, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
77
0
                    __m128i a1 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(_mm_sub_ps(_1, d), _05))));
78
0
                    __m128i a2 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(d, _05))));
79
0
                    __m128i a3 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_add_ps(_1, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
80
0
                    __m128i a00 = _mm_unpacklo_epi32(a0, a2);
81
0
                    __m128i a01 = _mm_unpacklo_epi32(a1, a3);
82
0
                    __m128i a10 = _mm_unpackhi_epi32(a0, a2);
83
0
                    __m128i a11 = _mm_unpackhi_epi32(a1, a3);
84
0
                    _mm_storeu_si128((__m128i*)ay + 0, _mm_unpacklo_epi32(a00, a01));
85
0
                    _mm_storeu_si128((__m128i*)ay + 1, _mm_unpackhi_epi32(a00, a01));
86
0
                    _mm_storeu_si128((__m128i*)ay + 2, _mm_unpacklo_epi32(a10, a11));
87
0
                    _mm_storeu_si128((__m128i*)ay + 3, _mm_unpackhi_epi32(a10, a11));
88
89
0
                    _i = _mm_add_epi32(_i, K32_00000004);
90
0
                }
91
0
            }
92
0
            for (; i < sizeD; ++i, ay += 4)
93
0
            {
94
0
                float pos = (float)((i + 0.5f) * scale - 0.5f);
95
0
                int idx = (int)::floor(pos);
96
0
                float d = pos - idx;
97
0
                if (idx < 0)
98
0
                {
99
0
                    idx = 0;
100
0
                    d = 0.0f;
101
0
                }
102
0
                if (idx > (int)sizeS - 2)
103
0
                {
104
0
                    idx = (int)sizeS - 2;
105
0
                    d = 1.0f;
106
0
                }
107
0
                _iy[i] = idx;
108
0
                ay[0] = Round(Base::BICUBIC_RANGE * (2.0f - d) * (1.0f - d) * d / 6.0f);
109
0
                ay[1] = Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * (1.0f - d) / 2.0f);
110
0
                ay[2] = Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * d / 2.0f);
111
0
                ay[3] = Round(Base::BICUBIC_RANGE * (1.0f + d) * (1.0f - d) * d / 6.0f);
112
0
            }
113
0
        }
114
115
        void ResizerByteBicubic::EstimateIndexAlphaX()
116
0
        {
117
0
            size_t sizeD = _param.dstW, sizeS = _param.srcW;
118
0
            _ix.Resize(sizeD);
119
0
            _ax.Resize(sizeD * 4);
120
0
            float scale = float(sizeS) / float(sizeD);
121
0
            size_t i = 0, sizeDF = AlignLo(sizeD, F);
122
0
            int8_t* ax = _ax.data;
123
0
            if (sizeDF)
124
0
            {
125
0
                static const __m128i _SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
126
0
                __m128i _i = _mm_setr_epi32(0, 1, 2, 3);
127
0
                __m128 _scale = _mm_set1_ps(scale);
128
0
                __m128 _0 = _mm_set1_ps(0.0f);
129
0
                __m128 _05 = _mm_set1_ps(0.5f);
130
0
                __m128 _1 = _mm_set1_ps(1.0f);
131
0
                __m128 _2 = _mm_set1_ps(2.0f);
132
0
                __m128 _1_6 = _mm_set1_ps(1.0f / 6.0f);
133
0
                __m128 _max = _mm_set1_ps(float(sizeS - 2));
134
0
                __m128 _range = _mm_set1_ps(float(Base::BICUBIC_RANGE));
135
0
                __m128i _channels = _mm_set1_epi32((int)_param.channels);
136
137
0
                for (; i < sizeDF; i += F, ax += 4 * F)
138
0
                {
139
0
                    __m128 _pos = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_i), _05), _scale), _05);
140
0
                    __m128 idx = _mm_round_ps(_pos, _MM_FROUND_FLOOR);
141
0
                    __m128 d = _mm_sub_ps(_pos, idx);
142
143
0
                    __m128 minMask = _mm_cmplt_ps(idx, _0);
144
0
                    idx = _mm_blendv_ps(idx, _0, minMask);
145
0
                    d = _mm_blendv_ps(d, _0, minMask);
146
147
0
                    __m128 maxMask = _mm_cmpgt_ps(idx, _max);
148
0
                    idx = _mm_blendv_ps(idx, _max, maxMask);
149
0
                    d = _mm_blendv_ps(d, _1, maxMask);
150
151
0
                    _mm_storeu_si128((__m128i*)(_ix.data + i), _mm_mullo_epi32(_mm_cvtps_epi32(idx), _channels));
152
153
0
                    __m128i a0 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_2, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
154
0
                    __m128i a1 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(_mm_sub_ps(_1, d), _05))));
155
0
                    __m128i a2 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(d, _05))));
156
0
                    __m128i a3 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_add_ps(_1, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
157
0
                    _mm_storeu_si128((__m128i*)ax, _mm_shuffle_epi8(_mm_packs_epi16(_mm_packs_epi32(a0, a1), _mm_packs_epi32(a2, a3)), _SHUFFLE));
158
159
0
                    _i = _mm_add_epi32(_i, K32_00000004);
160
0
                }
161
0
            }
162
0
            for (; i < sizeD; ++i, ax += 4)
163
0
            {
164
0
                float pos = (float)((i + 0.5f) * scale - 0.5f);
165
0
                int idx = (int)::floor(pos);
166
0
                float d = pos - idx;
167
0
                if (idx < 0)
168
0
                {
169
0
                    idx = 0;
170
0
                    d = 0.0f;
171
0
                }
172
0
                if (idx > (int)sizeS - 2)
173
0
                {
174
0
                    idx = (int)sizeS - 2;
175
0
                    d = 1.0f;
176
0
                }
177
0
                _ix[i] = idx * (int)_param.channels;
178
0
                ax[0] = (int8_t)Round(Base::BICUBIC_RANGE * (2.0f - d) * (1.0f - d) * d / 6.0f);
179
0
                ax[1] = (int8_t)Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * (1.0f - d) / 2.0f);
180
0
                ax[2] = (int8_t)Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * d / 2.0f);
181
0
                ax[3] = (int8_t)Round(Base::BICUBIC_RANGE * (1.0f + d) * (1.0f - d) * d / 6.0f);
182
0
            }
183
0
        }
184
185
        void ResizerByteBicubic::ResizerByteBicubic::Init(bool sparse)
186
0
        {
187
0
            if (_iy.data)
188
0
                return;
189
0
            EstimateIndexAlphaY();
190
0
            EstimateIndexAlphaX();
191
0
            if (!sparse)
192
0
            {
193
0
                for (int i = 0; i < 4; ++i)
194
0
                    _bx[i].Resize(_param.dstW * _param.channels);
195
0
            }
196
0
            _sxl = (_param.srcW - 2) * _param.channels;
197
0
            for (_xn = 0; _ix[_xn] == 0; _xn++);
198
0
            for (_xt = _param.dstW; _ix[_xt - 1] == _sxl; _xt--);
199
0
        }
200
201
        template<int N> __m128i LoadAx(const int8_t* ax);
202
203
        template<> SIMD_INLINE __m128i LoadAx<1>(const int8_t* ax)
204
0
        {
205
0
            return _mm_loadu_si128((__m128i*)ax);
206
0
        }
207
208
        template<> SIMD_INLINE __m128i LoadAx<2>(const int8_t* ax)
209
0
        {
210
0
            return _mm_shuffle_epi32(_mm_loadl_epi64((__m128i*)ax), 0x50);
211
0
        }
212
213
        template<> SIMD_INLINE __m128i LoadAx<3>(const int8_t* ax)
214
0
        {
215
0
            return _mm_set1_epi32(*(int32_t*)ax);
216
0
        }
217
218
        template<> SIMD_INLINE __m128i LoadAx<4>(const int8_t* ax)
219
0
        {
220
0
            return _mm_set1_epi32(*(int32_t*)ax);
221
0
        }
222
223
        template<int N> __m128i CubicSumX(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay);
224
225
        template<> SIMD_INLINE __m128i CubicSumX<1>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
226
0
        {
227
0
            __m128i _src = _mm_setr_epi32(*(int32_t*)(src + ix[0]), *(int32_t*)(src + ix[1]), *(int32_t*)(src + ix[2]), *(int32_t*)(src + ix[3]));
228
0
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
229
0
        }
230
231
        template<> SIMD_INLINE __m128i CubicSumX<2>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
232
0
        {
233
0
            static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x4, 0x6, 0x1, 0x3, 0x5, 0x7, 0x8, 0xA, 0xC, 0xE, 0x9, 0xB, 0xD, 0xF);
234
0
            __m128i _src = _mm_shuffle_epi8(Load((__m128i*)(src + ix[0]), (__m128i*)(src + ix[1])), SHUFFLE);
235
0
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
236
0
        }
237
238
        template<> SIMD_INLINE __m128i CubicSumX<3>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
239
0
        {
240
0
            static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1);
241
0
            __m128i _src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(src + ix[0])), SHUFFLE);
242
0
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
243
0
        }
244
245
        template<> SIMD_INLINE __m128i CubicSumX<4>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
246
0
        {
247
0
            static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
248
0
            __m128i _src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(src + ix[0])), SHUFFLE);
249
0
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
250
0
        }
251
252
        // Computes one vector step of the bicubic result directly from four source rows.
        // Each row is shifted back by N bytes (one pixel) before the horizontal filter, the four
        // row sums are added, rounded, shifted right by Base::BICUBIC_SHIFT, saturated to
        // unsigned bytes, and the low 4 bytes are written to dst.
        //
        // src0..src3: the four source rows; ix: column offsets; ax: int8 horizontal weights;
        // ay: four vertical weight vectors; dst: receives 4 bytes.
        template <int N> SIMD_INLINE void BicubicInt(const uint8_t* src0, const uint8_t* src1, const uint8_t* src2, const uint8_t* src3, const int32_t* ix, const int8_t* ax, const __m128i* ay, uint8_t* dst)
        {
            static const __m128i ROUND = SIMD_MM_SET1_EPI32(Base::BICUBIC_ROUND);
            const __m128i weightsX = LoadAx<N>(ax);
            const __m128i row0 = CubicSumX<N>(src0 - N, ix, weightsX, ay[0]);
            const __m128i row1 = CubicSumX<N>(src1 - N, ix, weightsX, ay[1]);
            const __m128i row2 = CubicSumX<N>(src2 - N, ix, weightsX, ay[2]);
            const __m128i row3 = CubicSumX<N>(src3 - N, ix, weightsX, ay[3]);
            const __m128i upper = _mm_add_epi32(row0, row1);
            const __m128i lower = _mm_add_epi32(row2, row3);
            const __m128i total = _mm_add_epi32(upper, lower);
            const __m128i scaled = _mm_srai_epi32(_mm_add_epi32(total, ROUND), Base::BICUBIC_SHIFT);
            const __m128i words = _mm_packs_epi32(scaled, K_ZERO);
            const __m128i bytes = _mm_packus_epi16(words, K_ZERO);
            *((int32_t*)(dst)) = _mm_cvtsi128_si32(bytes);
        }
Unexecuted instantiation: void Simd::Sse41::BicubicInt<1>(unsigned char const*, unsigned char const*, unsigned char const*, unsigned char const*, int const*, signed char const*, long long __vector(2) const*, unsigned char*)
Unexecuted instantiation: void Simd::Sse41::BicubicInt<2>(unsigned char const*, unsigned char const*, unsigned char const*, unsigned char const*, int const*, signed char const*, long long __vector(2) const*, unsigned char*)
Unexecuted instantiation: void Simd::Sse41::BicubicInt<3>(unsigned char const*, unsigned char const*, unsigned char const*, unsigned char const*, int const*, signed char const*, long long __vector(2) const*, unsigned char*)
Unexecuted instantiation: void Simd::Sse41::BicubicInt<4>(unsigned char const*, unsigned char const*, unsigned char const*, unsigned char const*, int const*, signed char const*, long long __vector(2) const*, unsigned char*)
264
265
        template<int N> void ResizerByteBicubic::RunS(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
266
0
        {
267
0
            assert(_xn == 0 && _xt == _param.dstW);
268
0
            size_t step = 4 / N;
269
0
            size_t body = AlignLoAny(_param.dstW - (N == 3 ? 1 : 0), step);
270
0
            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
271
0
            {
272
0
                size_t sy = _iy[dy];
273
0
                const uint8_t* src1 = src + sy * srcStride;
274
0
                const uint8_t* src2 = src1 + srcStride;
275
0
                const uint8_t* src0 = sy ? src1 - srcStride : src1;
276
0
                const uint8_t* src3 = sy < _param.srcH - 2 ? src2 + srcStride : src2;
277
0
                const int32_t* ay = _ay.data + dy * 4;
278
0
                __m128i ays[4];
279
0
                ays[0] = _mm_set1_epi16(ay[0]);
280
0
                ays[1] = _mm_set1_epi16(ay[1]);
281
0
                ays[2] = _mm_set1_epi16(ay[2]);
282
0
                ays[3] = _mm_set1_epi16(ay[3]);
283
0
                size_t dx = 0;
284
0
                for (; dx < body; dx += step)
285
0
                    BicubicInt<N>(src0, src1, src2, src3, _ix.data + dx, _ax.data + dx * 4, ays, dst + dx * N);
286
0
                for (; dx < _param.dstW; dx++)
287
0
                    Base::BicubicInt<N, -1, 2>(src0, src1, src2, src3, _ix[dx], _ax.data + dx * 4, ay, dst + dx * N);
288
0
            }
289
0
        }
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunS<1>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunS<2>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunS<3>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunS<4>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
290
291
        // Horizontal 4-tap weighted sum for one vector step: unsigned source bytes are multiplied
        // by the signed int8 weights (adjacent pairs summed to 16 bits), the 16-bit pairs are
        // then added together (multiply by K16_0001 and add) and four 32-bit sums are stored to dst.
        template<int F> SIMD_INLINE void PixelCubicSumX(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst);

        // 1 channel: four pixels, each gathering 4 consecutive bytes at its own offset ix[k].
        template<> SIMD_INLINE void PixelCubicSumX<1>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst)
        {
            const int32_t tap0 = *(int32_t*)(src + ix[0]);
            const int32_t tap1 = *(int32_t*)(src + ix[1]);
            const int32_t tap2 = *(int32_t*)(src + ix[2]);
            const int32_t tap3 = *(int32_t*)(src + ix[3]);
            const __m128i taps = _mm_setr_epi32(tap0, tap1, tap2, tap3);
            const __m128i weights = _mm_loadu_si128((__m128i*)ax);
            const __m128i pairs = _mm_maddubs_epi16(taps, weights);
            _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(pairs, K16_0001));
        }

        // 2 channels: two pixels loaded from offsets ix[0] and ix[1]; each 32-bit weight group
        // is duplicated (shuffle 0x50) so both channels of a pixel use the same weights.
        template<> SIMD_INLINE void PixelCubicSumX<2>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst)
        {
            static const __m128i DEINTERLEAVE = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x4, 0x6, 0x1, 0x3, 0x5, 0x7, 0x8, 0xA, 0xC, 0xE, 0x9, 0xB, 0xD, 0xF);
            const __m128i raw = Load((__m128i*)(src + ix[0]), (__m128i*)(src + ix[1]));
            const __m128i taps = _mm_shuffle_epi8(raw, DEINTERLEAVE);
            const __m128i weights = _mm_shuffle_epi32(_mm_loadl_epi64((__m128i*)ax), 0x50);
            const __m128i pairs = _mm_maddubs_epi16(taps, weights);
            _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(pairs, K16_0001));
        }

        // 3 channels: one pixel; 16 bytes are loaded at ix[0] and regrouped per channel
        // (the last lane is zeroed by the -1 shuffle indices). Four 32-bit values are still stored.
        template<> SIMD_INLINE void PixelCubicSumX<3>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst)
        {
            static const __m128i DEINTERLEAVE = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1);
            const __m128i raw = _mm_loadu_si128((__m128i*)(src + ix[0]));
            const __m128i taps = _mm_shuffle_epi8(raw, DEINTERLEAVE);
            const __m128i weights = _mm_set1_epi32(*(int32_t*)ax);
            const __m128i pairs = _mm_maddubs_epi16(taps, weights);
            _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(pairs, K16_0001));
        }

        // 4 channels: one pixel; 16 bytes are loaded at ix[0] and regrouped per channel.
        template<> SIMD_INLINE void PixelCubicSumX<4>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst)
        {
            static const __m128i DEINTERLEAVE = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
            const __m128i raw = _mm_loadu_si128((__m128i*)(src + ix[0]));
            const __m128i taps = _mm_shuffle_epi8(raw, DEINTERLEAVE);
            const __m128i weights = _mm_set1_epi32(*(int32_t*)ax);
            const __m128i pairs = _mm_maddubs_epi16(taps, weights);
            _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(pairs, K16_0001));
        }
323
324
        // Horizontally filters one source row into 32-bit sums, in four consecutive column ranges:
        //   [0, nose)      - Base::PixelCubicSumX<N, 0, 2>  (scalar);
        //   [nose, bodyS)  - SSE4.1 PixelCubicSumX<N>, 4 / N pixels per step, source shifted back by N bytes;
        //   [bodyS, body)  - Base::PixelCubicSumX<N, -1, 2> (scalar remainder of the body);
        //   [body, tail)   - Base::PixelCubicSumX<N, -1, 1> (scalar).
        // ax advances by 4 weights and dst by N values per pixel.
        template<int N> SIMD_INLINE void RowCubicSumX(const uint8_t* src, size_t nose, size_t body, size_t tail, const int32_t* ix, const int8_t* ax, int32_t* dst)
        {
            const size_t step = 4 / N;
            const size_t vecEnd = nose + AlignLoAny(body - nose, step);

            size_t col = 0;
            while (col < nose)
            {
                Base::PixelCubicSumX<N, 0, 2>(src + ix[col], ax, dst);
                col++;
                ax += 4;
                dst += N;
            }
            while (col < vecEnd)
            {
                PixelCubicSumX<N>(src - N, ix + col, ax, dst);
                col += step;
                ax += 4 * step;
                dst += N * step;
            }
            while (col < body)
            {
                Base::PixelCubicSumX<N, -1, 2>(src + ix[col], ax, dst);
                col++;
                ax += 4;
                dst += N;
            }
            while (col < tail)
            {
                Base::PixelCubicSumX<N, -1, 1>(src + ix[col], ax, dst);
                col++;
                ax += 4;
                dst += N;
            }
        }
Unexecuted instantiation: void Simd::Sse41::RowCubicSumX<1>(unsigned char const*, unsigned long, unsigned long, unsigned long, int const*, signed char const*, int*)
Unexecuted instantiation: void Simd::Sse41::RowCubicSumX<2>(unsigned char const*, unsigned long, unsigned long, unsigned long, int const*, signed char const*, int*)
Unexecuted instantiation: void Simd::Sse41::RowCubicSumX<3>(unsigned char const*, unsigned long, unsigned long, unsigned long, int const*, signed char const*, int*)
Unexecuted instantiation: void Simd::Sse41::RowCubicSumX<4>(unsigned char const*, unsigned long, unsigned long, unsigned long, int const*, signed char const*, int*)
339
340
        // Vertical pass: combines four rows of 32-bit horizontal sums with the weights ay[0..3],
        // adds Base::BICUBIC_ROUND, shifts right by Base::BICUBIC_SHIFT and writes n bytes
        // clamped to [0, 255] into dst. F values are processed per vector step; the remainder
        // is handled by scalar code.
        SIMD_INLINE void BicubicRowInt(const int32_t* src0, const int32_t* src1, const int32_t* src2, const int32_t* src3, size_t n, const int32_t* ay, uint8_t* dst)
        {
            const size_t vecEnd = AlignLo(n, F);
            size_t pos = 0;
            if (vecEnd)
            {
                static const __m128i ROUND = SIMD_MM_SET1_EPI32(Base::BICUBIC_ROUND);
                const __m128i w0 = _mm_set1_epi32(ay[0]);
                const __m128i w1 = _mm_set1_epi32(ay[1]);
                const __m128i w2 = _mm_set1_epi32(ay[2]);
                const __m128i w3 = _mm_set1_epi32(ay[3]);
                while (pos < vecEnd)
                {
                    const __m128i p0 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src0 + pos)), w0);
                    const __m128i p1 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src1 + pos)), w1);
                    const __m128i p2 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src2 + pos)), w2);
                    const __m128i p3 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src3 + pos)), w3);
                    const __m128i total = _mm_add_epi32(_mm_add_epi32(p0, p1), _mm_add_epi32(p2, p3));
                    const __m128i scaled = _mm_srai_epi32(_mm_add_epi32(total, ROUND), Base::BICUBIC_SHIFT);
                    // Saturating narrow 32 -> 16 -> 8 bits; the low 4 bytes are the results.
                    const __m128i bytes = _mm_packus_epi16(_mm_packs_epi32(scaled, K_ZERO), K_ZERO);
                    *((int32_t*)(dst + pos)) = _mm_cvtsi128_si32(bytes);
                    pos += F;
                }
            }
            while (pos < n)
            {
                int32_t total = ay[0] * src0[pos] + ay[1] * src1[pos] + ay[2] * src2[pos] + ay[3] * src3[pos];
                dst[pos] = Base::RestrictRange((total + Base::BICUBIC_ROUND) >> Base::BICUBIC_SHIFT, 0, 255);
                ++pos;
            }
        }
368
369
        template<int N> void ResizerByteBicubic::RunB(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
370
0
        {
371
0
            int32_t prev = -1;
372
0
            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
373
0
            {
374
0
                int32_t sy = _iy[dy], next = prev;
375
0
                for (int32_t curr = sy - 1, end = sy + 3; curr < end; ++curr)
376
0
                {
377
0
                    if (curr < prev)
378
0
                        continue;
379
0
                    const uint8_t* ps = src + RestrictRange(curr, 0, (int)_param.srcH - 1) * srcStride;
380
0
                    int32_t* pb = _bx[(curr + 1) & 3].data;
381
0
                    RowCubicSumX<N>(ps, _xn, _xt, _param.dstW, _ix.data, _ax.data, pb);
382
0
                    next++;
383
0
                }
384
0
                prev = next;
385
386
0
                const int32_t* ay = _ay.data + dy * 4;
387
0
                int32_t* pb0 = _bx[(sy + 0) & 3].data;
388
0
                int32_t* pb1 = _bx[(sy + 1) & 3].data;
389
0
                int32_t* pb2 = _bx[(sy + 2) & 3].data;
390
0
                int32_t* pb3 = _bx[(sy + 3) & 3].data;
391
0
                BicubicRowInt(pb0, pb1, pb2, pb3, _bx[0].size, ay, dst);
392
0
            }
393
0
        }
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunB<1>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunB<2>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunB<3>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
Unexecuted instantiation: void Simd::Sse41::ResizerByteBicubic::RunB<4>(unsigned char const*, unsigned long, unsigned char*, unsigned long)
394
395
        void ResizerByteBicubic::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
396
0
        {
397
0
            bool sparse = _param.dstH * 3.0 <= _param.srcH;
398
0
            Init(sparse);
399
0
            switch (_param.channels)
400
0
            {
401
0
            case 1: sparse ? RunS<1>(src, srcStride, dst, dstStride) : RunB<1>(src, srcStride, dst, dstStride); return;
402
0
            case 2: sparse ? RunS<2>(src, srcStride, dst, dstStride) : RunB<2>(src, srcStride, dst, dstStride); return;
403
0
            case 3: sparse ? RunS<3>(src, srcStride, dst, dstStride) : RunB<3>(src, srcStride, dst, dstStride); return;
404
0
            case 4: sparse ? RunS<4>(src, srcStride, dst, dstStride) : RunB<4>(src, srcStride, dst, dstStride); return;
405
0
            default:
406
                assert(0);
407
0
            }
408
0
        }
409
    }
410
#endif
411
}
412