/src/Simd/src/Simd/SimdSse41ResizerBicubic.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2024 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | #include "Simd/SimdResizer.h" |
27 | | #include "Simd/SimdResizerCommon.h" |
28 | | #include "Simd/SimdCopy.h" |
29 | | |
30 | | namespace Simd |
31 | | { |
32 | | #ifdef SIMD_SSE41_ENABLE |
33 | | namespace Sse41 |
34 | | { |
        // SSE4.1 bicubic byte resizer: all state setup is delegated to the scalar
        // Base implementation; this specialization only overrides the table
        // estimation and run methods below.
        ResizerByteBicubic::ResizerByteBicubic(const ResParam& param)
            : Base::ResizerByteBicubic(param)
        {
        }
39 | | |
        // Builds the vertical interpolation tables, one entry per destination row:
        //   _iy[i]     - clamped source row index (row of the second of the 4 taps),
        //   _ay[4*i+k] - the 4 bicubic tap weights as fixed-point int32 scaled by
        //                Base::BICUBIC_RANGE.
        // A 4-wide SSE4.1 loop fills whole groups of F rows; the scalar tail below
        // performs the identical arithmetic for the remainder.
        void ResizerByteBicubic::EstimateIndexAlphaY()
        {
            size_t sizeD = _param.dstH, sizeS = _param.srcH;
            _iy.Resize(sizeD);
            _ay.Resize(sizeD * 4);
            float scale = float(sizeS) / float(sizeD);
            size_t i = 0, sizeDF = AlignLo(sizeD, F);
            int32_t* ay = _ay.data;
            if (sizeDF)
            {
                __m128i _i = _mm_setr_epi32(0, 1, 2, 3);
                __m128 _scale = _mm_set1_ps(scale);
                __m128 _0 = _mm_set1_ps(0.0f);
                __m128 _05 = _mm_set1_ps(0.5f);
                __m128 _1 = _mm_set1_ps(1.0f);
                __m128 _2 = _mm_set1_ps(2.0f);
                __m128 _1_6 = _mm_set1_ps(1.0f / 6.0f);
                __m128 _max = _mm_set1_ps(float(sizeS - 2));
                __m128 _range = _mm_set1_ps(float(Base::BICUBIC_RANGE));
                for (; i < sizeDF; i += F, ay += 4 * F)
                {
                    // Center-aligned source position: pos = (i + 0.5) * scale - 0.5.
                    __m128 _pos = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_i), _05), _scale), _05);
                    __m128 idx = _mm_round_ps(_pos, _MM_FROUND_FLOOR);
                    __m128 d = _mm_sub_ps(_pos, idx);

                    // Top-edge clamp: idx < 0 -> idx = 0, fraction d = 0.
                    __m128 minMask = _mm_cmplt_ps(idx, _0);
                    idx = _mm_blendv_ps(idx, _0, minMask);
                    d = _mm_blendv_ps(d, _0, minMask);

                    // Bottom-edge clamp: idx > sizeS - 2 -> idx = sizeS - 2, d = 1.
                    __m128 maxMask = _mm_cmpgt_ps(idx, _max);
                    idx = _mm_blendv_ps(idx, _max, maxMask);
                    d = _mm_blendv_ps(d, _1, maxMask);

                    _mm_storeu_si128((__m128i*)(_iy.data + i), _mm_cvtps_epi32(idx));

                    // The 4 tap weights — same polynomials as the scalar tail below,
                    // evaluated for 4 destination rows at once.
                    __m128i a0 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_2, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
                    __m128i a1 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(_mm_sub_ps(_1, d), _05))));
                    __m128i a2 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(d, _05))));
                    __m128i a3 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_add_ps(_1, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
                    // 4x4 int32 transpose so each destination row stores its 4 weights
                    // contiguously in _ay.
                    __m128i a00 = _mm_unpacklo_epi32(a0, a2);
                    __m128i a01 = _mm_unpacklo_epi32(a1, a3);
                    __m128i a10 = _mm_unpackhi_epi32(a0, a2);
                    __m128i a11 = _mm_unpackhi_epi32(a1, a3);
                    _mm_storeu_si128((__m128i*)ay + 0, _mm_unpacklo_epi32(a00, a01));
                    _mm_storeu_si128((__m128i*)ay + 1, _mm_unpackhi_epi32(a00, a01));
                    _mm_storeu_si128((__m128i*)ay + 2, _mm_unpacklo_epi32(a10, a11));
                    _mm_storeu_si128((__m128i*)ay + 3, _mm_unpackhi_epi32(a10, a11));

                    _i = _mm_add_epi32(_i, K32_00000004);
                }
            }
            // Scalar tail: identical position/clamp/weight computation, one row at a
            // time. NOTE(review): SIMD path rounds with _mm_cvtps_epi32 (nearest-even)
            // while this tail uses Round — results assumed to agree; confirm if exact
            // parity with the Base implementation matters.
            for (; i < sizeD; ++i, ay += 4)
            {
                float pos = (float)((i + 0.5f) * scale - 0.5f);
                int idx = (int)::floor(pos);
                float d = pos - idx;
                if (idx < 0)
                {
                    idx = 0;
                    d = 0.0f;
                }
                if (idx > (int)sizeS - 2)
                {
                    idx = (int)sizeS - 2;
                    d = 1.0f;
                }
                _iy[i] = idx;
                ay[0] = Round(Base::BICUBIC_RANGE * (2.0f - d) * (1.0f - d) * d / 6.0f);
                ay[1] = Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * (1.0f - d) / 2.0f);
                ay[2] = Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * d / 2.0f);
                ay[3] = Round(Base::BICUBIC_RANGE * (1.0f + d) * (1.0f - d) * d / 6.0f);
            }
        }
114 | | |
        // Builds the horizontal interpolation tables, one entry per destination column:
        //   _ix[i]     - clamped source column index pre-multiplied by the channel
        //                count (i.e. a byte offset within a row),
        //   _ax[4*i+k] - the 4 bicubic tap weights narrowed to int8 (scaled by
        //                Base::BICUBIC_RANGE).
        // Mirrors EstimateIndexAlphaY, except weights are packed to signed bytes
        // (for use with _mm_maddubs_epi16) and indices are scaled by channels.
        void ResizerByteBicubic::EstimateIndexAlphaX()
        {
            size_t sizeD = _param.dstW, sizeS = _param.srcW;
            _ix.Resize(sizeD);
            _ax.Resize(sizeD * 4);
            float scale = float(sizeS) / float(sizeD);
            size_t i = 0, sizeDF = AlignLo(sizeD, F);
            int8_t* ax = _ax.data;
            if (sizeDF)
            {
                static const __m128i _SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
                __m128i _i = _mm_setr_epi32(0, 1, 2, 3);
                __m128 _scale = _mm_set1_ps(scale);
                __m128 _0 = _mm_set1_ps(0.0f);
                __m128 _05 = _mm_set1_ps(0.5f);
                __m128 _1 = _mm_set1_ps(1.0f);
                __m128 _2 = _mm_set1_ps(2.0f);
                __m128 _1_6 = _mm_set1_ps(1.0f / 6.0f);
                __m128 _max = _mm_set1_ps(float(sizeS - 2));
                __m128 _range = _mm_set1_ps(float(Base::BICUBIC_RANGE));
                __m128i _channels = _mm_set1_epi32((int)_param.channels);

                for (; i < sizeDF; i += F, ax += 4 * F)
                {
                    // Center-aligned source position: pos = (i + 0.5) * scale - 0.5.
                    __m128 _pos = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_cvtepi32_ps(_i), _05), _scale), _05);
                    __m128 idx = _mm_round_ps(_pos, _MM_FROUND_FLOOR);
                    __m128 d = _mm_sub_ps(_pos, idx);

                    // Left-edge clamp: idx < 0 -> idx = 0, fraction d = 0.
                    __m128 minMask = _mm_cmplt_ps(idx, _0);
                    idx = _mm_blendv_ps(idx, _0, minMask);
                    d = _mm_blendv_ps(d, _0, minMask);

                    // Right-edge clamp: idx > sizeS - 2 -> idx = sizeS - 2, d = 1.
                    __m128 maxMask = _mm_cmpgt_ps(idx, _max);
                    idx = _mm_blendv_ps(idx, _max, maxMask);
                    d = _mm_blendv_ps(d, _1, maxMask);

                    // Store idx * channels, i.e. the byte offset of the pixel in a row.
                    _mm_storeu_si128((__m128i*)(_ix.data + i), _mm_mullo_epi32(_mm_cvtps_epi32(idx), _channels));

                    // The 4 tap weights (same polynomials as the scalar tail), then
                    // pack int32 -> int16 -> int8 and shuffle so each destination
                    // column's 4 weights land contiguously in _ax.
                    __m128i a0 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_2, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
                    __m128i a1 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(_mm_sub_ps(_1, d), _05))));
                    __m128i a2 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_sub_ps(d, _2), _mm_add_ps(_1, d)), _mm_mul_ps(d, _05))));
                    __m128i a3 = _mm_cvtps_epi32(_mm_mul_ps(_range, _mm_mul_ps(_mm_mul_ps(_mm_add_ps(_1, d), _mm_sub_ps(_1, d)), _mm_mul_ps(d, _1_6))));
                    _mm_storeu_si128((__m128i*)ax, _mm_shuffle_epi8(_mm_packs_epi16(_mm_packs_epi32(a0, a1), _mm_packs_epi32(a2, a3)), _SHUFFLE));

                    _i = _mm_add_epi32(_i, K32_00000004);
                }
            }
            // Scalar tail: identical arithmetic, one column at a time.
            for (; i < sizeD; ++i, ax += 4)
            {
                float pos = (float)((i + 0.5f) * scale - 0.5f);
                int idx = (int)::floor(pos);
                float d = pos - idx;
                if (idx < 0)
                {
                    idx = 0;
                    d = 0.0f;
                }
                if (idx > (int)sizeS - 2)
                {
                    idx = (int)sizeS - 2;
                    d = 1.0f;
                }
                _ix[i] = idx * (int)_param.channels;
                ax[0] = (int8_t)Round(Base::BICUBIC_RANGE * (2.0f - d) * (1.0f - d) * d / 6.0f);
                ax[1] = (int8_t)Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * (1.0f - d) / 2.0f);
                ax[2] = (int8_t)Round(Base::BICUBIC_RANGE * (d - 2.0f) * (d + 1.0f) * d / 2.0f);
                ax[3] = (int8_t)Round(Base::BICUBIC_RANGE * (1.0f + d) * (1.0f - d) * d / 6.0f);
            }
        }
184 | | |
185 | | void ResizerByteBicubic::ResizerByteBicubic::Init(bool sparse) |
186 | 0 | { |
187 | 0 | if (_iy.data) |
188 | 0 | return; |
189 | 0 | EstimateIndexAlphaY(); |
190 | 0 | EstimateIndexAlphaX(); |
191 | 0 | if (!sparse) |
192 | 0 | { |
193 | 0 | for (int i = 0; i < 4; ++i) |
194 | 0 | _bx[i].Resize(_param.dstW * _param.channels); |
195 | 0 | } |
196 | 0 | _sxl = (_param.srcW - 2) * _param.channels; |
197 | 0 | for (_xn = 0; _ix[_xn] == 0; _xn++); |
198 | 0 | for (_xt = _param.dstW; _ix[_xt - 1] == _sxl; _xt--); |
199 | 0 | } |
200 | | |
        // Loads the int8 x-weights for one vector iteration, replicated to match
        // the channel-interleaved byte layout produced by CubicSumX<N>.
        template<int N> __m128i LoadAx(const int8_t* ax);

        // 1 channel: 4 output pixels per iteration -> 16 distinct weights.
        template<> SIMD_INLINE __m128i LoadAx<1>(const int8_t* ax)
        {
            return _mm_loadu_si128((__m128i*)ax);
        }

        // 2 channels: 2 output pixels per iteration; 0x50 duplicates each pixel's
        // 4-weight dword -> [w0, w0, w1, w1], one copy per channel.
        template<> SIMD_INLINE __m128i LoadAx<2>(const int8_t* ax)
        {
            return _mm_shuffle_epi32(_mm_loadl_epi64((__m128i*)ax), 0x50);
        }

        // 3 channels: 1 output pixel per iteration; broadcast its 4 weights to all
        // four dword lanes (one per channel, fourth lane unused downstream).
        template<> SIMD_INLINE __m128i LoadAx<3>(const int8_t* ax)
        {
            return _mm_set1_epi32(*(int32_t*)ax);
        }

        // 4 channels: 1 output pixel per iteration; broadcast its 4 weights, one
        // dword lane per channel.
        template<> SIMD_INLINE __m128i LoadAx<4>(const int8_t* ax)
        {
            return _mm_set1_epi32(*(int32_t*)ax);
        }
222 | | |
        // One fused horizontal+vertical accumulation step for one source row:
        // gathers the 4-tap source bytes, multiplies by the int8 x-weights
        // (_mm_maddubs_epi16), then by the replicated 16-bit y-weight
        // (_mm_madd_epi16), yielding 4 int32 partial sums.
        template<int N> __m128i CubicSumX(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay);

        // 1 channel: gather the 4-byte tap windows of 4 output pixels via ix[0..3].
        template<> SIMD_INLINE __m128i CubicSumX<1>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
        {
            __m128i _src = _mm_setr_epi32(*(int32_t*)(src + ix[0]), *(int32_t*)(src + ix[1]), *(int32_t*)(src + ix[2]), *(int32_t*)(src + ix[3]));
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
        }

        // 2 channels: load the 8-byte windows of 2 output pixels (project Load
        // combines the two halves), then de-interleave bytes to channel-major
        // order so taps of one channel are adjacent.
        template<> SIMD_INLINE __m128i CubicSumX<2>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
        {
            static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x4, 0x6, 0x1, 0x3, 0x5, 0x7, 0x8, 0xA, 0xC, 0xE, 0x9, 0xB, 0xD, 0xF);
            __m128i _src = _mm_shuffle_epi8(Load((__m128i*)(src + ix[0]), (__m128i*)(src + ix[1])), SHUFFLE);
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
        }

        // 3 channels: one 16-byte load covers the pixel's 4 taps (12 bytes);
        // shuffle to channel-major, fourth dword zeroed (-1 lanes).
        template<> SIMD_INLINE __m128i CubicSumX<3>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
        {
            static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1);
            __m128i _src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(src + ix[0])), SHUFFLE);
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
        }

        // 4 channels: one 16-byte load is exactly the pixel's 4 taps; shuffle to
        // channel-major order (one dword of taps per channel).
        template<> SIMD_INLINE __m128i CubicSumX<4>(const uint8_t* src, const int32_t* ix, __m128i ax, __m128i ay)
        {
            static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF);
            __m128i _src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(src + ix[0])), SHUFFLE);
            return _mm_madd_epi16(_mm_maddubs_epi16(_src, ax), ay);
        }
251 | | |
        // Computes 4 output bytes (4/N pixels x N channels) of one destination row
        // from the four tap rows src0..src3. The src pointers are biased by -N so
        // the gathered window starts at tap -1 (the kernel spans taps -1..+2, as
        // in the scalar Base::BicubicInt<N, -1, 2>). Sums are rounded and shifted
        // back from fixed point, then saturated to bytes.
        template <int N> SIMD_INLINE void BicubicInt(const uint8_t* src0, const uint8_t* src1, const uint8_t* src2, const uint8_t* src3, const int32_t* ix, const int8_t* ax, const __m128i* ay, uint8_t* dst)
        {
            static const __m128i ROUND = SIMD_MM_SET1_EPI32(Base::BICUBIC_ROUND);
            __m128i _ax = LoadAx<N>(ax);
            __m128i say0 = CubicSumX<N>(src0 - N, ix, _ax, ay[0]);
            __m128i say1 = CubicSumX<N>(src1 - N, ix, _ax, ay[1]);
            __m128i say2 = CubicSumX<N>(src2 - N, ix, _ax, ay[2]);
            __m128i say3 = CubicSumX<N>(src3 - N, ix, _ax, ay[3]);
            __m128i sum = _mm_add_epi32(_mm_add_epi32(say0, say1), _mm_add_epi32(say2, say3));
            __m128i dst0 = _mm_srai_epi32(_mm_add_epi32(sum, ROUND), Base::BICUBIC_SHIFT);
            // Pack 4 int32 -> 4 saturated bytes and store them as one 32-bit write.
            *((int32_t*)(dst)) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst0, K_ZERO), K_ZERO));
        }
264 | | |
        // Sparse (strongly downscaling) path: each destination row is computed
        // directly from its 4 source tap rows, with no intermediate buffers.
        // Precondition (asserted): no output column needs edge clamping beyond
        // what the biased gather handles (_xn == 0 and _xt == dstW).
        template<int N> void ResizerByteBicubic::RunS(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
        {
            assert(_xn == 0 && _xt == _param.dstW);
            // Each vector step emits 4 bytes = 4/N pixels. For N == 3 the last
            // pixel is excluded from the vector body because its 4-byte store
            // would spill one byte past the 3-byte pixel.
            size_t step = 4 / N;
            size_t body = AlignLoAny(_param.dstW - (N == 3 ? 1 : 0), step);
            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
            {
                size_t sy = _iy[dy];
                const uint8_t* src1 = src + sy * srcStride;
                const uint8_t* src2 = src1 + srcStride;
                // Duplicate the boundary row for taps that fall outside the image.
                const uint8_t* src0 = sy ? src1 - srcStride : src1;
                const uint8_t* src3 = sy < _param.srcH - 2 ? src2 + srcStride : src2;
                const int32_t* ay = _ay.data + dy * 4;
                // Replicate each 32-bit y-weight into 16-bit lanes for _mm_madd_epi16.
                __m128i ays[4];
                ays[0] = _mm_set1_epi16(ay[0]);
                ays[1] = _mm_set1_epi16(ay[1]);
                ays[2] = _mm_set1_epi16(ay[2]);
                ays[3] = _mm_set1_epi16(ay[3]);
                size_t dx = 0;
                for (; dx < body; dx += step)
                    BicubicInt<N>(src0, src1, src2, src3, _ix.data + dx, _ax.data + dx * 4, ays, dst + dx * N);
                // Scalar fallback for the leftover columns.
                for (; dx < _param.dstW; dx++)
                    Base::BicubicInt<N, -1, 2>(src0, src1, src2, src3, _ix[dx], _ax.data + dx * 4, ay, dst + dx * N);
            }
        }
290 | | |
291 | | template<int F> SIMD_INLINE void PixelCubicSumX(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst); |
292 | | |
293 | | template<> SIMD_INLINE void PixelCubicSumX<1>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst) |
294 | 0 | { |
295 | 0 | __m128i _src = _mm_setr_epi32(*(int32_t*)(src + ix[0]), *(int32_t*)(src + ix[1]), *(int32_t*)(src + ix[2]), *(int32_t*)(src + ix[3])); |
296 | 0 | __m128i _ax = _mm_loadu_si128((__m128i*)ax); |
297 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(_mm_maddubs_epi16(_src, _ax), K16_0001)); |
298 | 0 | } |
299 | | |
300 | | template<> SIMD_INLINE void PixelCubicSumX<2>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst) |
301 | 0 | { |
302 | 0 | static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x2, 0x4, 0x6, 0x1, 0x3, 0x5, 0x7, 0x8, 0xA, 0xC, 0xE, 0x9, 0xB, 0xD, 0xF); |
303 | 0 | __m128i _src = _mm_shuffle_epi8(Load((__m128i*)(src + ix[0]), (__m128i*)(src + ix[1])), SHUFFLE); |
304 | 0 | __m128i _ax = _mm_shuffle_epi32(_mm_loadl_epi64((__m128i*)ax), 0x50); |
305 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(_mm_maddubs_epi16(_src, _ax), K16_0001)); |
306 | 0 | } |
307 | | |
308 | | template<> SIMD_INLINE void PixelCubicSumX<3>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst) |
309 | 0 | { |
310 | 0 | static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x3, 0x6, 0x9, 0x1, 0x4, 0x7, 0xA, 0x2, 0x5, 0x8, 0xB, -1, -1, -1, -1); |
311 | 0 | __m128i _src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(src + ix[0])), SHUFFLE); |
312 | 0 | __m128i _ax = _mm_set1_epi32(*(int32_t*)ax); |
313 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(_mm_maddubs_epi16(_src, _ax), K16_0001)); |
314 | 0 | } |
315 | | |
316 | | template<> SIMD_INLINE void PixelCubicSumX<4>(const uint8_t* src, const int32_t* ix, const int8_t* ax, int32_t* dst) |
317 | 0 | { |
318 | 0 | static const __m128i SHUFFLE = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x8, 0xC, 0x1, 0x5, 0x9, 0xD, 0x2, 0x6, 0xA, 0xE, 0x3, 0x7, 0xB, 0xF); |
319 | 0 | __m128i _src = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(src + ix[0])), SHUFFLE); |
320 | 0 | __m128i _ax = _mm_set1_epi32(*(int32_t*)ax); |
321 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_madd_epi16(_mm_maddubs_epi16(_src, _ax), K16_0001)); |
322 | 0 | } |
323 | | |
        // Horizontal pass for one source row, writing x-filtered int32 sums to dst.
        // The row is processed in four segments:
        //   [0, nose)      - scalar, left-edge clamped windows (taps 0..2),
        //   [nose, bodyS)  - vectorized interior (src biased by -N, taps -1..+2),
        //   [bodyS, body)  - scalar interior remainder,
        //   [body, tail)   - scalar, right-edge clamped windows (taps -1..+1).
        template<int N> SIMD_INLINE void RowCubicSumX(const uint8_t* src, size_t nose, size_t body, size_t tail, const int32_t* ix, const int8_t* ax, int32_t* dst)
        {
            size_t step = 4 / N;
            size_t bodyS = nose + AlignLoAny(body - nose, step);

            size_t dx = 0;
            for (; dx < nose; dx++, ax += 4, dst += N)
                Base::PixelCubicSumX<N, 0, 2>(src + ix[dx], ax, dst);
            for (; dx < bodyS; dx += step, ax += 4 * step, dst += N * step)
                PixelCubicSumX<N>(src - N, ix + dx, ax, dst);
            for (; dx < body; dx++, ax += 4, dst += N)
                Base::PixelCubicSumX<N, -1, 2>(src + ix[dx], ax, dst);
            for (; dx < tail; dx++, ax += 4, dst += N)
                Base::PixelCubicSumX<N, -1, 1>(src + ix[dx], ax, dst);
        }
339 | | |
        // Vertical pass for one destination row: combines n x-filtered int32 values
        // from the four tap-row buffers with the 32-bit y-weights, rounds and
        // shifts out of fixed point, and stores saturated bytes to dst.
        SIMD_INLINE void BicubicRowInt(const int32_t* src0, const int32_t* src1, const int32_t* src2, const int32_t* src3, size_t n, const int32_t* ay, uint8_t* dst)
        {
            size_t nF = AlignLo(n, F);
            size_t i = 0;
            if (nF)
            {
                static const __m128i ROUND = SIMD_MM_SET1_EPI32(Base::BICUBIC_ROUND);
                __m128i ay0 = _mm_set1_epi32(ay[0]);
                __m128i ay1 = _mm_set1_epi32(ay[1]);
                __m128i ay2 = _mm_set1_epi32(ay[2]);
                __m128i ay3 = _mm_set1_epi32(ay[3]);
                for (; i < nF; i += F)
                {
                    __m128i say0 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src0 + i)), ay0);
                    __m128i say1 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src1 + i)), ay1);
                    __m128i say2 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src2 + i)), ay2);
                    __m128i say3 = _mm_mullo_epi32(_mm_loadu_si128((__m128i*)(src3 + i)), ay3);
                    __m128i sum = _mm_add_epi32(_mm_add_epi32(say0, say1), _mm_add_epi32(say2, say3));
                    __m128i dst0 = _mm_srai_epi32(_mm_add_epi32(sum, ROUND), Base::BICUBIC_SHIFT);
                    // Pack 4 int32 -> 4 saturated bytes, stored as one 32-bit write.
                    *((int32_t*)(dst + i)) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(dst0, K_ZERO), K_ZERO));
                }
            }
            // Scalar tail, same rounding and clamping as the vector path.
            for (; i < n; ++i)
            {
                int32_t sum = ay[0] * src0[i] + ay[1] * src1[i] + ay[2] * src2[i] + ay[3] * src3[i];
                dst[i] = Base::RestrictRange((sum + Base::BICUBIC_ROUND) >> Base::BICUBIC_SHIFT, 0, 255);
            }
        }
368 | | |
        // Buffered path: reuses horizontally filtered rows between consecutive
        // destination rows via the 4-entry ring _bx (slot index (row + 1) & 3).
        // `prev` tracks the highest source row already x-filtered, so each source
        // row is filtered at most once even when adjacent output rows share taps.
        template<int N> void ResizerByteBicubic::RunB(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride)
        {
            int32_t prev = -1;
            for (size_t dy = 0; dy < _param.dstH; dy++, dst += dstStride)
            {
                int32_t sy = _iy[dy], next = prev;
                // The 4 taps of this output row are source rows sy-1 .. sy+2;
                // filter only those not produced for a previous output row.
                for (int32_t curr = sy - 1, end = sy + 3; curr < end; ++curr)
                {
                    if (curr < prev)
                        continue;
                    // Clamp out-of-range taps to the image's first/last row.
                    const uint8_t* ps = src + RestrictRange(curr, 0, (int)_param.srcH - 1) * srcStride;
                    int32_t* pb = _bx[(curr + 1) & 3].data;
                    RowCubicSumX<N>(ps, _xn, _xt, _param.dstW, _ix.data, _ax.data, pb);
                    next++;
                }
                prev = next;

                // Vertical combine of the 4 ring slots holding rows sy-1 .. sy+2.
                const int32_t* ay = _ay.data + dy * 4;
                int32_t* pb0 = _bx[(sy + 0) & 3].data;
                int32_t* pb1 = _bx[(sy + 1) & 3].data;
                int32_t* pb2 = _bx[(sy + 2) & 3].data;
                int32_t* pb3 = _bx[(sy + 3) & 3].data;
                BicubicRowInt(pb0, pb1, pb2, pb3, _bx[0].size, ay, dst);
            }
        }
394 | | |
395 | | void ResizerByteBicubic::Run(const uint8_t* src, size_t srcStride, uint8_t* dst, size_t dstStride) |
396 | 0 | { |
397 | 0 | bool sparse = _param.dstH * 3.0 <= _param.srcH; |
398 | 0 | Init(sparse); |
399 | 0 | switch (_param.channels) |
400 | 0 | { |
401 | 0 | case 1: sparse ? RunS<1>(src, srcStride, dst, dstStride) : RunB<1>(src, srcStride, dst, dstStride); return; |
402 | 0 | case 2: sparse ? RunS<2>(src, srcStride, dst, dstStride) : RunB<2>(src, srcStride, dst, dstStride); return; |
403 | 0 | case 3: sparse ? RunS<3>(src, srcStride, dst, dstStride) : RunB<3>(src, srcStride, dst, dstStride); return; |
404 | 0 | case 4: sparse ? RunS<4>(src, srcStride, dst, dstStride) : RunB<4>(src, srcStride, dst, dstStride); return; |
405 | 0 | default: |
406 | | assert(0); |
407 | 0 | } |
408 | 0 | } |
409 | | } |
410 | | #endif |
411 | | } |
412 | | |