/src/Simd/src/Simd/SimdSse41SynetGridSample2d32fBlZ.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2025 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | |
25 | | #include "Simd/SimdSynetGridSample.h" |
26 | | |
27 | | #include "Simd/SimdLoad.h" |
28 | | #include "Simd/SimdSet.h" |
29 | | |
30 | | namespace Simd |
31 | | { |
32 | | #if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) |
33 | | namespace Sse41 |
34 | | { |
35 | | template <int align> SIMD_INLINE float Denormalize32f(float pos, int dim) |
36 | 0 | { |
37 | 0 | if (align) |
38 | 0 | return float((pos + 1) / 2.0f * (dim - 1)); |
39 | 0 | else |
40 | 0 | return float(((pos + 1) * dim - 1) / 2.0f); |
41 | 0 | } Unexecuted instantiation: float Simd::Sse41::Denormalize32f<1>(float, int) Unexecuted instantiation: float Simd::Sse41::Denormalize32f<0>(float, int) |
42 | | |
43 | | template<int align, int range> void IndexCoeffs32fBlZ(const float* grd, size_t dstS, int srcH, int srcW, int padW, uint32_t* idx, float* dy, float* dx, int& yMin, int& yMax) |
44 | 0 | { |
45 | 0 | size_t dstSF = AlignLo(dstS, F), d = 0; |
46 | 0 | const __m128 a = SetFloat((srcW - align) / 2.0f, (srcH - align) / 2.0f); |
47 | 0 | const __m128 b = SetFloat((srcW - 1) / 2.0f, (srcH - 1) / 2.0f); |
48 | 0 | const __m128i _0 = _mm_setzero_si128(); |
49 | 0 | const __m128i _2 = _mm_set1_epi32(2); |
50 | 0 | const __m128i _srcH = _mm_set1_epi32(srcH + 2); |
51 | 0 | const __m128i _srcW = _mm_set1_epi32(srcW + 2); |
52 | 0 | const __m128i _padW = _mm_set1_epi32(padW); |
53 | 0 | __m128i _yMin, _yMax; |
54 | 0 | if (range) |
55 | 0 | { |
56 | 0 | _yMin = _mm_set1_epi32(yMin); |
57 | 0 | _yMax = _mm_set1_epi32(yMax); |
58 | 0 | } |
59 | 0 | for (; d < dstSF; d += F) |
60 | 0 | { |
61 | 0 | __m128 xy0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + 0), a), b); |
62 | 0 | __m128 xy1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(grd + F), a), b); |
63 | 0 | __m128 x = _mm_shuffle_ps(xy0, xy1, 0x88); |
64 | 0 | __m128 y = _mm_shuffle_ps(xy0, xy1, 0xDD); |
65 | 0 | __m128 xf = _mm_round_ps(x, _MM_FROUND_FLOOR); |
66 | 0 | __m128 yf = _mm_round_ps(y, _MM_FROUND_FLOOR); |
67 | 0 | _mm_storeu_ps(dy + d, _mm_sub_ps(y, yf)); |
68 | 0 | _mm_storeu_ps(dx + d, _mm_sub_ps(x, xf)); |
69 | 0 | __m128i xi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(xf), _2), _0), _srcW); |
70 | 0 | __m128i yi = _mm_min_epi32(_mm_max_epi32(_mm_add_epi32(_mm_cvtps_epi32(yf), _2), _0), _srcH); |
71 | 0 | _mm_storeu_si128((__m128i*)(idx + d), _mm_add_epi32(_mm_mullo_epi32(_padW, yi), xi)); |
72 | 0 | if (range) |
73 | 0 | { |
74 | 0 | _yMin = _mm_min_epi32(_yMin, yi); |
75 | 0 | _yMax = _mm_max_epi32(_yMax, yi); |
76 | 0 | } |
77 | 0 | grd += 2 * F; |
78 | 0 | } |
79 | 0 | if (range) |
80 | 0 | { |
81 | 0 | yMin = MinVal32i(_yMin); |
82 | 0 | yMax = MaxVal32i(_yMax); |
83 | 0 | } |
84 | 0 | for (; d < dstS; ++d) |
85 | 0 | { |
86 | 0 | float x = Denormalize32f<align>(grd[0], srcW); |
87 | 0 | float y = Denormalize32f<align>(grd[1], srcH); |
88 | 0 | int x0 = int(std::floor(x)); |
89 | 0 | int y0 = int(std::floor(y)); |
90 | 0 | dy[d] = y - float(y0); |
91 | 0 | dx[d] = x - float(x0); |
92 | 0 | x0 = Simd::RestrictRange(x0, -2, srcW) + 2; |
93 | 0 | y0 = Simd::RestrictRange(y0, -2, srcH) + 2; |
94 | 0 | idx[d] = padW * y0 + x0; |
95 | 0 | if (range) |
96 | 0 | { |
97 | 0 | yMin = Min(yMin, y0); |
98 | 0 | yMax = Max(yMax, y0); |
99 | 0 | } |
100 | 0 | grd += 2; |
101 | 0 | } |
102 | 0 | } Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<1, 1>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&) Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<0, 1>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&) Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<1, 0>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&) Unexecuted instantiation: void Simd::Sse41::IndexCoeffs32fBlZ<0, 0>(float const*, unsigned long, int, int, int, unsigned int*, float*, float*, int&, int&) |
103 | | |
104 | | //------------------------------------------------------------------------------------------------- |
105 | | |
106 | | void BilinearInterp32fBlZ(const float* pad0, size_t dstS, int padW, uint32_t* idx, float* dy, float* dx, float* dst) |
107 | 0 | { |
108 | 0 | size_t dstSF = AlignLo(dstS, F), d = 0; |
109 | 0 | const float* pad1 = pad0 + padW; |
110 | 0 | __m128 p0, p1, _1 = _mm_set1_ps(1.0f); |
111 | 0 | for (; d < dstSF; d += F) |
112 | 0 | { |
113 | 0 | int i0 = idx[d + 0], i1 = idx[d + 1], i2 = idx[d + 2], i3 = idx[d + 3]; |
114 | 0 | p0 = Load(pad0 + i0, pad0 + i1); |
115 | 0 | p1 = Load(pad0 + i2, pad0 + i3); |
116 | 0 | __m128 p00 = _mm_shuffle_ps(p0, p1, 0x88); |
117 | 0 | __m128 p01 = _mm_shuffle_ps(p0, p1, 0xDD); |
118 | 0 | p0 = Load(pad1 + i0, pad1 + i1); |
119 | 0 | p1 = Load(pad1 + i2, pad1 + i3); |
120 | 0 | __m128 p10 = _mm_shuffle_ps(p0, p1, 0x88); |
121 | 0 | __m128 p11 = _mm_shuffle_ps(p0, p1, 0xDD); |
122 | 0 | __m128 dy1 = _mm_loadu_ps(dy + d); |
123 | 0 | __m128 dy0 = _mm_sub_ps(_1, dy1); |
124 | 0 | __m128 dx1 = _mm_loadu_ps(dx + d); |
125 | 0 | __m128 dx0 = _mm_sub_ps(_1, dx1); |
126 | 0 | __m128 d0 = _mm_add_ps(_mm_mul_ps(dx0, p00), _mm_mul_ps(dx1, p01)); |
127 | 0 | __m128 d1 = _mm_add_ps(_mm_mul_ps(dx0, p10), _mm_mul_ps(dx1, p11)); |
128 | 0 | _mm_storeu_ps(dst + d, _mm_add_ps(_mm_mul_ps(dy0, d0), _mm_mul_ps(dy1, d1))); |
129 | 0 | } |
130 | 0 | for (; d < dstS; ++d) |
131 | 0 | { |
132 | 0 | int offs = idx[d]; |
133 | 0 | float p00 = pad0[offs + 0]; |
134 | 0 | float p01 = pad0[offs + 1]; |
135 | 0 | float p10 = pad1[offs + 0]; |
136 | 0 | float p11 = pad1[offs + 1]; |
137 | 0 | float dy1 = dy[d]; |
138 | 0 | float dy0 = 1.0f - dy1; |
139 | 0 | float dx1 = dx[d]; |
140 | 0 | float dx0 = 1.0f - dx1; |
141 | 0 | dst[d] = dy0 * (dx0 * p00 + dx1 * p01) + dy1 * (dx0 * p10 + dx1 * p11); |
142 | 0 | } |
143 | 0 | } |
144 | | |
145 | | //------------------------------------------------------------------------------------------------- |
146 | | |
147 | | SynetGridSample2d32fBlZ::SynetGridSample2d32fBlZ(const GridSample2dParam& param) |
148 | 0 | : Base::SynetGridSample2d32fBlZ(param) |
149 | 0 | { |
150 | 0 | if (_sparse) |
151 | 0 | _indexCoeffs = _param.align ? IndexCoeffs32fBlZ<1, 1> : IndexCoeffs32fBlZ<0, 1>; |
152 | 0 | else |
153 | 0 | _indexCoeffs = _param.align ? IndexCoeffs32fBlZ<1, 0> : IndexCoeffs32fBlZ<0, 0>; |
154 | 0 | _bilinearInterp = BilinearInterp32fBlZ; |
155 | 0 | } |
156 | | } |
157 | | #endif |
158 | | } |