/src/Simd/src/Simd/SimdSse41Winograd2.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2022 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | #include "Simd/SimdWinograd.h" |
27 | | #include "Simd/SimdBase.h" |
28 | | |
29 | | namespace Simd |
30 | | { |
31 | | #if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) |
32 | | namespace Sse41 |
33 | | { |
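 | | // Winograd transforms for 2x2 convolution kernels (SSE4.1). Two variants follow: F(2x2, 2x2), |
 | | // which maps 3x3 input tiles to 2x2 output blocks, and F(4x4, 2x2), which maps 5x5 input tiles |
 | | // to 4x4 output blocks. The usual flow is SetFilter once per weight tensor, SetInput per image, |
 | | // a GEMM in the transform domain, and SetOutput back to the spatial layout. |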
34 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter(const __m128 src[4], float * dst, size_t stride) |
35 | 0 | { |
36 | 0 | _mm_storeu_ps(dst + 0 * stride, src[0]); |
37 | 0 | _mm_storeu_ps(dst + 1 * stride, _mm_add_ps(src[0], src[1])); |
38 | 0 | _mm_storeu_ps(dst + 2 * stride, src[1]); |
39 | | |
40 | 0 | _mm_storeu_ps(dst + 3 * stride, _mm_add_ps(src[0], src[2])); |
41 | 0 | _mm_storeu_ps(dst + 4 * stride, _mm_add_ps(_mm_add_ps(src[0], src[1]), _mm_add_ps(src[2], src[3]))); |
42 | 0 | _mm_storeu_ps(dst + 5 * stride, _mm_add_ps(src[1], src[3])); |
43 | | |
44 | 0 | _mm_storeu_ps(dst + 6 * stride, src[2]); |
45 | 0 | _mm_storeu_ps(dst + 7 * stride, _mm_add_ps(src[2], src[3])); |
46 | 0 | _mm_storeu_ps(dst + 8 * stride, src[3]); |
47 | 0 | } |
48 | | |
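 | | // Both loaders below transform four filters per call: 4n reads four consecutive 2x2 kernels and |
 | | // transposes them with Load4, while 4t reads the transposed layout in which the k-th element of |
 | | // every kernel is contiguous (row stride equals the filter count passed as 'size'). |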
49 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4n(const float* src, float* dst, size_t stride) |
50 | 0 | { |
51 | 0 | __m128 _src[4]; |
52 | 0 | Load4(src + 0, 4, _src + 0); |
53 | 0 | WinogradKernel2x2Block2x2SetFilter(_src, dst, stride); |
54 | 0 | } |
55 | | |
56 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4t(const float* src, float* dst, size_t stride) |
57 | 0 | { |
58 | 0 | __m128 _src[4]; |
59 | 0 | _src[0] = _mm_loadu_ps(src + 0 * stride); |
60 | 0 | _src[1] = _mm_loadu_ps(src + 1 * stride); |
61 | 0 | _src[2] = _mm_loadu_ps(src + 2 * stride); |
62 | 0 | _src[3] = _mm_loadu_ps(src + 3 * stride); |
63 | 0 | WinogradKernel2x2Block2x2SetFilter(_src, dst, stride); |
64 | 0 | } |
65 | | |
66 | | void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans) |
67 | 0 | { |
68 | 0 | size_t size4 = AlignLo(size, 4), i = 0; |
69 | 0 | if (trans) |
70 | 0 | { |
71 | 0 | for (; i < size4; i += 4) |
72 | 0 | WinogradKernel2x2Block2x2SetFilter4t(src + i, dst + i, size); |
73 | 0 | for (; i < size; i += 1) |
74 | 0 | Base::WinogradKernel2x2Block2x2SetFilter1t(src + i, dst + i, size); |
75 | 0 | } |
76 | 0 | else |
77 | 0 | { |
78 | 0 | for (; i < size4; i += 4, src += 16, dst += 4) |
79 | 0 | WinogradKernel2x2Block2x2SetFilter4n(src, dst, size); |
80 | 0 | for (; i < size; i += 1, src += 4, dst += 1) |
81 | 0 | Base::WinogradKernel2x2Block2x2SetFilter1n(src, dst, size); |
82 | 0 | } |
83 | 0 | } |
84 | | |
85 | | //----------------------------------------------------------------------- |
86 | | |
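 | | // F(2x2, 2x2) input transform: src holds a 3x3 input tile d (four channels per register); the |
 | | // nine stores below compute B^T*d*B with B^T = {{1,-1,0},{0,1,0},{0,-1,1}}, written as nine |
 | | // planes of 'stride' floats in the Winograd-domain buffer. |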
87 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4Store(const __m128* src, float* dst, size_t stride) |
88 | 0 | { |
89 | 0 | _mm_storeu_ps(dst + 0 * stride, _mm_add_ps(_mm_sub_ps(src[0], src[1]), _mm_sub_ps(src[4], src[3]))); |
90 | 0 | _mm_storeu_ps(dst + 1 * stride, _mm_sub_ps(src[1], src[4])); |
91 | 0 | _mm_storeu_ps(dst + 2 * stride, _mm_add_ps(_mm_sub_ps(src[2], src[1]), _mm_sub_ps(src[4], src[5]))); |
92 | 0 | _mm_storeu_ps(dst + 3 * stride, _mm_sub_ps(src[3], src[4])); |
93 | 0 | _mm_storeu_ps(dst + 4 * stride, src[4]); |
94 | 0 | _mm_storeu_ps(dst + 5 * stride, _mm_sub_ps(src[5], src[4])); |
95 | 0 | _mm_storeu_ps(dst + 6 * stride, _mm_add_ps(_mm_sub_ps(src[4], src[3]), _mm_sub_ps(src[6], src[7]))); |
96 | 0 | _mm_storeu_ps(dst + 7 * stride, _mm_sub_ps(src[7], src[4])); |
97 | 0 | _mm_storeu_ps(dst + 8 * stride, _mm_add_ps(_mm_sub_ps(src[4], src[5]), _mm_sub_ps(src[8], src[7]))); |
98 | 0 | } |
99 | | |
100 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, __m128 dst[9]) |
101 | 0 | { |
102 | 0 | dst[0] = _mm_loadu_ps(src + 0 * srcS + 0 * srcC); |
103 | 0 | dst[1] = _mm_loadu_ps(src + 0 * srcS + 1 * srcC); |
104 | 0 | dst[2] = _mm_loadu_ps(src + 0 * srcS + 2 * srcC); |
105 | 0 | dst[3] = _mm_loadu_ps(src + 1 * srcS + 0 * srcC); |
106 | 0 | dst[4] = _mm_loadu_ps(src + 1 * srcS + 1 * srcC); |
107 | 0 | dst[5] = _mm_loadu_ps(src + 1 * srcS + 2 * srcC); |
108 | 0 | dst[6] = _mm_loadu_ps(src + 2 * srcS + 0 * srcC); |
109 | 0 | dst[7] = _mm_loadu_ps(src + 2 * srcS + 1 * srcC); |
110 | 0 | dst[8] = _mm_loadu_ps(src + 2 * srcS + 2 * srcC); |
111 | 0 | } |
112 | | |
113 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) |
114 | 0 | { |
115 | 0 | size_t srcS = srcW * srcC; |
116 | 0 | size_t srcCF = AlignLo(srcC, F); |
117 | 0 | for (size_t c = 0; c < srcCF; c += F) |
118 | 0 | { |
119 | 0 | __m128 tmp[9]; |
120 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + c, srcS, srcC, tmp); |
121 | 0 | WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + c, dstStride); |
122 | 0 | } |
123 | 0 | if (srcCF < srcC) |
124 | 0 | { |
125 | 0 | __m128 tmp[9]; |
126 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, srcS, srcC, tmp); |
127 | 0 | WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride); |
128 | 0 | } |
129 | 0 | } |
130 | | |
131 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m128 dst[9]) |
132 | 0 | { |
133 | 0 | for (size_t i = 0; i < 9; ++i) |
134 | 0 | dst[i] = _mm_setzero_ps(); |
135 | 0 | for (size_t row = rowB; row < rowE; ++row) |
136 | 0 | for (size_t col = colB; col < colE; ++col) |
137 | 0 | dst[row * 3 + col] = _mm_loadu_ps(src + row * srcS + col * srcC); |
138 | 0 | } |
139 | | |
140 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) |
141 | 0 | { |
142 | 0 | size_t srcS = srcW * srcC; |
143 | 0 | size_t srcCF = AlignLo(srcC, F); |
144 | 0 | for (size_t c = 0; c < srcCF; c += F) |
145 | 0 | { |
146 | 0 | __m128 tmp[9]; |
147 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); |
148 | 0 | WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + c, dstStride); |
149 | 0 | } |
150 | 0 | if (srcCF < srcC) |
151 | 0 | { |
152 | 0 | __m128 tmp[9]; |
153 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); |
154 | 0 | WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride); |
155 | 0 | } |
156 | 0 | } |
157 | | |
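 | | // Public entry point: padding must be symmetric and at most one pixel (see the assert below); |
 | | // the SSE4.1 path is used only for the channel-interleaved (trans) layout with at least F = 4 |
 | | // channels, otherwise the scalar Base:: implementation is called. |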
158 | | void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, |
159 | | size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) |
160 | 0 | { |
161 | 0 | assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); |
162 | 0 | if (trans ? (srcChannels < F) : true) |
163 | 0 | { |
164 | 0 | Base::WinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); |
165 | 0 | return; |
166 | 0 | } |
167 | 0 | size_t dstH = srcHeight - 1 + padY + padH; |
168 | 0 | size_t dstW = srcWidth - 1 + padX + padW; |
169 | 0 | size_t dstH2 = AlignLo(dstH, 2); |
170 | 0 | size_t dstW2 = AlignLo(dstW, 2); |
171 | 0 | size_t noseW = Simd::Min<size_t>(3, dstW + 1); |
172 | 0 | size_t noseH = Simd::Min<size_t>(3, dstH + 1); |
173 | 0 | size_t startY = padY ? 2 : 0; |
174 | 0 | size_t startX = padX ? 2 : 0; |
175 | 0 | if (padY || padH) |
176 | 0 | { |
177 | 0 | if (dstH == dstH2) |
178 | 0 | dstH2 -= 2; |
179 | 0 | if (dstW == dstW2) |
180 | 0 | dstW2 -= 2; |
181 | 0 | if (padY) |
182 | 0 | src -= (srcWidth + 1) * (trans ? srcChannels : 1); |
183 | 0 | } |
184 | 0 | size_t tailW = dstW - dstW2 + (padW ? 0 : 1); |
185 | 0 | size_t tailH = dstH - dstH2 + (padH ? 0 : 1); |
186 | 0 | size_t row = 0, col = 0; |
187 | 0 | if (padY) |
188 | 0 | { |
189 | 0 | if (padX) |
190 | 0 | WinogradKernel2x2Block2x2SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; |
191 | 0 | for (col = startX; col < dstW2; col += 2) |
192 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels; |
193 | 0 | if (col < dstW) |
194 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; |
195 | 0 | } |
196 | 0 | for (row = startY; row < dstH2; row += 2) |
197 | 0 | { |
198 | 0 | if (padX) |
199 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels; |
200 | 0 | for (col = startX; col < dstW2; col += 2) |
201 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; |
202 | 0 | if (col < dstW) |
203 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels; |
204 | 0 | } |
205 | 0 | if (row < dstH) |
206 | 0 | { |
207 | 0 | if (padX) |
208 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; |
209 | 0 | for (col = startX; col < dstW2; col += 2) |
210 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels; |
211 | 0 | if (col < dstW) |
212 | 0 | WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; |
213 | 0 | } |
214 | 0 | } |
215 | | |
216 | | //----------------------------------------------------------------------- |
217 | | |
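 | | // F(2x2, 2x2) output transform: loads the nine Winograd-domain planes of a tile and reduces |
 | | // them to a 2x2 spatial block, dst = A^T*m*A with A^T = {{1,1,0},{0,1,1}} (plain 2x2 sums). |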
218 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputLoad9(const float* src, size_t stride, __m128* dst) |
219 | 0 | { |
220 | 0 | __m128 s[9]; |
221 | 0 | s[0] = _mm_loadu_ps(src + 0 * stride); |
222 | 0 | s[1] = _mm_loadu_ps(src + 1 * stride); |
223 | 0 | s[2] = _mm_loadu_ps(src + 2 * stride); |
224 | 0 | s[3] = _mm_loadu_ps(src + 3 * stride); |
225 | 0 | s[4] = _mm_loadu_ps(src + 4 * stride); |
226 | 0 | s[5] = _mm_loadu_ps(src + 5 * stride); |
227 | 0 | s[6] = _mm_loadu_ps(src + 6 * stride); |
228 | 0 | s[7] = _mm_loadu_ps(src + 7 * stride); |
229 | 0 | s[8] = _mm_loadu_ps(src + 8 * stride); |
230 | 0 | dst[0] = _mm_add_ps(_mm_add_ps(s[0], s[1]), _mm_add_ps(s[3], s[4])); |
231 | 0 | dst[1] = _mm_add_ps(_mm_add_ps(s[1], s[2]), _mm_add_ps(s[4], s[5])); |
232 | 0 | dst[2] = _mm_add_ps(_mm_add_ps(s[3], s[4]), _mm_add_ps(s[6], s[7])); |
233 | 0 | dst[3] = _mm_add_ps(_mm_add_ps(s[4], s[5]), _mm_add_ps(s[7], s[8])); |
234 | 0 | } |
235 | | |
236 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const __m128 src[4], float* dst, size_t dstS, size_t dstC) |
237 | 0 | { |
238 | 0 | _mm_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); |
239 | 0 | _mm_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); |
240 | 0 | _mm_storeu_ps(dst + 1 * dstS + 0 * dstC, src[2]); |
241 | 0 | _mm_storeu_ps(dst + 1 * dstS + 1 * dstC, src[3]); |
242 | 0 | } |
243 | | |
244 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) |
245 | 0 | { |
246 | 0 | size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); |
247 | 0 | for (size_t d = 0; d < dstCF; d += F) |
248 | 0 | { |
249 | 0 | __m128 tmp[4]; |
250 | 0 | WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); |
251 | 0 | WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC); |
252 | 0 | } |
253 | 0 | if (dstCF < dstC) |
254 | 0 | { |
255 | 0 | __m128 tmp[4]; |
256 | 0 | WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp); |
257 | 0 | WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC); |
258 | 0 | } |
259 | 0 | } |
260 | | |
261 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const __m128 src[4], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) |
262 | 0 | { |
263 | 0 | for (size_t row = 0; row < rowE; ++row) |
264 | 0 | for (size_t col = 0; col < colE; ++col) |
265 | 0 | _mm_storeu_ps(dst + row * dstS + col * dstC, src[row * 2 + col]); |
266 | 0 | } |
267 | | |
268 | | SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) |
269 | 0 | { |
270 | 0 | size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); |
271 | 0 | for (size_t d = 0; d < dstCF; d += F) |
272 | 0 | { |
273 | 0 | __m128 tmp[4]; |
274 | 0 | WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp); |
275 | 0 | WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE); |
276 | 0 | } |
277 | 0 | if (dstCF < dstC) |
278 | 0 | { |
279 | 0 | __m128 tmp[4]; |
280 | 0 | WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp); |
281 | 0 | WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC, rowE, colE); |
282 | 0 | } |
283 | 0 | } |
284 | | |
285 | | void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) |
286 | 0 | { |
287 | 0 | if (trans ? (dstChannels < F) : true) |
288 | 0 | { |
289 | 0 | Base::WinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); |
290 | 0 | return; |
291 | 0 | } |
292 | 0 | size_t tileH = (dstHeight + 1) / 2; |
293 | 0 | size_t tileW = (dstWidth + 1) / 2; |
294 | 0 | size_t dstH2 = AlignLo(dstHeight, 2); |
295 | 0 | size_t dstW2 = AlignLo(dstWidth, 2); |
296 | 0 | size_t row, col; |
297 | 0 | for (row = 0; row < dstH2; row += 2) |
298 | 0 | { |
299 | 0 | for (col = 0; col < dstW2; col += 2) |
300 | 0 | WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; |
301 | 0 | if (col < dstWidth) |
302 | 0 | WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels; |
303 | 0 | } |
304 | 0 | if (row < dstHeight) |
305 | 0 | { |
306 | 0 | for (col = 0; col < dstW2; col += 2) |
307 | 0 | WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels; |
308 | 0 | if (col < dstWidth) |
309 | 0 | WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; |
310 | 0 | } |
311 | 0 | } |
312 | | |
313 | | //----------------------------------------------------------------------- |
314 | | |
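 | | // F(4x4, 2x2) filter transform: each 2x2 kernel g is expanded into a 5x5 Winograd-domain tile |
 | | // G*g*G^T, where the rows of the 5x2 matrix G are (1/2,0), (-1/2,-1/2), (-1/6,1/6), (1/6,1/3) |
 | | // and (0,1); SetFilterRow applies the same weights along the second dimension. |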
315 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetFilterRow(const __m128* t, float* dst, size_t stride) |
316 | 0 | { |
317 | 0 | const __m128 r2 = _mm_set1_ps(1.0f / 2.0f); |
318 | 0 | const __m128 r3 = _mm_set1_ps(1.0f / 3.0f); |
319 | 0 | const __m128 r6 = _mm_set1_ps(1.0f / 6.0f); |
320 | 0 | const __m128 mr2 = _mm_set1_ps(-1.0f / 2.0f); |
321 | | |
322 | 0 | _mm_storeu_ps(dst + 0 * stride, _mm_mul_ps(r2, t[0])); |
323 | 0 | _mm_storeu_ps(dst + 1 * stride, _mm_mul_ps(mr2, _mm_add_ps(t[0], t[1]))); |
324 | 0 | _mm_storeu_ps(dst + 2 * stride, _mm_mul_ps(r6, _mm_sub_ps(t[1], t[0]))); |
325 | 0 | _mm_storeu_ps(dst + 3 * stride, _mm_add_ps(_mm_mul_ps(r6, t[0]), _mm_mul_ps(r3, t[1]))); |
326 | 0 | _mm_storeu_ps(dst + 4 * stride, t[1]); |
327 | 0 | } |
328 | | |
329 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter(const __m128 src[4], float* dst, size_t stride) |
330 | 0 | { |
331 | 0 | const __m128 r2 = _mm_set1_ps(1.0f / 2.0f); |
332 | 0 | const __m128 r3 = _mm_set1_ps(1.0f / 3.0f); |
333 | 0 | const __m128 r6 = _mm_set1_ps(1.0f / 6.0f); |
334 | 0 | const __m128 mr2 = _mm_set1_ps(-1.0f / 2.0f); |
335 | | |
336 | 0 | __m128 t[2]; |
337 | 0 | t[0] = _mm_mul_ps(r2, src[0]); |
338 | 0 | t[1] = _mm_mul_ps(r2, src[1]); |
339 | 0 | WinogradKernel2x2Block4x4SetFilterRow(t, dst + 0 * stride, stride); |
340 | | |
341 | 0 | t[0] = _mm_mul_ps(mr2, _mm_add_ps(src[0], src[2])); |
342 | 0 | t[1] = _mm_mul_ps(mr2, _mm_add_ps(src[1], src[3])); |
343 | 0 | WinogradKernel2x2Block4x4SetFilterRow(t, dst + 5 * stride, stride); |
344 | | |
345 | 0 | t[0] = _mm_mul_ps(r6, _mm_sub_ps(src[2], src[0])); |
346 | 0 | t[1] = _mm_mul_ps(r6, _mm_sub_ps(src[3], src[1])); |
347 | 0 | WinogradKernel2x2Block4x4SetFilterRow(t, dst + 10 * stride, stride); |
348 | | |
349 | 0 | t[0] = _mm_add_ps(_mm_mul_ps(r6, src[0]), _mm_mul_ps(r3, src[2])); |
350 | 0 | t[1] = _mm_add_ps(_mm_mul_ps(r6, src[1]), _mm_mul_ps(r3, src[3])); |
351 | 0 | WinogradKernel2x2Block4x4SetFilterRow(t, dst + 15 * stride, stride); |
352 | | |
353 | 0 | t[0] = src[2]; |
354 | 0 | t[1] = src[3]; |
355 | 0 | WinogradKernel2x2Block4x4SetFilterRow(t, dst + 20 * stride, stride); |
356 | 0 | } |
357 | | |
358 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4n(const float* src, float* dst, size_t stride) |
359 | 0 | { |
360 | 0 | __m128 _src[4]; |
361 | 0 | Load4(src + 0, 4, _src + 0); |
362 | 0 | WinogradKernel2x2Block4x4SetFilter(_src, dst, stride); |
363 | 0 | } |
364 | | |
365 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4t(const float* src, float* dst, size_t stride) |
366 | 0 | { |
367 | 0 | __m128 _src[4]; |
368 | 0 | _src[0] = _mm_loadu_ps(src + 0 * stride); |
369 | 0 | _src[1] = _mm_loadu_ps(src + 1 * stride); |
370 | 0 | _src[2] = _mm_loadu_ps(src + 2 * stride); |
371 | 0 | _src[3] = _mm_loadu_ps(src + 3 * stride); |
372 | 0 | WinogradKernel2x2Block4x4SetFilter(_src, dst, stride); |
373 | 0 | } |
374 | | |
375 | | void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans) |
376 | 0 | { |
377 | 0 | size_t size4 = AlignLo(size, 4), i = 0; |
378 | 0 | if (trans) |
379 | 0 | { |
380 | 0 | for (; i < size4; i += 4) |
381 | 0 | WinogradKernel2x2Block4x4SetFilter4t(src + i, dst + i, size); |
382 | 0 | for (; i < size; i += 1) |
383 | 0 | Base::WinogradKernel2x2Block4x4SetFilter1t(src + i, dst + i, size); |
384 | 0 | } |
385 | 0 | else |
386 | 0 | { |
387 | 0 | for (; i < size4; i += 4, src += 16, dst += 4) |
388 | 0 | WinogradKernel2x2Block4x4SetFilter4n(src, dst, size); |
389 | 0 | for (; i < size; i += 1, src += 4, dst += 1) |
390 | 0 | Base::WinogradKernel2x2Block4x4SetFilter1n(src, dst, size); |
391 | 0 | } |
392 | 0 | } |
393 | | |
394 | | //----------------------------------------------------------------------- |
395 | | |
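 | | // F(4x4, 2x2) input transform: each 5x5 input tile is transformed column-wise (the tmp[] rows |
 | | // computed in SetInputStore) and then row-wise (SetInputStoreRow) with integer weights 2 and 3, |
 | | // producing 25 Winograd-domain planes per tile. |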
396 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStoreRow(const __m128 tmp[5], float* dst, size_t stride) |
397 | 0 | { |
398 | 0 | const __m128 _2 = _mm_set1_ps(2.0f); |
399 | 0 | const __m128 _3 = _mm_set1_ps(3.0f); |
400 | 0 | _mm_storeu_ps(dst + 0 * stride, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, tmp[0]), tmp[1]), _mm_sub_ps(tmp[3], _mm_mul_ps(_2, tmp[2])))); |
401 | 0 | _mm_storeu_ps(dst + 1 * stride, _mm_sub_ps(tmp[3], _mm_add_ps(_mm_mul_ps(_2, tmp[1]), tmp[2]))); |
402 | 0 | _mm_storeu_ps(dst + 2 * stride, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, tmp[1]), _mm_mul_ps(_3, tmp[2])), tmp[3])); |
403 | 0 | _mm_storeu_ps(dst + 3 * stride, _mm_sub_ps(tmp[3], tmp[1])); |
404 | 0 | _mm_storeu_ps(dst + 4 * stride, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, tmp[1]), tmp[2]), _mm_sub_ps(tmp[4], _mm_mul_ps(_2, tmp[3])))); |
405 | 0 | } |
406 | | |
407 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStore(const __m128* src, float* dst, size_t stride) |
408 | 0 | { |
409 | 0 | const __m128 _2 = _mm_set1_ps(2.0f); |
410 | 0 | const __m128 _3 = _mm_set1_ps(3.0f); |
411 | 0 | __m128 tmp[5]; |
412 | 0 | tmp[0] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[0]), src[5]), _mm_sub_ps(src[15], _mm_mul_ps(_2, src[10]))); |
413 | 0 | tmp[1] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[1]), src[6]), _mm_sub_ps(src[16], _mm_mul_ps(_2, src[11]))); |
414 | 0 | tmp[2] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[2]), src[7]), _mm_sub_ps(src[17], _mm_mul_ps(_2, src[12]))); |
415 | 0 | tmp[3] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[3]), src[8]), _mm_sub_ps(src[18], _mm_mul_ps(_2, src[13]))); |
416 | 0 | tmp[4] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[4]), src[9]), _mm_sub_ps(src[19], _mm_mul_ps(_2, src[14]))); |
417 | 0 | WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 0 * stride, stride); |
418 | | |
419 | 0 | tmp[0] = _mm_sub_ps(src[15], _mm_add_ps(_mm_mul_ps(_2, src[5]), src[10])); |
420 | 0 | tmp[1] = _mm_sub_ps(src[16], _mm_add_ps(_mm_mul_ps(_2, src[6]), src[11])); |
421 | 0 | tmp[2] = _mm_sub_ps(src[17], _mm_add_ps(_mm_mul_ps(_2, src[7]), src[12])); |
422 | 0 | tmp[3] = _mm_sub_ps(src[18], _mm_add_ps(_mm_mul_ps(_2, src[8]), src[13])); |
423 | 0 | tmp[4] = _mm_sub_ps(src[19], _mm_add_ps(_mm_mul_ps(_2, src[9]), src[14])); |
424 | 0 | WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 5 * stride, stride); |
425 | | |
426 | 0 | tmp[0] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[5]), _mm_mul_ps(_3, src[10])), src[15]); |
427 | 0 | tmp[1] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[6]), _mm_mul_ps(_3, src[11])), src[16]); |
428 | 0 | tmp[2] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[7]), _mm_mul_ps(_3, src[12])), src[17]); |
429 | 0 | tmp[3] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[8]), _mm_mul_ps(_3, src[13])), src[18]); |
430 | 0 | tmp[4] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[9]), _mm_mul_ps(_3, src[14])), src[19]); |
431 | 0 | WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 10 * stride, stride); |
432 | | |
433 | 0 | tmp[0] = _mm_sub_ps(src[15], src[5]); |
434 | 0 | tmp[1] = _mm_sub_ps(src[16], src[6]); |
435 | 0 | tmp[2] = _mm_sub_ps(src[17], src[7]); |
436 | 0 | tmp[3] = _mm_sub_ps(src[18], src[8]); |
437 | 0 | tmp[4] = _mm_sub_ps(src[19], src[9]); |
438 | 0 | WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 15 * stride, stride); |
439 | | |
440 | 0 | tmp[0] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[5]), src[10]), _mm_sub_ps(src[20], _mm_mul_ps(_2, src[15]))); |
441 | 0 | tmp[1] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[6]), src[11]), _mm_sub_ps(src[21], _mm_mul_ps(_2, src[16]))); |
442 | 0 | tmp[2] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[7]), src[12]), _mm_sub_ps(src[22], _mm_mul_ps(_2, src[17]))); |
443 | 0 | tmp[3] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[8]), src[13]), _mm_sub_ps(src[23], _mm_mul_ps(_2, src[18]))); |
444 | 0 | tmp[4] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[9]), src[14]), _mm_sub_ps(src[24], _mm_mul_ps(_2, src[19]))); |
445 | 0 | WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 20 * stride, stride); |
446 | 0 | } |
447 | | |
448 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, __m128 dst[25]) |
449 | 0 | { |
450 | 0 | dst[0] = _mm_loadu_ps(src + 0 * srcS + 0 * srcC); |
451 | 0 | dst[1] = _mm_loadu_ps(src + 0 * srcS + 1 * srcC); |
452 | 0 | dst[2] = _mm_loadu_ps(src + 0 * srcS + 2 * srcC); |
453 | 0 | dst[3] = _mm_loadu_ps(src + 0 * srcS + 3 * srcC); |
454 | 0 | dst[4] = _mm_loadu_ps(src + 0 * srcS + 4 * srcC); |
455 | 0 | dst[5] = _mm_loadu_ps(src + 1 * srcS + 0 * srcC); |
456 | 0 | dst[6] = _mm_loadu_ps(src + 1 * srcS + 1 * srcC); |
457 | 0 | dst[7] = _mm_loadu_ps(src + 1 * srcS + 2 * srcC); |
458 | 0 | dst[8] = _mm_loadu_ps(src + 1 * srcS + 3 * srcC); |
459 | 0 | dst[9] = _mm_loadu_ps(src + 1 * srcS + 4 * srcC); |
460 | 0 | dst[10] = _mm_loadu_ps(src + 2 * srcS + 0 * srcC); |
461 | 0 | dst[11] = _mm_loadu_ps(src + 2 * srcS + 1 * srcC); |
462 | 0 | dst[12] = _mm_loadu_ps(src + 2 * srcS + 2 * srcC); |
463 | 0 | dst[13] = _mm_loadu_ps(src + 2 * srcS + 3 * srcC); |
464 | 0 | dst[14] = _mm_loadu_ps(src + 2 * srcS + 4 * srcC); |
465 | 0 | dst[15] = _mm_loadu_ps(src + 3 * srcS + 0 * srcC); |
466 | 0 | dst[16] = _mm_loadu_ps(src + 3 * srcS + 1 * srcC); |
467 | 0 | dst[17] = _mm_loadu_ps(src + 3 * srcS + 2 * srcC); |
468 | 0 | dst[18] = _mm_loadu_ps(src + 3 * srcS + 3 * srcC); |
469 | 0 | dst[19] = _mm_loadu_ps(src + 3 * srcS + 4 * srcC); |
470 | 0 | dst[20] = _mm_loadu_ps(src + 4 * srcS + 0 * srcC); |
471 | 0 | dst[21] = _mm_loadu_ps(src + 4 * srcS + 1 * srcC); |
472 | 0 | dst[22] = _mm_loadu_ps(src + 4 * srcS + 2 * srcC); |
473 | 0 | dst[23] = _mm_loadu_ps(src + 4 * srcS + 3 * srcC); |
474 | 0 | dst[24] = _mm_loadu_ps(src + 4 * srcS + 4 * srcC); |
475 | 0 | } |
476 | | |
477 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride) |
478 | 0 | { |
479 | 0 | size_t srcS = srcW * srcC; |
480 | 0 | size_t srcCF = AlignLo(srcC, F); |
481 | 0 | for (size_t c = 0; c < srcCF; c += F) |
482 | 0 | { |
483 | 0 | __m128 tmp[25]; |
484 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + c, srcS, srcC, tmp); |
485 | 0 | WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); |
486 | 0 | } |
487 | 0 | if (srcCF < srcC) |
488 | 0 | { |
489 | 0 | __m128 tmp[25]; |
490 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, srcS, srcC, tmp); |
491 | 0 | WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride); |
492 | 0 | } |
493 | 0 | } |
494 | | |
495 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m128 dst[25]) |
496 | 0 | { |
497 | 0 | for (size_t i = 0; i < 25; ++i) |
498 | 0 | dst[i] = _mm_setzero_ps(); |
499 | 0 | for (size_t row = rowB; row < rowE; ++row) |
500 | 0 | for (size_t col = colB; col < colE; ++col) |
501 | 0 | dst[row * 5 + col] = _mm_loadu_ps(src + row * srcS + col * srcC); |
502 | 0 | } |
503 | | |
504 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride) |
505 | 0 | { |
506 | 0 | size_t srcS = srcW * srcC; |
507 | 0 | size_t srcCF = AlignLo(srcC, F); |
508 | 0 | for (size_t c = 0; c < srcCF; c += F) |
509 | 0 | { |
510 | 0 | __m128 tmp[25]; |
511 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp); |
512 | 0 | WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride); |
513 | 0 | } |
514 | 0 | if (srcCF < srcC) |
515 | 0 | { |
516 | 0 | __m128 tmp[25]; |
517 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp); |
518 | 0 | WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride); |
519 | 0 | } |
520 | 0 | } |
521 | | |
522 | | void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth, |
523 | | size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans) |
524 | 0 | { |
525 | 0 | assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1)); |
526 | 0 | if (trans ? (srcChannels < F) : true) |
527 | 0 | { |
528 | 0 | Base::WinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans); |
529 | 0 | return; |
530 | 0 | } |
531 | 0 | size_t dstH = srcHeight - 1 + padY + padH; |
532 | 0 | size_t dstW = srcWidth - 1 + padX + padW; |
533 | 0 | size_t dstH4 = AlignLo(dstH, 4); |
534 | 0 | size_t dstW4 = AlignLo(dstW, 4); |
535 | 0 | size_t noseW = Simd::Min<size_t>(5, dstW + 1); |
536 | 0 | size_t noseH = Simd::Min<size_t>(5, dstH + 1); |
537 | 0 | size_t startY = padY ? 4 : 0; |
538 | 0 | size_t startX = padX ? 4 : 0; |
539 | 0 | if (padY || padH) |
540 | 0 | { |
541 | 0 | if (dstH == dstH4) |
542 | 0 | dstH4 -= 4; |
543 | 0 | if (dstW == dstW4) |
544 | 0 | dstW4 -= 4; |
545 | 0 | if (padY) |
546 | 0 | src -= (srcWidth + 1) * (trans ? srcChannels : 1); |
547 | 0 | } |
548 | 0 | size_t tailW = dstW - dstW4 + (padW ? 0 : 1); |
549 | 0 | size_t tailH = dstH - dstH4 + (padH ? 0 : 1); |
550 | 0 | size_t row = 0, col = 0; |
551 | 0 | if (padY) |
552 | 0 | { |
553 | 0 | if (padX) |
554 | 0 | WinogradKernel2x2Block4x4SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels; |
555 | 0 | for (col = startX; col < dstW4; col += 4) |
556 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels; |
557 | 0 | if (col < dstW) |
558 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels; |
559 | 0 | } |
560 | 0 | for (row = startY; row < dstH4; row += 4) |
561 | 0 | { |
562 | 0 | if (padX) |
563 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels; |
564 | 0 | for (col = startX; col < dstW4; col += 4) |
565 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels; |
566 | 0 | if (col < dstW) |
567 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels; |
568 | 0 | } |
569 | 0 | if (row < dstH) |
570 | 0 | { |
571 | 0 | if (padX) |
572 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels; |
573 | 0 | for (col = startX; col < dstW4; col += 4) |
574 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels; |
575 | 0 | if (col < dstW) |
576 | 0 | WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels; |
577 | 0 | } |
578 | 0 | } |
579 | | |
580 | | //----------------------------------------------------------------------- |
581 | | |
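 | | // F(4x4, 2x2) output transform: the 25 Winograd-domain planes of a tile are reduced to a 4x4 |
 | | // spatial block, first across the rows of the 5x5 tile and then within each row (GetRow), using |
 | | // the weights 1, 2, 4 and 8. |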
582 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputGetRow(const __m128 * s, __m128 * d) |
583 | 0 | { |
584 | 0 | const __m128 _2 = _mm_set1_ps(2.0f); |
585 | 0 | const __m128 _4 = _mm_set1_ps(4.0f); |
586 | 0 | const __m128 _8 = _mm_set1_ps(8.0f); |
587 | 0 | d[0] = _mm_add_ps(_mm_add_ps(s[0], s[1]), _mm_add_ps(s[2], s[3])); |
588 | 0 | d[1] = _mm_add_ps(_mm_sub_ps(s[1], s[2]), _mm_mul_ps(_2, s[3])); |
589 | 0 | d[2] = _mm_add_ps(_mm_add_ps(s[1], s[2]), _mm_mul_ps(_4, s[3])); |
590 | 0 | d[3] = _mm_add_ps(_mm_sub_ps(s[1], s[2]), _mm_add_ps(_mm_mul_ps(_8, s[3]), s[4])); |
591 | 0 | } |
592 | | |
593 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputLoad25(const float* src, size_t stride, __m128* dst) |
594 | 0 | { |
595 | 0 | __m128 s[25]; |
596 | 0 | s[0] = _mm_loadu_ps(src + 0 * stride); |
597 | 0 | s[1] = _mm_loadu_ps(src + 1 * stride); |
598 | 0 | s[2] = _mm_loadu_ps(src + 2 * stride); |
599 | 0 | s[3] = _mm_loadu_ps(src + 3 * stride); |
600 | 0 | s[4] = _mm_loadu_ps(src + 4 * stride); |
601 | 0 | s[5] = _mm_loadu_ps(src + 5 * stride); |
602 | 0 | s[6] = _mm_loadu_ps(src + 6 * stride); |
603 | 0 | s[7] = _mm_loadu_ps(src + 7 * stride); |
604 | 0 | s[8] = _mm_loadu_ps(src + 8 * stride); |
605 | 0 | s[9] = _mm_loadu_ps(src + 9 * stride); |
606 | 0 | s[10] = _mm_loadu_ps(src + 10 * stride); |
607 | 0 | s[11] = _mm_loadu_ps(src + 11 * stride); |
608 | 0 | s[12] = _mm_loadu_ps(src + 12 * stride); |
609 | 0 | s[13] = _mm_loadu_ps(src + 13 * stride); |
610 | 0 | s[14] = _mm_loadu_ps(src + 14 * stride); |
611 | 0 | s[15] = _mm_loadu_ps(src + 15 * stride); |
612 | 0 | s[16] = _mm_loadu_ps(src + 16 * stride); |
613 | 0 | s[17] = _mm_loadu_ps(src + 17 * stride); |
614 | 0 | s[18] = _mm_loadu_ps(src + 18 * stride); |
615 | 0 | s[19] = _mm_loadu_ps(src + 19 * stride); |
616 | 0 | s[20] = _mm_loadu_ps(src + 20 * stride); |
617 | 0 | s[21] = _mm_loadu_ps(src + 21 * stride); |
618 | 0 | s[22] = _mm_loadu_ps(src + 22 * stride); |
619 | 0 | s[23] = _mm_loadu_ps(src + 23 * stride); |
620 | 0 | s[24] = _mm_loadu_ps(src + 24 * stride); |
621 | | |
622 | 0 | const __m128 _2 = _mm_set1_ps(2.0f); |
623 | 0 | const __m128 _4 = _mm_set1_ps(4.0f); |
624 | 0 | const __m128 _8 = _mm_set1_ps(8.0f); |
625 | 0 | __m128 t[5]; |
626 | 0 | t[0] = _mm_add_ps(_mm_add_ps(s[0], s[5]), _mm_add_ps(s[10], s[15])); |
627 | 0 | t[1] = _mm_add_ps(_mm_add_ps(s[1], s[6]), _mm_add_ps(s[11], s[16])); |
628 | 0 | t[2] = _mm_add_ps(_mm_add_ps(s[2], s[7]), _mm_add_ps(s[12], s[17])); |
629 | 0 | t[3] = _mm_add_ps(_mm_add_ps(s[3], s[8]), _mm_add_ps(s[13], s[18])); |
630 | 0 | t[4] = _mm_add_ps(_mm_add_ps(s[4], s[9]), _mm_add_ps(s[14], s[19])); |
631 | 0 | WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 0); |
632 | | |
633 | 0 | t[0] = _mm_add_ps(_mm_sub_ps(s[5], s[10]), _mm_mul_ps(_2, s[15])); |
634 | 0 | t[1] = _mm_add_ps(_mm_sub_ps(s[6], s[11]), _mm_mul_ps(_2, s[16])); |
635 | 0 | t[2] = _mm_add_ps(_mm_sub_ps(s[7], s[12]), _mm_mul_ps(_2, s[17])); |
636 | 0 | t[3] = _mm_add_ps(_mm_sub_ps(s[8], s[13]), _mm_mul_ps(_2, s[18])); |
637 | 0 | t[4] = _mm_add_ps(_mm_sub_ps(s[9], s[14]), _mm_mul_ps(_2, s[19])); |
638 | 0 | WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 4); |
639 | | |
640 | 0 | t[0] = _mm_add_ps(_mm_add_ps(s[5], s[10]), _mm_mul_ps(_4, s[15])); |
641 | 0 | t[1] = _mm_add_ps(_mm_add_ps(s[6], s[11]), _mm_mul_ps(_4, s[16])); |
642 | 0 | t[2] = _mm_add_ps(_mm_add_ps(s[7], s[12]), _mm_mul_ps(_4, s[17])); |
643 | 0 | t[3] = _mm_add_ps(_mm_add_ps(s[8], s[13]), _mm_mul_ps(_4, s[18])); |
644 | 0 | t[4] = _mm_add_ps(_mm_add_ps(s[9], s[14]), _mm_mul_ps(_4, s[19])); |
645 | 0 | WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 8); |
646 | | |
647 | 0 | t[0] = _mm_add_ps(_mm_sub_ps(s[5], s[10]), _mm_add_ps(_mm_mul_ps(_8, s[15]), s[20])); |
648 | 0 | t[1] = _mm_add_ps(_mm_sub_ps(s[6], s[11]), _mm_add_ps(_mm_mul_ps(_8, s[16]), s[21])); |
649 | 0 | t[2] = _mm_add_ps(_mm_sub_ps(s[7], s[12]), _mm_add_ps(_mm_mul_ps(_8, s[17]), s[22])); |
650 | 0 | t[3] = _mm_add_ps(_mm_sub_ps(s[8], s[13]), _mm_add_ps(_mm_mul_ps(_8, s[18]), s[23])); |
651 | 0 | t[4] = _mm_add_ps(_mm_sub_ps(s[9], s[14]), _mm_add_ps(_mm_mul_ps(_8, s[19]), s[24])); |
652 | 0 | WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 12); |
653 | 0 | } |
654 | | |
655 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m128 src[16], float* dst, size_t dstS, size_t dstC) |
656 | 0 | { |
657 | 0 | _mm_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]); |
658 | 0 | _mm_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]); |
659 | 0 | _mm_storeu_ps(dst + 0 * dstS + 2 * dstC, src[2]); |
660 | 0 | _mm_storeu_ps(dst + 0 * dstS + 3 * dstC, src[3]); |
661 | 0 | _mm_storeu_ps(dst + 1 * dstS + 0 * dstC, src[4]); |
662 | 0 | _mm_storeu_ps(dst + 1 * dstS + 1 * dstC, src[5]); |
663 | 0 | _mm_storeu_ps(dst + 1 * dstS + 2 * dstC, src[6]); |
664 | 0 | _mm_storeu_ps(dst + 1 * dstS + 3 * dstC, src[7]); |
665 | 0 | _mm_storeu_ps(dst + 2 * dstS + 0 * dstC, src[8]); |
666 | 0 | _mm_storeu_ps(dst + 2 * dstS + 1 * dstC, src[9]); |
667 | 0 | _mm_storeu_ps(dst + 2 * dstS + 2 * dstC, src[10]); |
668 | 0 | _mm_storeu_ps(dst + 2 * dstS + 3 * dstC, src[11]); |
669 | 0 | _mm_storeu_ps(dst + 3 * dstS + 0 * dstC, src[12]); |
670 | 0 | _mm_storeu_ps(dst + 3 * dstS + 1 * dstC, src[13]); |
671 | 0 | _mm_storeu_ps(dst + 3 * dstS + 2 * dstC, src[14]); |
672 | 0 | _mm_storeu_ps(dst + 3 * dstS + 3 * dstC, src[15]); |
673 | 0 | } |
674 | | |
675 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC) |
676 | 0 | { |
677 | 0 | size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); |
678 | 0 | for (size_t d = 0; d < dstCF; d += F) |
679 | 0 | { |
680 | 0 | __m128 tmp[16]; |
681 | 0 | WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); |
682 | 0 | WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC); |
683 | 0 | } |
684 | 0 | if (dstCF < dstC) |
685 | 0 | { |
686 | 0 | __m128 tmp[16]; |
687 | 0 | WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp); |
688 | 0 | WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC); |
689 | 0 | } |
690 | 0 | } |
691 | | |
692 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m128 src[16], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE) |
693 | 0 | { |
694 | 0 | for (size_t row = 0; row < rowE; ++row) |
695 | 0 | for (size_t col = 0; col < colE; ++col) |
696 | 0 | _mm_storeu_ps(dst + row * dstS + col * dstC, src[row * 4 + col]); |
697 | 0 | } |
698 | | |
699 | | SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE) |
700 | 0 | { |
701 | 0 | size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F); |
702 | 0 | for (size_t d = 0; d < dstCF; d += F) |
703 | 0 | { |
704 | 0 | __m128 tmp[16]; |
705 | 0 | WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp); |
706 | 0 | WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE); |
707 | 0 | } |
708 | 0 | if (dstCF < dstC) |
709 | 0 | { |
710 | 0 | __m128 tmp[16]; |
711 | 0 | WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp); |
712 | 0 | WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC, rowE, colE); |
713 | 0 | } |
714 | 0 | } |
715 | | |
716 | | void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans) |
717 | 0 | { |
718 | 0 | if (trans ? (dstChannels < F) : true) |
719 | 0 | { |
720 | 0 | Base::WinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans); |
721 | 0 | return; |
722 | 0 | } |
723 | 0 | size_t tileH = (dstHeight + 3) / 4; |
724 | 0 | size_t tileW = (dstWidth + 3) / 4; |
725 | 0 | size_t dstH4 = AlignLo(dstHeight, 4); |
726 | 0 | size_t dstW4 = AlignLo(dstWidth, 4); |
727 | 0 | size_t row, col; |
728 | 0 | for (row = 0; row < dstH4; row += 4) |
729 | 0 | { |
730 | 0 | for (col = 0; col < dstW4; col += 4) |
731 | 0 | WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels; |
732 | 0 | if (col < dstWidth) |
733 | 0 | WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels; |
734 | 0 | } |
735 | 0 | if (row < dstHeight) |
736 | 0 | { |
737 | 0 | for (col = 0; col < dstW4; col += 4) |
738 | 0 | WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels; |
739 | 0 | if (col < dstWidth) |
740 | 0 | WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels; |
741 | 0 | } |
742 | 0 | } |
743 | | } |
744 | | #endif |
745 | | } |