/work/openh264/codec/processing/src/downsample/downsamplefuncs.cpp
Line | Count | Source |
1 | | /*! |
2 | | * \copy |
3 | | * Copyright (c) 2008-2013, Cisco Systems |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions |
8 | | * are met: |
9 | | * |
10 | | * * Redistributions of source code must retain the above copyright |
11 | | * notice, this list of conditions and the following disclaimer. |
12 | | * |
13 | | * * Redistributions in binary form must reproduce the above copyright |
14 | | * notice, this list of conditions and the following disclaimer in |
15 | | * the documentation and/or other materials provided with the |
16 | | * distribution. |
17 | | * |
18 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 | | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
21 | | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
22 | | * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
23 | | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
24 | | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
25 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
27 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
28 | | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | | * POSSIBILITY OF SUCH DAMAGE. |
30 | | * |
31 | | * downsample_yuv.c |
32 | | * |
33 | | * Abstract |
34 | | * Implementation for source yuv data downsampling used before spatial encoding. |
35 | | * |
36 | | * History |
37 | | * 10/24/2008 Created |
38 | | * |
39 | | *****************************************************************************/ |
40 | | |
41 | | #include "downsample.h" |
42 | | |
43 | | |
44 | | WELSVP_NAMESPACE_BEGIN |
45 | | |
46 | | |
/*!
 * \brief Halve an 8-bit plane in both dimensions (plain C path).
 *
 * Every output sample is derived from one 2x2 source cell: the two samples
 * of the upper row are averaged with rounding, the two samples of the lower
 * row likewise, and the two intermediate values are averaged with rounding.
 *
 * \param pDst        first sample of the destination plane
 * \param kiDstStride offset in bytes between consecutive destination rows
 * \param pSrc        first sample of the source plane
 * \param kiSrcStride offset in bytes between consecutive source rows
 * \param kiSrcWidth  source width; (kiSrcWidth >> 1) columns are produced
 * \param kiSrcHeight source height; (kiSrcHeight >> 1) rows are produced
 */
void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
                                  uint8_t* pSrc, const int32_t kiSrcStride,
                                  const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  const int32_t kiOutCols = kiSrcWidth >> 1;
  const int32_t kiOutRows = kiSrcHeight >> 1;
  const uint8_t* pUpper = pSrc;   // upper source row of the current cell row
  uint8_t* pOut = pDst;           // destination row being written

  for (int32_t iRow = 0; iRow < kiOutRows; ++iRow) {
    for (int32_t iCol = 0; iCol < kiOutCols; ++iCol) {
      const int32_t kiLeft = 2 * iCol;               // left column of the 2x2 cell
      const int32_t kiBelow = kiLeft + kiSrcStride;  // same column, one row down
      const int32_t kiUpperAvg = (pUpper[kiLeft] + pUpper[kiLeft + 1] + 1) >> 1;
      const int32_t kiLowerAvg = (pUpper[kiBelow] + pUpper[kiBelow + 1] + 1) >> 1;

      pOut[iCol] = (uint8_t) ((kiUpperAvg + kiLowerAvg + 1) >> 1);
    }
    pOut += kiDstStride;
    pUpper += 2 * kiSrcStride;  // each output row consumes two source rows
  }
}
70 | | |
/*!
 * \brief Reduce an 8-bit plane to a quarter of its width and height (plain C path).
 *
 * The source is walked in 4x4 cells, but only the top-left 2x2 samples of
 * each cell contribute: the two samples of each of those rows are averaged
 * with rounding and the two row results are then averaged with rounding.
 *
 * \param pDst        first sample of the destination plane
 * \param kiDstStride offset in bytes between consecutive destination rows
 * \param pSrc        first sample of the source plane
 * \param kiSrcStride offset in bytes between consecutive source rows
 * \param kiSrcWidth  source width; (kiSrcWidth >> 2) columns are produced
 * \param kiSrcHeight source height; (kiSrcHeight >> 2) rows are produced
 */
void DyadicBilinearQuarterDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
    uint8_t* pSrc, const int32_t kiSrcStride,
    const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
  const int32_t kiOutCols = kiSrcWidth >> 2;
  const int32_t kiOutRows = kiSrcHeight >> 2;
  const uint8_t* pUpper = pSrc;   // first source row of the current cell row
  uint8_t* pOut = pDst;           // destination row being written

  for (int32_t iRow = 0; iRow < kiOutRows; ++iRow) {
    for (int32_t iCol = 0; iCol < kiOutCols; ++iCol) {
      const int32_t kiLeft = 4 * iCol;               // left column of the 4x4 cell
      const int32_t kiBelow = kiLeft + kiSrcStride;  // same column, one row down
      const int32_t kiUpperAvg = (pUpper[kiLeft] + pUpper[kiLeft + 1] + 1) >> 1;
      const int32_t kiLowerAvg = (pUpper[kiBelow] + pUpper[kiBelow + 1] + 1) >> 1;

      pOut[iCol] = (uint8_t) ((kiUpperAvg + kiLowerAvg + 1) >> 1);
    }
    pOut += kiDstStride;
    pUpper += 4 * kiSrcStride;  // each output row consumes four source rows
  }
}
94 | | |
/*!
 * \brief Reduce an 8-bit plane to one third of its width (plain C path).
 *
 * The source is walked in 3x3 cells, but only the top-left 2x2 samples of
 * each cell contribute: the two samples of each of those rows are averaged
 * with rounding and the two row results are then averaged with rounding.
 *
 * Unlike the other dyadic helpers in this file, the last argument is the
 * number of DESTINATION rows; three source rows are consumed per output row.
 *
 * \param pDst        first sample of the destination plane
 * \param kiDstStride offset in bytes between consecutive destination rows
 * \param pSrc        first sample of the source plane
 * \param kiSrcStride offset in bytes between consecutive source rows
 * \param kiSrcWidth  source width; (kiSrcWidth / 3) columns are produced
 * \param kiDstHeight number of destination rows to produce
 */
void DyadicBilinearOneThirdDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
    uint8_t* pSrc, const int32_t kiSrcStride,
    const int32_t kiSrcWidth, const int32_t kiDstHeight) {
  const int32_t kiOutCols = kiSrcWidth / 3;
  const uint8_t* pUpper = pSrc;   // first source row of the current cell row
  uint8_t* pOut = pDst;           // destination row being written

  for (int32_t iRow = 0; iRow < kiDstHeight; ++iRow) {
    for (int32_t iCol = 0; iCol < kiOutCols; ++iCol) {
      const int32_t kiLeft = 3 * iCol;               // left column of the 3x3 cell
      const int32_t kiBelow = kiLeft + kiSrcStride;  // same column, one row down
      const int32_t kiUpperAvg = (pUpper[kiLeft] + pUpper[kiLeft + 1] + 1) >> 1;
      const int32_t kiLowerAvg = (pUpper[kiBelow] + pUpper[kiBelow + 1] + 1) >> 1;

      pOut[iCol] = (uint8_t) ((kiUpperAvg + kiLowerAvg + 1) >> 1);
    }
    pOut += kiDstStride;
    pUpper += 3 * kiSrcStride;
  }
}
117 | | |
118 | | void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, |
119 | | const int32_t kiDstHeight, |
120 | 0 | uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { |
121 | 0 | const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15; |
122 | 0 | const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight); |
123 | 0 | int32_t fScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth); |
124 | 0 | int32_t fScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight); |
125 | 0 | uint32_t x; |
126 | 0 | int32_t iYInverse, iXInverse; |
127 | |
|
128 | 0 | uint8_t* pByDst = pDst; |
129 | 0 | uint8_t* pByLineDst = pDst; |
130 | |
|
131 | 0 | iYInverse = 1 << (kuiScaleBitHeight - 1); |
132 | 0 | for (int32_t i = 0; i < kiDstHeight - 1; i++) { |
133 | 0 | int32_t iYy = iYInverse >> kuiScaleBitHeight; |
134 | 0 | int32_t fv = iYInverse & (kuiScaleHeight - 1); |
135 | |
|
136 | 0 | uint8_t* pBySrc = pSrc + iYy * kiSrcStride; |
137 | |
|
138 | 0 | pByDst = pByLineDst; |
139 | 0 | iXInverse = 1 << (kuiScaleBitWidth - 1); |
140 | 0 | for (int32_t j = 0; j < kiDstWidth - 1; j++) { |
141 | 0 | int32_t iXx = iXInverse >> kuiScaleBitWidth; |
142 | 0 | int32_t iFu = iXInverse & (kuiScaleWidth - 1); |
143 | |
|
144 | 0 | uint8_t* pByCurrent = pBySrc + iXx; |
145 | 0 | uint8_t a, b, c, d; |
146 | |
|
147 | 0 | a = *pByCurrent; |
148 | 0 | b = * (pByCurrent + 1); |
149 | 0 | c = * (pByCurrent + kiSrcStride); |
150 | 0 | d = * (pByCurrent + kiSrcStride + 1); |
151 | |
|
152 | 0 | x = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a; |
153 | 0 | x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b; |
154 | 0 | x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c; |
155 | 0 | x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d; |
156 | 0 | x >>= (kuiScaleBitHeight - 1); |
157 | 0 | x += 1; |
158 | 0 | x >>= 1; |
159 | | //x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c + |
160 | | // ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG); |
161 | 0 | x = WELS_CLAMP (x, 0, 255); |
162 | 0 | *pByDst++ = (uint8_t)x; |
163 | |
|
164 | 0 | iXInverse += fScalex; |
165 | 0 | } |
166 | 0 | *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth)); |
167 | 0 | pByLineDst += kiDstStride; |
168 | 0 | iYInverse += fScaley; |
169 | 0 | } |
170 | | |
171 | | // last row special |
172 | 0 | { |
173 | 0 | int32_t iYy = iYInverse >> kuiScaleBitHeight; |
174 | 0 | uint8_t* pBySrc = pSrc + iYy * kiSrcStride; |
175 | |
|
176 | 0 | pByDst = pByLineDst; |
177 | 0 | iXInverse = 1 << (kuiScaleBitWidth - 1); |
178 | 0 | for (int32_t j = 0; j < kiDstWidth; j++) { |
179 | 0 | int32_t iXx = iXInverse >> kuiScaleBitWidth; |
180 | 0 | *pByDst++ = * (pBySrc + iXx); |
181 | |
|
182 | 0 | iXInverse += fScalex; |
183 | 0 | } |
184 | 0 | } |
185 | 0 | } |
186 | | |
187 | | void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, |
188 | | const int32_t kiDstHeight, |
189 | 0 | uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) { |
190 | 0 | const int32_t kiScaleBit = 15; |
191 | 0 | const int32_t kiScale = (1 << kiScaleBit); |
192 | 0 | int32_t iScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kiScale); |
193 | 0 | int32_t iScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kiScale); |
194 | 0 | int64_t x; |
195 | 0 | int32_t iYInverse, iXInverse; |
196 | |
|
197 | 0 | uint8_t* pByDst = pDst; |
198 | 0 | uint8_t* pByLineDst = pDst; |
199 | |
|
200 | 0 | iYInverse = 1 << (kiScaleBit - 1); |
201 | 0 | for (int32_t i = 0; i < kiDstHeight - 1; i++) { |
202 | 0 | int32_t iYy = iYInverse >> kiScaleBit; |
203 | 0 | int32_t iFv = iYInverse & (kiScale - 1); |
204 | |
|
205 | 0 | uint8_t* pBySrc = pSrc + iYy * kiSrcStride; |
206 | |
|
207 | 0 | pByDst = pByLineDst; |
208 | 0 | iXInverse = 1 << (kiScaleBit - 1); |
209 | 0 | for (int32_t j = 0; j < kiDstWidth - 1; j++) { |
210 | 0 | int32_t iXx = iXInverse >> kiScaleBit; |
211 | 0 | int32_t iFu = iXInverse & (kiScale - 1); |
212 | |
|
213 | 0 | uint8_t* pByCurrent = pBySrc + iXx; |
214 | 0 | uint8_t a, b, c, d; |
215 | |
|
216 | 0 | a = *pByCurrent; |
217 | 0 | b = * (pByCurrent + 1); |
218 | 0 | c = * (pByCurrent + kiSrcStride); |
219 | 0 | d = * (pByCurrent + kiSrcStride + 1); |
220 | |
|
221 | 0 | x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) ( |
222 | 0 | kiScale - 1 - iFu)) * iFv * c + |
223 | 0 | ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit); |
224 | 0 | x = WELS_CLAMP (x, 0, 255); |
225 | 0 | *pByDst++ = (uint8_t)x; |
226 | |
|
227 | 0 | iXInverse += iScalex; |
228 | 0 | } |
229 | 0 | *pByDst = * (pBySrc + (iXInverse >> kiScaleBit)); |
230 | 0 | pByLineDst += kiDstStride; |
231 | 0 | iYInverse += iScaley; |
232 | 0 | } |
233 | | |
234 | | // last row special |
235 | 0 | { |
236 | 0 | int32_t iYy = iYInverse >> kiScaleBit; |
237 | 0 | uint8_t* pBySrc = pSrc + iYy * kiSrcStride; |
238 | |
|
239 | 0 | pByDst = pByLineDst; |
240 | 0 | iXInverse = 1 << (kiScaleBit - 1); |
241 | 0 | for (int32_t j = 0; j < kiDstWidth; j++) { |
242 | 0 | int32_t iXx = iXInverse >> kiScaleBit; |
243 | 0 | *pByDst++ = * (pBySrc + iXx); |
244 | |
|
245 | 0 | iXInverse += iScalex; |
246 | 0 | } |
247 | 0 | } |
248 | 0 | } |
249 | | |
250 | | #if defined(X86_ASM) || defined(HAVE_NEON) || (defined(HAVE_NEON_AARCH64) && defined(__aarch64__)) |
251 | | static void GeneralBilinearDownsamplerWrap (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, |
252 | | const int32_t kiDstHeight, |
253 | | uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight, |
254 | | const int32_t kiScaleBitWidth, const int32_t kiScaleBitHeight, |
255 | | void (*func) (uint8_t* pDst, int32_t iDstStride, int32_t iDstWidth, int32_t iDstHeight, |
256 | | uint8_t* pSrc, int32_t iSrcStride, uint32_t uiScaleX, uint32_t uiScaleY)) { |
257 | | const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight); |
258 | | |
259 | | uint32_t uiScalex = WELS_ROUND ((float)kiSrcWidth / (float)kiDstWidth * kuiScaleWidth); |
260 | | uint32_t uiScaley = WELS_ROUND ((float)kiSrcHeight / (float)kiDstHeight * kuiScaleHeight); |
261 | | |
262 | | func (pDst, kiDstStride, kiDstWidth, kiDstHeight, pSrc, kiSrcStride, uiScalex, uiScaley); |
263 | | } |
264 | | |
// Emits GeneralBilinearFastDownsamplerWrap_<isa>(), which has the same
// parameter list as GeneralBilinearFastDownsampler_c and forwards to
// GeneralBilinearFastDownsampler_<isa> through GeneralBilinearDownsamplerWrap
// using 16 horizontal and 15 vertical fractional scale bits.
#define DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP(isa) \
  void GeneralBilinearFastDownsamplerWrap_ ## isa ( \
      uint8_t* pDst, const int32_t kiDstStride, \
      const int32_t kiDstWidth, const int32_t kiDstHeight, \
      uint8_t* pSrc, const int32_t kiSrcStride, \
      const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
    GeneralBilinearDownsamplerWrap ( \
        pDst, kiDstStride, kiDstWidth, kiDstHeight, \
        pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, \
        16, 15, GeneralBilinearFastDownsampler_ ## isa); \
  }
272 | | |
// Emits GeneralBilinearAccurateDownsamplerWrap_<isa>(), which has the same
// parameter list as GeneralBilinearAccurateDownsampler_c and forwards to
// GeneralBilinearAccurateDownsampler_<isa> through
// GeneralBilinearDownsamplerWrap using 15 fractional scale bits in both
// directions.
#define DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP(isa) \
  void GeneralBilinearAccurateDownsamplerWrap_ ## isa ( \
      uint8_t* pDst, const int32_t kiDstStride, \
      const int32_t kiDstWidth, const int32_t kiDstHeight, \
      uint8_t* pSrc, const int32_t kiSrcStride, \
      const int32_t kiSrcWidth, const int32_t kiSrcHeight) { \
    GeneralBilinearDownsamplerWrap ( \
        pDst, kiDstStride, kiDstWidth, kiDstHeight, \
        pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, \
        15, 15, GeneralBilinearAccurateDownsampler_ ## isa); \
  }
280 | | #endif |
281 | | |
// x86 wrappers. The fast variant is instantiated for SSE2 and SSSE3, the
// accurate variant for SSE2 and SSE4.1; AVX2 versions of both are added only
// when HAVE_AVX2 is defined.
#if defined(X86_ASM)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (ssse3)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (sse41)
#if defined(HAVE_AVX2)
DEFINE_GENERAL_BILINEAR_FAST_DOWNSAMPLER_WRAP (avx2)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (avx2)
#endif // HAVE_AVX2
#endif // X86_ASM

// 32-bit ARM NEON: only the accurate variant is wrapped here.
#if defined(HAVE_NEON)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (neon)
#endif // HAVE_NEON

// AArch64 NEON: only the accurate variant is wrapped here.
#if defined(HAVE_NEON_AARCH64) && defined(__aarch64__)
DEFINE_GENERAL_BILINEAR_ACCURATE_DOWNSAMPLER_WRAP (AArch64_neon)
#endif // HAVE_NEON_AARCH64 && __aarch64__
300 | | WELSVP_NAMESPACE_END |