/src/openh264/codec/common/src/mc.cpp
Line | Count | Source |
1 | | /*! |
2 | | * \copy |
3 | | * Copyright (c) 2009-2013, Cisco Systems |
4 | | * All rights reserved. |
5 | | * |
6 | | * Redistribution and use in source and binary forms, with or without |
7 | | * modification, are permitted provided that the following conditions |
8 | | * are met: |
9 | | * |
10 | | * * Redistributions of source code must retain the above copyright |
11 | | * notice, this list of conditions and the following disclaimer. |
12 | | * |
13 | | * * Redistributions in binary form must reproduce the above copyright |
14 | | * notice, this list of conditions and the following disclaimer in |
15 | | * the documentation and/or other materials provided with the |
16 | | * distribution. |
17 | | * |
18 | | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
19 | | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
20 | | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
21 | | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
22 | | * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
23 | | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
24 | | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
25 | | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
27 | | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN |
28 | | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | | * POSSIBILITY OF SUCH DAMAGE. |
30 | | * |
31 | | * |
32 | | * \file mc.cpp |
33 | | * |
34 | | * \brief Interfaces implementation for motion compensation |
35 | | * |
36 | | * \date 03/17/2009 Created |
37 | | * |
38 | | ************************************************************************************* |
39 | | */ |
40 | | |
41 | | #include "mc.h" |
42 | | |
43 | | #include "cpu_core.h" |
44 | | #include "ls_defines.h" |
45 | | #include "macros.h" |
46 | | #include "asmdefs_mmi.h" |
47 | | |
48 | | namespace { |
49 | | |
// Pointer to a chroma MC kernel: source/destination planes with their strides, a pointer to the
// interpolation weights (presumably one 4-entry row of g_kuiABCD -- confirm at the call sites)
// and the number of rows to produce.
typedef void (*PMcChromaWidthExtFunc) (const uint8_t* pSrc, int32_t iSrcStride,
                                       uint8_t* pDst, int32_t iDstStride,
                                       const uint8_t* kpABCD, int32_t iHeight);

// Pointer to a fixed-width averaging kernel. The parameters are unnamed; the PixelAvgWidthEq*
// calls in this file pass (dst, dst stride, src A, stride A, src B, stride B, height) in this
// order, which is presumably the intended meaning -- confirm.
typedef void (*PWelsSampleWidthAveragingFunc) (uint8_t*, int32_t,
                                               const uint8_t*, int32_t,
                                               const uint8_t*, int32_t,
                                               int32_t);

// Pointer to a generic MC routine taking explicit width and height; this is the element type of
// the dispatch table in McLuma_c.
typedef void (*PWelsMcWidthHeightFunc) (const uint8_t* pSrc, int32_t iSrcStride,
                                        uint8_t* pDst, int32_t iDstStride,
                                        int32_t iWidth, int32_t iHeight);
56 | | |
/*------------------weights for chroma fractional-sample interpolation------------------*/
// Each entry is {iA, iB, iC, iD} for a fractional offset (dx, dy), dx and dy in 0..7:
//   iA = (8 - dx) * (8 - dy)
//   iB = dx * (8 - dy)
//   iC = (8 - dx) * dy
//   iD = dx * dy
// The four weights of an entry always add up to 64. Indexing is g_kuiABCD[dy][dx].
static const uint8_t g_kuiABCD[8][8][4] = {
  { // dy = 0
    {64,  0,  0,  0}, {56,  8,  0,  0}, {48, 16,  0,  0}, {40, 24,  0,  0},
    {32, 32,  0,  0}, {24, 40,  0,  0}, {16, 48,  0,  0}, { 8, 56,  0,  0}
  },
  { // dy = 1
    {56,  0,  8,  0}, {49,  7,  7,  1}, {42, 14,  6,  2}, {35, 21,  5,  3},
    {28, 28,  4,  4}, {21, 35,  3,  5}, {14, 42,  2,  6}, { 7, 49,  1,  7}
  },
  { // dy = 2
    {48,  0, 16,  0}, {42,  6, 14,  2}, {36, 12, 12,  4}, {30, 18, 10,  6},
    {24, 24,  8,  8}, {18, 30,  6, 10}, {12, 36,  4, 12}, { 6, 42,  2, 14}
  },
  { // dy = 3
    {40,  0, 24,  0}, {35,  5, 21,  3}, {30, 10, 18,  6}, {25, 15, 15,  9},
    {20, 20, 12, 12}, {15, 25,  9, 15}, {10, 30,  6, 18}, { 5, 35,  3, 21}
  },
  { // dy = 4
    {32,  0, 32,  0}, {28,  4, 28,  4}, {24,  8, 24,  8}, {20, 12, 20, 12},
    {16, 16, 16, 16}, {12, 20, 12, 20}, { 8, 24,  8, 24}, { 4, 28,  4, 28}
  },
  { // dy = 5
    {24,  0, 40,  0}, {21,  3, 35,  5}, {18,  6, 30, 10}, {15,  9, 25, 15},
    {12, 12, 20, 20}, { 9, 15, 15, 25}, { 6, 18, 10, 30}, { 3, 21,  5, 35}
  },
  { // dy = 6
    {16,  0, 48,  0}, {14,  2, 42,  6}, {12,  4, 36, 12}, {10,  6, 30, 18},
    { 8,  8, 24, 24}, { 6, 10, 18, 30}, { 4, 12, 12, 36}, { 2, 14,  6, 42}
  },
  { // dy = 7
    { 8,  0, 56,  0}, { 7,  1, 49,  7}, { 6,  2, 42, 14}, { 5,  3, 35, 21},
    { 4,  4, 28, 28}, { 3,  5, 21, 35}, { 2,  6, 14, 42}, { 1,  7,  7, 49}
  }
};
96 | | |
97 | | //***************************************************************************// |
98 | | // C code implementation // |
99 | | //***************************************************************************// |
100 | | static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
101 | 0 | int32_t iHeight) { |
102 | 0 | int32_t i; |
103 | 0 | for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma |
104 | 0 | ST16A2 (pDst, LD16 (pSrc)); |
105 | 0 | pDst += iDstStride; |
106 | 0 | pSrc += iSrcStride; |
107 | 0 | } |
108 | 0 | } |
109 | | |
110 | | static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
111 | 0 | int32_t iHeight) { |
112 | 0 | int32_t i; |
113 | 0 | for (i = 0; i < iHeight; i++) { |
114 | 0 | ST32A4 (pDst, LD32 (pSrc)); |
115 | 0 | pDst += iDstStride; |
116 | 0 | pSrc += iSrcStride; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | | static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
121 | 0 | int32_t iHeight) { |
122 | 0 | int32_t i; |
123 | 0 | for (i = 0; i < iHeight; i++) { |
124 | 0 | ST64A8 (pDst, LD64 (pSrc)); |
125 | 0 | pDst += iDstStride; |
126 | 0 | pSrc += iSrcStride; |
127 | 0 | } |
128 | 0 | } |
129 | | |
130 | | static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
131 | 0 | int32_t iHeight) { |
132 | 0 | int32_t i; |
133 | 0 | for (i = 0; i < iHeight; i++) { |
134 | 0 | ST64A8 (pDst , LD64 (pSrc)); |
135 | 0 | ST64A8 (pDst + 8, LD64 (pSrc + 8)); |
136 | 0 | pDst += iDstStride; |
137 | 0 | pSrc += iSrcStride; |
138 | 0 | } |
139 | 0 | } |
140 | | |
141 | | //--------------------Luma sample MC------------------// |
142 | | |
// Applies the symmetric 6-tap filter (1, -5, 20, 20, -5, 1) to the six consecutive 16-bit
// values pSrc[0..5] and returns the unscaled, unrounded sum.
static inline int32_t HorFilterInput16bit_c (const int16_t* pSrc) {
  const int32_t kiOuter  = pSrc[0] + pSrc[5];   // taps weighted   1
  const int32_t kiMiddle = pSrc[1] + pSrc[4];   // taps weighted  -5
  const int32_t kiInner  = pSrc[2] + pSrc[3];   // taps weighted  20
  return kiOuter - 5 * kiMiddle + 20 * kiInner;
}
// Applies the symmetric 6-tap filter (1, -5, 20, 20, -5, 1) to six 8-bit samples spaced
// kiOffset bytes apart and returns the unscaled, unrounded sum (range -2550 .. 10710).
//   pSrc     - the third of the six samples; reads pSrc - 2 * kiOffset .. pSrc + 3 * kiOffset
//   kiOffset - distance between taps: 1 for horizontal filtering, iSrcStride for vertical
// The sum is built in signed arithmetic: the result is negative whenever the -5 taps dominate,
// and an unsigned intermediate would wrap there and depend on implementation-defined
// unsigned-to-signed conversion. The tap offsets use multiplication rather than a left shift so
// that a negative kiOffset does not shift a negative value.
static inline int32_t FilterInput8bitWithStride_c (const uint8_t* pSrc, const int32_t kiOffset) {
  const int32_t kiOffset2 = kiOffset * 2;
  const int32_t kiOffset3 = kiOffset * 3;
  const int32_t kiPix05 = pSrc[-kiOffset2] + pSrc[kiOffset3];   // taps weighted   1
  const int32_t kiPix14 = pSrc[-kiOffset]  + pSrc[kiOffset2];   // taps weighted  -5
  const int32_t kiPix23 = pSrc[0]          + pSrc[kiOffset];    // taps weighted  20

  return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
}
161 | | |
// Writes the rounded average (a + b + 1) >> 1 of two iWidth x iHeight blocks into pDst.
// Each of the three planes advances by its own stride after every row.
static inline void PixelAvg_c (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
                               const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) {
  for (int32_t iRow = 0; iRow < iHeight; ++iRow) {
    for (int32_t iCol = 0; iCol < iWidth; ++iCol) {
      const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol];
      pDst[iCol] = (uint8_t) ((kiSum + 1) >> 1);
    }
    pSrcA += iSrcAStride;
    pSrcB += iSrcBStride;
    pDst  += iDstStride;
  }
}
174 | | static inline void McCopy_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, |
175 | 0 | int32_t iHeight) { |
176 | 0 | if (iWidth == 16) |
177 | 0 | McCopyWidthEq16_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
178 | 0 | else if (iWidth == 8) |
179 | 0 | McCopyWidthEq8_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
180 | 0 | else if (iWidth == 4) |
181 | 0 | McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
182 | 0 | else //here iWidth == 2 |
183 | 0 | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
184 | 0 | } |
185 | | |
186 | | //horizontal filter to gain half sample, that is (2, 0) location in quarter sample |
187 | | static inline void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
188 | | int32_t iWidth, |
189 | 0 | int32_t iHeight) { |
190 | 0 | int32_t i, j; |
191 | 0 | for (i = 0; i < iHeight; i++) { |
192 | 0 | for (j = 0; j < iWidth; j++) { |
193 | 0 | pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, 1) + 16) >> 5); |
194 | 0 | } |
195 | 0 | pDst += iDstStride; |
196 | 0 | pSrc += iSrcStride; |
197 | 0 | } |
198 | 0 | } |
199 | | |
200 | | //vertical filter to gain half sample, that is (0, 2) location in quarter sample |
201 | | static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
202 | | int32_t iWidth, |
203 | 0 | int32_t iHeight) { |
204 | 0 | int32_t i, j; |
205 | 0 | for (i = 0; i < iHeight; i++) { |
206 | 0 | for (j = 0; j < iWidth; j++) { |
207 | 0 | pDst[j] = WelsClip1 ((FilterInput8bitWithStride_c (pSrc + j, iSrcStride) + 16) >> 5); |
208 | 0 | } |
209 | 0 | pDst += iDstStride; |
210 | 0 | pSrc += iSrcStride; |
211 | 0 | } |
212 | 0 | } |
213 | | |
214 | | //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample |
215 | | static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
216 | | int32_t iWidth, |
217 | 0 | int32_t iHeight) { |
218 | 0 | int16_t iTmp[17 + 5]; |
219 | 0 | int32_t i, j, k; |
220 | |
|
221 | 0 | for (i = 0; i < iHeight; i++) { |
222 | 0 | for (j = 0; j < iWidth + 5; j++) { |
223 | 0 | iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride); |
224 | 0 | } |
225 | 0 | for (k = 0; k < iWidth; k++) { |
226 | 0 | pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10); |
227 | 0 | } |
228 | 0 | pSrc += iSrcStride; |
229 | 0 | pDst += iDstStride; |
230 | 0 | } |
231 | 0 | } |
232 | | |
233 | | /////////////////////luma MC////////////////////////// |
234 | | static inline void McHorVer01_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
235 | | int32_t iWidth, |
236 | 0 | int32_t iHeight) { |
237 | 0 | uint8_t uiTmp[256]; |
238 | 0 | McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); |
239 | 0 | PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); |
240 | 0 | } |
241 | | static inline void McHorVer03_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
242 | | int32_t iWidth, |
243 | 0 | int32_t iHeight) { |
244 | 0 | uint8_t uiTmp[256]; |
245 | 0 | McHorVer02_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); |
246 | 0 | PixelAvg_c (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, uiTmp, 16, iWidth, iHeight); |
247 | 0 | } |
248 | | static inline void McHorVer10_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
249 | | int32_t iWidth, |
250 | 0 | int32_t iHeight) { |
251 | 0 | uint8_t uiTmp[256]; |
252 | 0 | McHorVer20_c (pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); |
253 | 0 | PixelAvg_c (pDst, iDstStride, pSrc, iSrcStride, uiTmp, 16, iWidth, iHeight); |
254 | 0 | } |
255 | | static inline void McHorVer11_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
256 | | int32_t iWidth, |
257 | 0 | int32_t iHeight) { |
258 | 0 | uint8_t uiHorTmp[256]; |
259 | 0 | uint8_t uiVerTmp[256]; |
260 | 0 | McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
261 | 0 | McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); |
262 | 0 | PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); |
263 | 0 | } |
264 | | static inline void McHorVer12_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
265 | | int32_t iWidth, |
266 | 0 | int32_t iHeight) { |
267 | 0 | uint8_t uiVerTmp[256]; |
268 | 0 | uint8_t uiCtrTmp[256]; |
269 | 0 | McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); |
270 | 0 | McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); |
271 | 0 | PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight); |
272 | 0 | } |
273 | | static inline void McHorVer13_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
274 | | int32_t iWidth, |
275 | 0 | int32_t iHeight) { |
276 | 0 | uint8_t uiHorTmp[256]; |
277 | 0 | uint8_t uiVerTmp[256]; |
278 | 0 | McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
279 | 0 | McHorVer02_c (pSrc, iSrcStride, uiVerTmp, 16, iWidth, iHeight); |
280 | 0 | PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); |
281 | 0 | } |
282 | | static inline void McHorVer21_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
283 | | int32_t iWidth, |
284 | 0 | int32_t iHeight) { |
285 | 0 | uint8_t uiHorTmp[256]; |
286 | 0 | uint8_t uiCtrTmp[256]; |
287 | 0 | McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
288 | 0 | McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); |
289 | 0 | PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight); |
290 | 0 | } |
291 | | static inline void McHorVer23_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
292 | | int32_t iWidth, |
293 | 0 | int32_t iHeight) { |
294 | 0 | uint8_t uiHorTmp[256]; |
295 | 0 | uint8_t uiCtrTmp[256]; |
296 | 0 | McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
297 | 0 | McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); |
298 | 0 | PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiCtrTmp, 16, iWidth, iHeight); |
299 | 0 | } |
300 | | static inline void McHorVer30_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
301 | | int32_t iWidth, |
302 | 0 | int32_t iHeight) { |
303 | 0 | uint8_t uiHorTmp[256]; |
304 | 0 | McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
305 | 0 | PixelAvg_c (pDst, iDstStride, pSrc + 1, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
306 | 0 | } |
307 | | static inline void McHorVer31_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
308 | | int32_t iWidth, |
309 | 0 | int32_t iHeight) { |
310 | 0 | uint8_t uiHorTmp[256]; |
311 | 0 | uint8_t uiVerTmp[256]; |
312 | 0 | McHorVer20_c (pSrc, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
313 | 0 | McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); |
314 | 0 | PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); |
315 | 0 | } |
316 | | static inline void McHorVer32_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
317 | | int32_t iWidth, |
318 | 0 | int32_t iHeight) { |
319 | 0 | uint8_t uiVerTmp[256]; |
320 | 0 | uint8_t uiCtrTmp[256]; |
321 | 0 | McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); |
322 | 0 | McHorVer22_c (pSrc, iSrcStride, uiCtrTmp, 16, iWidth, iHeight); |
323 | 0 | PixelAvg_c (pDst, iDstStride, uiVerTmp, 16, uiCtrTmp, 16, iWidth, iHeight); |
324 | 0 | } |
325 | | static inline void McHorVer33_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
326 | | int32_t iWidth, |
327 | 0 | int32_t iHeight) { |
328 | 0 | uint8_t uiHorTmp[256]; |
329 | 0 | uint8_t uiVerTmp[256]; |
330 | 0 | McHorVer20_c (pSrc + iSrcStride, iSrcStride, uiHorTmp, 16, iWidth, iHeight); |
331 | 0 | McHorVer02_c (pSrc + 1, iSrcStride, uiVerTmp, 16, iWidth, iHeight); |
332 | 0 | PixelAvg_c (pDst, iDstStride, uiHorTmp, 16, uiVerTmp, 16, iWidth, iHeight); |
333 | 0 | } |
334 | | |
335 | | void McLuma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
336 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) |
337 | | //pSrc has been added the offset of mv |
338 | 0 | { |
339 | 0 | static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] |
340 | 0 | {McCopy_c, McHorVer01_c, McHorVer02_c, McHorVer03_c}, |
341 | 0 | {McHorVer10_c, McHorVer11_c, McHorVer12_c, McHorVer13_c}, |
342 | 0 | {McHorVer20_c, McHorVer21_c, McHorVer22_c, McHorVer23_c}, |
343 | 0 | {McHorVer30_c, McHorVer31_c, McHorVer32_c, McHorVer33_c}, |
344 | 0 | }; |
345 | |
|
346 | 0 | pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
347 | 0 | } |
348 | | |
349 | | static inline void McChromaWithFragMv_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
350 | 0 | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
351 | 0 | int32_t i, j; |
352 | 0 | int32_t iA, iB, iC, iD; |
353 | 0 | const uint8_t* pSrcNext = pSrc + iSrcStride; |
354 | 0 | const uint8_t* pABCD = g_kuiABCD[iMvY & 0x07][iMvX & 0x07]; |
355 | 0 | iA = pABCD[0]; |
356 | 0 | iB = pABCD[1]; |
357 | 0 | iC = pABCD[2]; |
358 | 0 | iD = pABCD[3]; |
359 | 0 | for (i = 0; i < iHeight; i++) { |
360 | 0 | for (j = 0; j < iWidth; j++) { |
361 | 0 | pDst[j] = (iA * pSrc[j] + iB * pSrc[j + 1] + iC * pSrcNext[j] + iD * pSrcNext[j + 1] + 32) >> 6; |
362 | 0 | } |
363 | 0 | pDst += iDstStride; |
364 | 0 | pSrc = pSrcNext; |
365 | 0 | pSrcNext += iSrcStride; |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | | void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
370 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) |
371 | | //pSrc has been added the offset of mv |
372 | 0 | { |
373 | 0 | const int32_t kiD8x = iMvX & 0x07; |
374 | 0 | const int32_t kiD8y = iMvY & 0x07; |
375 | 0 | if (0 == kiD8x && 0 == kiD8y) |
376 | 0 | McCopy_c (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
377 | 0 | else |
378 | 0 | McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); |
379 | 0 | } |
380 | | |
381 | | #if defined(X86_ASM) |
382 | | //***************************************************************************// |
383 | | // SSE2 implement // |
384 | | //***************************************************************************// |
385 | | static inline void McHorVer22WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
386 | | int32_t iHeight) { |
387 | | ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16) |
388 | | McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5); |
389 | | McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight); |
390 | | } |
391 | | |
392 | | static inline void McHorVer02WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
393 | | int32_t iHeight) { |
394 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
395 | | McHorVer02WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); |
396 | | } |
397 | | |
398 | | static inline void McHorVer22WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
399 | | int32_t iHeight) { |
400 | | McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
401 | | McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); |
402 | | } |
403 | | |
404 | | void McHorVer20Width5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
405 | | int32_t iWidth, int32_t iHeight) { |
406 | | if (iWidth == 17 || iWidth == 9) |
407 | | McHorVer20Width9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
408 | | else //if (iWidth == 5) |
409 | | McHorVer20Width5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
410 | | } |
411 | | |
412 | | void McHorVer02Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
413 | | int32_t iWidth, int32_t iHeight) { |
414 | | if (iWidth == 16 || iWidth == 8) |
415 | | McHorVer02Height9Or17_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
416 | | else //if (iWidth == 4) |
417 | | McHorVer02Height5_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
418 | | } |
419 | | |
420 | | void McHorVer22Width5Or9Or17Height5Or9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
421 | | int32_t iWidth, int32_t iHeight) { |
422 | | ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16) |
423 | | if (iWidth == 17 || iWidth == 9){ |
424 | | int32_t tmp1 = 2 * (iWidth - 8); |
425 | | McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5); |
426 | | McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight); |
427 | | McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight); |
428 | | } |
429 | | else{ //if(iWidth == 5) |
430 | | int32_t tmp1 = 2 * (iWidth - 4); |
431 | | McHorVer22Width5HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5); |
432 | | McHorVer22Width4VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight); |
433 | | McHorVer22Width4VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 4, iDstStride, 4, iHeight); |
434 | | } |
435 | | |
436 | | } |
437 | | |
438 | | static inline void McCopy_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
439 | | int32_t iWidth, |
440 | | int32_t iHeight) { |
441 | | if (iWidth == 16) |
442 | | McCopyWidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
443 | | else if (iWidth == 8) |
444 | | McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
445 | | else if (iWidth == 4) |
446 | | McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
447 | | else |
448 | | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
449 | | } |
450 | | |
451 | | static inline void McHorVer20_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
452 | | int32_t iWidth, int32_t iHeight) { |
453 | | if (iWidth == 16) |
454 | | McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
455 | | else if (iWidth == 8) |
456 | | McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
457 | | else |
458 | | McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
459 | | } |
460 | | |
461 | | static inline void McHorVer02_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
462 | | int32_t iWidth, int32_t iHeight) { |
463 | | if (iWidth == 16) |
464 | | McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
465 | | else if (iWidth == 8) |
466 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
467 | | else |
468 | | McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); |
469 | | } |
470 | | |
471 | | static inline void McHorVer22_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
472 | | int32_t iWidth, int32_t iHeight) { |
473 | | if (iWidth == 16) |
474 | | McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
475 | | else if (iWidth == 8) |
476 | | McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
477 | | else |
478 | | McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); |
479 | | } |
480 | | |
481 | | static inline void McHorVer01_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
482 | | int32_t iWidth, int32_t iHeight) { |
483 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
484 | | if (iWidth == 16) { |
485 | | McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); |
486 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
487 | | } else if (iWidth == 8) { |
488 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); |
489 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
490 | | } else { |
491 | | McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); |
492 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
493 | | } |
494 | | } |
495 | | static inline void McHorVer03_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
496 | | int32_t iWidth, int32_t iHeight) { |
497 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
498 | | if (iWidth == 16) { |
499 | | McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); |
500 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
501 | | } else if (iWidth == 8) { |
502 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); |
503 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
504 | | } else { |
505 | | McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); |
506 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
507 | | } |
508 | | } |
509 | | static inline void McHorVer10_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
510 | | int32_t iWidth, int32_t iHeight) { |
511 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
512 | | if (iWidth == 16) { |
513 | | McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); |
514 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
515 | | } else if (iWidth == 8) { |
516 | | McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pTmp, 16, iHeight); |
517 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
518 | | } else { |
519 | | McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pTmp, 16, iHeight); |
520 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
521 | | } |
522 | | } |
523 | | static inline void McHorVer11_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
524 | | int32_t iWidth, int32_t iHeight) { |
525 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
526 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
527 | | if (iWidth == 16) { |
528 | | McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
529 | | McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
530 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
531 | | } else if (iWidth == 8) { |
532 | | McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
533 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
534 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
535 | | } else { |
536 | | McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
537 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
538 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
539 | | } |
540 | | } |
541 | | static inline void McHorVer12_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
542 | | int32_t iWidth, int32_t iHeight) { |
543 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
544 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
545 | | if (iWidth == 16) { |
546 | | McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
547 | | McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
548 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
549 | | } else if (iWidth == 8) { |
550 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
551 | | McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
552 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
553 | | } else { |
554 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
555 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
556 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
557 | | } |
558 | | } |
559 | | static inline void McHorVer13_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
560 | | int32_t iWidth, int32_t iHeight) { |
561 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
562 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
563 | | if (iWidth == 16) { |
564 | | McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
565 | | McHorVer02WidthEq16_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
566 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
567 | | } else if (iWidth == 8) { |
568 | | McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
569 | | McHorVer02WidthEq8_sse2 (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
570 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
571 | | } else { |
572 | | McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
573 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4 , iHeight); |
574 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
575 | | } |
576 | | } |
577 | | static inline void McHorVer21_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
578 | | int32_t iWidth, int32_t iHeight) { |
579 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
580 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
581 | | if (iWidth == 16) { |
582 | | McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
583 | | McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
584 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
585 | | } else if (iWidth == 8) { |
586 | | McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
587 | | McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
588 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
589 | | } else { |
590 | | McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
591 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
592 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
593 | | } |
594 | | } |
595 | | static inline void McHorVer23_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
596 | | int32_t iWidth, int32_t iHeight) { |
597 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
598 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
599 | | if (iWidth == 16) { |
600 | | McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
601 | | McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
602 | | PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
603 | | } else if (iWidth == 8) { |
604 | | McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
605 | | McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
606 | | PixelAvgWidthEq8_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
607 | | } else { |
608 | | McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
609 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
610 | | PixelAvgWidthEq4_mmx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
611 | | } |
612 | | } |
613 | | // Luma MC for fractional MV (x=3, y=0): PixelAvg of the source read from pSrc + 1 and the McHorVer20 output. |
614 | | static inline void McHorVer30_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
615 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfH, 256, 16); // McHorVer20 result, written with stride 16 |
616 | |   if (16 == iWidth) { |
617 | |     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHalfH, 16, iHeight); |
618 | |     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHalfH, 16, iHeight); |
619 | |   } else if (8 == iWidth) { |
620 | |     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHalfH, 16, iHeight); |
621 | |     PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHalfH, 16, iHeight); |
622 | |   } else { // every other width takes the 4-wide path |
623 | |     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHalfH, 16, iHeight); |
624 | |     PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrc + 1, iSrcStride, pHalfH, 16, iHeight); |
625 | |   } |
626 | | } |
627 | | // Luma MC for fractional MV (x=3, y=1): PixelAvg of the McHorVer20 output and the McHorVer02 output computed from pSrc + 1. |
628 | | static inline void McHorVer31_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
629 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfH, 256, 16); // McHorVer20 result, written with stride 16 |
630 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfV, 256, 16); // McHorVer02 result, written with stride 16 |
631 | |   if (16 == iWidth) { |
632 | |     McHorVer20WidthEq16_sse2 (pSrc, iSrcStride, pHalfH, 16, iHeight); |
633 | |     McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pHalfV, 16, iHeight); |
634 | |     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHalfH, 16, pHalfV, 16, iHeight); |
635 | |   } else if (8 == iWidth) { |
636 | |     McHorVer20WidthEq8_sse2 (pSrc, iSrcStride, pHalfH, 16, iHeight); |
637 | |     McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pHalfV, 16, iHeight); |
638 | |     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHalfH, 16, pHalfV, 16, iHeight); |
639 | |   } else { // every other width takes the 4-wide path |
640 | |     McHorVer20WidthEq4_mmx (pSrc, iSrcStride, pHalfH, 16, iHeight); |
641 | |     McHorVer02_c (pSrc + 1, iSrcStride, pHalfV, 16, 4, iHeight); |
642 | |     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHalfH, 16, pHalfV, 16, iHeight); |
643 | |   } |
644 | | } |
645 | | // Luma MC for fractional MV (x=3, y=2): PixelAvg of the McHorVer02 output computed from pSrc + 1 and the McHorVer22 output. |
646 | | static inline void McHorVer32_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
647 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfV, 256, 16);  // McHorVer02 result, written with stride 16 |
648 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfHV, 256, 16); // McHorVer22 result, written with stride 16 |
649 | |   if (16 == iWidth) { |
650 | |     McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pHalfV, 16, iHeight); |
651 | |     McHorVer22WidthEq16_sse2 (pSrc, iSrcStride, pHalfHV, 16, iHeight); |
652 | |     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHalfV, 16, pHalfHV, 16, iHeight); |
653 | |   } else if (8 == iWidth) { |
654 | |     McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pHalfV, 16, iHeight); |
655 | |     McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pHalfHV, 16, iHeight); |
656 | |     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHalfV, 16, pHalfHV, 16, iHeight); |
657 | |   } else { // every other width takes the 4-wide path |
658 | |     McHorVer02_c (pSrc + 1, iSrcStride, pHalfV, 16, 4, iHeight); |
659 | |     McHorVer22_c (pSrc, iSrcStride, pHalfHV, 16, 4, iHeight); |
660 | |     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHalfV, 16, pHalfHV, 16, iHeight); |
661 | |   } |
662 | | } |
663 | | // Luma MC for fractional MV (x=3, y=3): PixelAvg of McHorVer20 taken one row down and McHorVer02 computed from pSrc + 1. |
664 | | static inline void McHorVer33_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
665 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfH, 256, 16); // McHorVer20 result, written with stride 16 |
666 | |   ENFORCE_STACK_ALIGN_1D (uint8_t, pHalfV, 256, 16); // McHorVer02 result, written with stride 16 |
667 | |   if (16 == iWidth) { |
668 | |     McHorVer20WidthEq16_sse2 (pSrc + iSrcStride, iSrcStride, pHalfH, 16, iHeight); |
669 | |     McHorVer02WidthEq16_sse2 (pSrc + 1, iSrcStride, pHalfV, 16, iHeight); |
670 | |     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pHalfH, 16, pHalfV, 16, iHeight); |
671 | |   } else if (8 == iWidth) { |
672 | |     McHorVer20WidthEq8_sse2 (pSrc + iSrcStride, iSrcStride, pHalfH, 16, iHeight); |
673 | |     McHorVer02WidthEq8_sse2 (pSrc + 1, iSrcStride, pHalfV, 16, iHeight); |
674 | |     PixelAvgWidthEq8_mmx (pDst, iDstStride, pHalfH, 16, pHalfV, 16, iHeight); |
675 | |   } else { // every other width takes the 4-wide path |
676 | |     McHorVer20WidthEq4_mmx (pSrc + iSrcStride, iSrcStride, pHalfH, 16, iHeight); |
677 | |     McHorVer02_c (pSrc + 1, iSrcStride, pHalfV, 16, 4, iHeight); |
678 | |     PixelAvgWidthEq4_mmx (pDst, iDstStride, pHalfH, 16, pHalfV, 16, iHeight); |
679 | |   } |
680 | | } |
681 | | |
682 | | // Luma MC entry point (SSE2 set): the low two bits of each MV component select the kernel. |
683 | | void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
684 | |                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { // pSrc already includes the MV offset |
685 | |   const int32_t kiFracX = iMvX & 0x03; |
686 | |   const int32_t kiFracY = iMvY & 0x03; |
687 | |   static const PWelsMcWidthHeightFunc kpfLumaMc[4][4] = { // indexed [x][y] |
688 | |     {McCopy_sse2, McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2}, |
689 | |     {McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2}, |
690 | |     {McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2}, |
691 | |     {McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2}, |
692 | |   }; |
693 | |   kpfLumaMc[kiFracX][kiFracY] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
694 | | } |
695 | | |
696 | | // Chroma MC (SSE2 set): zero low-three-bit MV fraction -> plain copy; width 2 -> C routine; otherwise table slot iWidth >> 3. |
697 | | void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
698 | |                     int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
699 | |   static const PMcChromaWidthExtFunc kpfByWidth[2] = { |
700 | |     McChromaWidthEq4_mmx,  // slot 0 |
701 | |     McChromaWidthEq8_sse2  // slot 1 |
702 | |   }; |
703 | |   const int32_t kiFracX = iMvX & 0x07; |
704 | |   const int32_t kiFracY = iMvY & 0x07; |
705 | |   if (0 == kiFracX && 0 == kiFracY) { |
706 | |     McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
707 | |   } else if (2 == iWidth) { |
708 | |     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); |
709 | |   } else { |
710 | |     kpfByWidth[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight); |
711 | |   } |
712 | | } |
713 | | |
714 | | //***************************************************************************// |
715 | | // SSSE3 implementation // |
716 | | //***************************************************************************// |
717 | | |
718 | | // Width dispatcher for pixel averaging: above 8 -> 16-wide SSE2, exactly 8 -> 8-wide MMX, below 8 -> 4-wide MMX. |
719 | | void PixelAvgWidth4Or8Or16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, |
720 | |                                  const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { |
721 | |   if (iWidth > 8) |
722 | |     PixelAvgWidthEq16_sse2 (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
723 | |   else if (8 == iWidth) |
724 | |     PixelAvgWidthEq8_mmx (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
725 | |   else |
726 | |     PixelAvgWidthEq4_mmx (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
727 | | } |
728 | | |
729 | | // Block copy selected by width: 16 -> SSE3 routine, 8 -> MMX routine, 4 -> C routine; |
730 | | // every other width is handed to the 2-wide C routine. |
731 | | void McCopy_sse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
732 | |                   int32_t iWidth, int32_t iHeight) { |
733 | |   if (16 == iWidth)     McCopyWidthEq16_sse3 (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
734 | |   else if (8 == iWidth) McCopyWidthEq8_mmx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
735 | |   else if (4 == iWidth) McCopyWidthEq4_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
736 | |   else                  McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
737 | | } |
738 | | |
739 | | // Luma MC for fractional MV (x=2, y=2): a McHorVer20...U8ToS16 pass fills an int16_t scratch for iHeight + 5 rows, |
740 | | // then a McHorVer02...S16ToU8 pass writes pDst. Widths above 8 are processed as two 8-column halves. |
741 | | void McHorVer22_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
742 | |   ENFORCE_STACK_ALIGN_2D (int16_t, pRows16, 16 + 5, 8, 16); |
743 | |   if (iWidth < 8) { |
744 | |     McHorVer20Width4U8ToS16_ssse3 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
745 | |     McHorVer02Width4S16ToU8_ssse3 (pRows16[0], pDst, iDstStride, iHeight); |
746 | |     return; |
747 | |   } |
748 | |   McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, pRows16[0], sizeof pRows16[0], iHeight + 5); |
749 | |   McHorVer02WidthGe8S16ToU8_ssse3 (pRows16[0], sizeof pRows16[0], pDst, iDstStride, 8, iHeight); |
750 | |   if (iWidth > 8) { // second 8 columns reuse the same scratch |
751 | |     McHorVer20Width8U8ToS16_ssse3 (pSrc + 8, iSrcStride, pRows16[0], sizeof pRows16[0], iHeight + 5); |
752 | |     McHorVer02WidthGe8S16ToU8_ssse3 (pRows16[0], sizeof pRows16[0], pDst + 8, iDstStride, 8, iHeight); |
753 | |   } |
754 | | } |
755 | | |
756 | | // Luma MC for fractional MV (x=0, y=1): PixelAvg of the source block and the McHorVer02_ssse3 output. |
757 | | void McHorVer01_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
758 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
759 | |   McHorVer02_ssse3 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
760 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], |
761 | |                               iWidth, iHeight); |
762 | | } |
763 | | |
764 | | // Luma MC for fractional MV (x=0, y=3): PixelAvg of the source one row down and the McHorVer02_ssse3 output. |
765 | | void McHorVer03_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
766 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
767 | |   McHorVer02_ssse3 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
768 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pHalfV[0], sizeof pHalfV[0], |
769 | |                               iWidth, iHeight); |
770 | | } |
771 | | |
772 | | // Luma MC for fractional MV (x=1, y=0): PixelAvg of the source block and the McHorVer20_ssse3 output. |
773 | | void McHorVer10_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
774 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
775 | |   McHorVer20_ssse3 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
776 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], |
777 | |                               iWidth, iHeight); |
778 | | } |
779 | | |
780 | | // Luma MC for fractional MV (x=1, y=1): PixelAvg of the McHorVer20_ssse3 and McHorVer02_ssse3 outputs. |
781 | | void McHorVer11_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
782 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
783 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
784 | |   McHorVer20_ssse3 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
785 | |   McHorVer02_ssse3 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
786 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
787 | |                               iWidth, iHeight); |
788 | | } |
789 | | |
790 | | // Luma MC for fractional MV (x=1, y=2): PixelAvg of the McHorVer02_ssse3 and McHorVer22_ssse3 outputs. |
791 | | void McHorVer12_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
792 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
793 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
794 | |   McHorVer02_ssse3 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
795 | |   McHorVer22_ssse3 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
796 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfV[0], sizeof pHalfV[0], pHalfHV[0], sizeof pHalfHV[0], |
797 | |                               iWidth, iHeight); |
798 | | } |
799 | | |
800 | | // Luma MC for fractional MV (x=1, y=3): PixelAvg of McHorVer20_ssse3 taken one row down and McHorVer02_ssse3. |
801 | | void McHorVer13_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
802 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
803 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
804 | |   McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
805 | |   McHorVer02_ssse3 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
806 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
807 | |                               iWidth, iHeight); |
808 | | } |
809 | | |
810 | | // Luma MC for fractional MV (x=2, y=1): PixelAvg of the McHorVer20_ssse3 and McHorVer22_ssse3 outputs. |
811 | | void McHorVer21_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
812 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
813 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
814 | |   McHorVer20_ssse3 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
815 | |   McHorVer22_ssse3 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
816 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfHV[0], sizeof pHalfHV[0], |
817 | |                               iWidth, iHeight); |
818 | | } |
819 | | |
820 | | // Luma MC for fractional MV (x=2, y=3): PixelAvg of McHorVer20_ssse3 taken one row down and McHorVer22_ssse3. |
821 | | void McHorVer23_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
822 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
823 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
824 | |   McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
825 | |   McHorVer22_ssse3 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
826 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfHV[0], sizeof pHalfHV[0], |
827 | |                               iWidth, iHeight); |
828 | | } |
829 | | |
830 | | // Luma MC for fractional MV (x=3, y=0): PixelAvg of the source read from pSrc + 1 and the McHorVer20_ssse3 output. |
831 | | void McHorVer30_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
832 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
833 | |   McHorVer20_ssse3 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
834 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
835 | | } |
836 | | |
837 | | // Luma MC for fractional MV (x=3, y=1): PixelAvg of McHorVer20_ssse3 and McHorVer02_ssse3 computed from pSrc + 1. |
838 | | void McHorVer31_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
839 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
840 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
841 | |   McHorVer20_ssse3 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
842 | |   McHorVer02_ssse3 (pSrc + 1, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
843 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
844 | |                               iWidth, iHeight); |
845 | | } |
846 | | |
847 | | // Luma MC for fractional MV (x=3, y=2): PixelAvg of McHorVer02_ssse3 computed from pSrc + 1 and McHorVer22_ssse3. |
848 | | void McHorVer32_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
849 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
850 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
851 | |   McHorVer02_ssse3 (pSrc + 1, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
852 | |   McHorVer22_ssse3 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
853 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfV[0], sizeof pHalfV[0], pHalfHV[0], sizeof pHalfHV[0], |
854 | |                               iWidth, iHeight); |
855 | | } |
856 | | |
857 | | // Luma MC for fractional MV (x=3, y=3): PixelAvg of McHorVer20_ssse3 one row down and McHorVer02_ssse3 from pSrc + 1. |
858 | | void McHorVer33_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
859 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
860 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
861 | |   McHorVer20_ssse3 (pSrc + iSrcStride, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
862 | |   McHorVer02_ssse3 (pSrc + 1, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
863 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
864 | |                               iWidth, iHeight); |
865 | | } |
866 | | |
867 | | // McHorVer22 for widths 5/9/17 (SSSE3): widths up to 5 use the 8-wide U8ToS16 pass plus the 5-wide S16ToU8 pass; larger widths use the 9/17 kernels. |
868 | | void McHorVer22Width5Or9Or17_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
869 | |   ENFORCE_STACK_ALIGN_2D (int16_t, pRows16, 17 + 5, WELS_ALIGN(17, 16 / sizeof (int16_t)), 16) |
870 | |   if (iWidth <= 5) { |
871 | |     McHorVer20Width8U8ToS16_ssse3 (pSrc, iSrcStride, pRows16[0], sizeof pRows16[0], iHeight + 5); |
872 | |     McHorVer02Width5S16ToU8_ssse3 (pRows16[0], sizeof pRows16[0], pDst, iDstStride, iHeight); |
873 | |   } else { |
874 | |     McHorVer20Width9Or17U8ToS16_ssse3 (pSrc, iSrcStride, pRows16[0], sizeof pRows16[0], iWidth, iHeight + 5); |
875 | |     McHorVer02WidthGe8S16ToU8_ssse3 (pRows16[0], sizeof pRows16[0], pDst, iDstStride, iWidth, iHeight); |
876 | |   } |
877 | | } |
878 | | |
879 | | // Luma MC entry point (SSSE3 set): the low two bits of each MV component select the kernel. |
880 | | void McLuma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
881 | |                    int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
882 | |   static const PWelsMcWidthHeightFunc kpfLumaMc[4][4] = { // indexed [iMvX & 3][iMvY & 3] |
883 | |     {McCopy_sse3, McHorVer01_ssse3, McHorVer02_ssse3, McHorVer03_ssse3}, |
884 | |     {McHorVer10_ssse3, McHorVer11_ssse3, McHorVer12_ssse3, McHorVer13_ssse3}, |
885 | |     {McHorVer20_ssse3, McHorVer21_ssse3, McHorVer22_ssse3, McHorVer23_ssse3}, |
886 | |     {McHorVer30_ssse3, McHorVer31_ssse3, McHorVer32_ssse3, McHorVer33_ssse3}, |
887 | |   }; |
888 | |   kpfLumaMc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
889 | | } |
890 | | |
891 | | // Chroma MC (SSSE3 set): zero low-three-bit MV fraction -> McCopy_sse2; width 2 -> C routine; otherwise table slot iWidth >> 3. |
892 | | void McChroma_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
893 | |                      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
894 | |   static const PMcChromaWidthExtFunc kpfByWidth[2] = { |
895 | |     McChromaWidthEq4_mmx,   // slot 0 |
896 | |     McChromaWidthEq8_ssse3  // slot 1 |
897 | |   }; |
898 | |   const int32_t kiFracX = iMvX & 0x07; |
899 | |   const int32_t kiFracY = iMvY & 0x07; |
900 | |   if (0 == kiFracX && 0 == kiFracY) { |
901 | |     McCopy_sse2 (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
902 | |   } else if (2 == iWidth) { |
903 | |     McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); |
904 | |   } else { |
905 | |     kpfByWidth[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight); |
906 | |   } |
907 | | } |
908 | | |
909 | | //***************************************************************************// |
910 | | // AVX2 implementation // |
911 | | //***************************************************************************// |
912 | | |
913 | | #ifdef HAVE_AVX2 |
914 | | |
915 | | // Luma MC for fractional MV (x=2, y=2), AVX2: a U8ToS16 pass fills an int16_t scratch for iHeight + 5 rows, then an S16ToU8 pass writes pDst. |
916 | | void McHorVer22_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
917 | |   ENFORCE_STACK_ALIGN_2D (int16_t, pRows16, 16 + 5, 16, 32); |
918 | |   if (iWidth > 8) { |
919 | |     McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
920 | |     McHorVer02Width16Or17S16ToU8_avx2 (pRows16[0], sizeof pRows16[0], pDst, iDstStride, iWidth, iHeight); |
921 | |   } else if (8 == iWidth) { |
922 | |     McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
923 | |     McHorVer02Width8S16ToU8_avx2 (pRows16[0], pDst, iDstStride, iHeight); |
924 | |   } else { // widths below 8 |
925 | |     McHorVer20Width4U8ToS16_avx2 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
926 | |     McHorVer02Width4S16ToU8_avx2 (pRows16[0], pDst, iDstStride, iHeight); |
927 | |   } |
928 | | } |
929 | | |
930 | | // Luma MC for fractional MV (x=0, y=1): PixelAvg of the source block and the McHorVer02_avx2 output. |
931 | | void McHorVer01_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
932 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
933 | |   McHorVer02_avx2 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
934 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], |
935 | |                               iWidth, iHeight); |
936 | | } |
937 | | |
938 | | // Luma MC for fractional MV (x=0, y=3): PixelAvg of the source one row down and the McHorVer02_avx2 output. |
939 | | void McHorVer03_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
940 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
941 | |   McHorVer02_avx2 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
942 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pHalfV[0], sizeof pHalfV[0], |
943 | |                               iWidth, iHeight); |
944 | | } |
945 | | |
946 | | // Luma MC for fractional MV (x=1, y=0): PixelAvg of the source block and the McHorVer20_avx2 output. |
947 | | void McHorVer10_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
948 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
949 | |   McHorVer20_avx2 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
950 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], |
951 | |                               iWidth, iHeight); |
952 | | } |
953 | | |
954 | | // Luma MC for fractional MV (x=1, y=1): PixelAvg of the McHorVer20_avx2 and McHorVer02_avx2 outputs. |
955 | | void McHorVer11_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
956 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
957 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
958 | |   McHorVer20_avx2 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
959 | |   McHorVer02_avx2 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
960 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
961 | |                               iWidth, iHeight); |
962 | | } |
963 | | |
964 | | // Luma MC for fractional MV (x=1, y=2): PixelAvg of the McHorVer02_avx2 and McHorVer22_avx2 outputs. |
965 | | void McHorVer12_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
966 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
967 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
968 | |   McHorVer02_avx2 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
969 | |   McHorVer22_avx2 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
970 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfV[0], sizeof pHalfV[0], pHalfHV[0], sizeof pHalfHV[0], |
971 | |                               iWidth, iHeight); |
972 | | } |
973 | | |
974 | | // Luma MC for fractional MV (x=1, y=3): PixelAvg of McHorVer20_avx2 taken one row down and McHorVer02_avx2. |
975 | | void McHorVer13_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
976 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
977 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
978 | |   McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
979 | |   McHorVer02_avx2 (pSrc, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
980 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
981 | |                               iWidth, iHeight); |
982 | | } |
983 | | |
984 | | // Luma MC for fractional MV (x=2, y=1): PixelAvg of the McHorVer20_avx2 and McHorVer22_avx2 outputs. |
985 | | void McHorVer21_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
986 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
987 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
988 | |   McHorVer20_avx2 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
989 | |   McHorVer22_avx2 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
990 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfHV[0], sizeof pHalfHV[0], |
991 | |                               iWidth, iHeight); |
992 | | } |
993 | | |
994 | | // Luma MC for fractional MV (x=2, y=3): PixelAvg of McHorVer20_avx2 taken one row down and McHorVer22_avx2. |
995 | | void McHorVer23_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
996 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
997 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
998 | |   McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
999 | |   McHorVer22_avx2 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
1000 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfHV[0], sizeof pHalfHV[0], |
1001 | |                               iWidth, iHeight); |
1002 | | } |
1003 | | |
1004 | | // Luma MC for fractional MV (x=3, y=0): PixelAvg of the source read from pSrc + 1 and the McHorVer20_avx2 output. |
1005 | | void McHorVer30_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1006 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
1007 | |   McHorVer20_avx2 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
1008 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pSrc + 1, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
1009 | | } |
1010 | | |
1011 | | // Luma MC for fractional MV (x=3, y=1): PixelAvg of McHorVer20_avx2 and McHorVer02_avx2 computed from pSrc + 1. |
1012 | | void McHorVer31_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1013 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
1014 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
1015 | |   McHorVer20_avx2 (pSrc, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
1016 | |   McHorVer02_avx2 (pSrc + 1, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
1017 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
1018 | |                               iWidth, iHeight); |
1019 | | } |
1020 | | |
1021 | | // Luma MC for fractional MV (x=3, y=2): PixelAvg of McHorVer02_avx2 computed from pSrc + 1 and McHorVer22_avx2. |
1022 | | void McHorVer32_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1023 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
1024 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfHV, 16, 16, 16); |
1025 | |   McHorVer02_avx2 (pSrc + 1, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
1026 | |   McHorVer22_avx2 (pSrc, iSrcStride, pHalfHV[0], sizeof pHalfHV[0], iWidth, iHeight); |
1027 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfV[0], sizeof pHalfV[0], pHalfHV[0], sizeof pHalfHV[0], |
1028 | |                               iWidth, iHeight); |
1029 | | } |
1030 | | |
1031 | | // Luma MC for fractional MV (x=3, y=3): PixelAvg of McHorVer20_avx2 one row down and McHorVer02_avx2 from pSrc + 1. |
1032 | | void McHorVer33_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1033 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfH, 16, 16, 16); |
1034 | |   ENFORCE_STACK_ALIGN_2D (uint8_t, pHalfV, 16, 16, 16); |
1035 | |   McHorVer20_avx2 (pSrc + iSrcStride, iSrcStride, pHalfH[0], sizeof pHalfH[0], iWidth, iHeight); |
1036 | |   McHorVer02_avx2 (pSrc + 1, iSrcStride, pHalfV[0], sizeof pHalfV[0], iWidth, iHeight); |
1037 | |   PixelAvgWidth4Or8Or16_sse2 (pDst, iDstStride, pHalfH[0], sizeof pHalfH[0], pHalfV[0], sizeof pHalfV[0], |
1038 | |                               iWidth, iHeight); |
1039 | | } |
1040 | | |
1041 | | // McHorVer22 for widths 5/9/17 (AVX2): each width class declares its own int16_t scratch and runs a U8ToS16 pass then an S16ToU8 pass. |
1042 | | void McHorVer22Width5Or9Or17_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1043 | |   if (iWidth > 9) { |
1044 | |     ENFORCE_STACK_ALIGN_2D (int16_t, pRows16, 17 + 5, WELS_ALIGN(17, 32 / sizeof (int16_t)), 32) |
1045 | |     McHorVer20Width17U8ToS16_avx2 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
1046 | |     McHorVer02Width16Or17S16ToU8_avx2 (pRows16[0], sizeof pRows16[0], pDst, iDstStride, iWidth, iHeight); |
1047 | |   } else if (9 == iWidth) { |
1048 | |     ENFORCE_STACK_ALIGN_2D (int16_t, pRows16, 17 + 5, 16, 32) |
1049 | |     McHorVer20Width16U8ToS16_avx2 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
1050 | |     McHorVer02Width9S16ToU8_avx2 (pRows16[0], pDst, iDstStride, iHeight); |
1051 | |   } else { // widths below 9 |
1052 | |     ENFORCE_STACK_ALIGN_2D (int16_t, pRows16, 9 + 5, WELS_ALIGN(5, 16 / sizeof (int16_t)), 16) |
1053 | |     McHorVer20Width8U8ToS16_avx2 (pSrc, iSrcStride, pRows16[0], iHeight + 5); |
1054 | |     McHorVer02Width5S16ToU8_avx2 (pRows16[0], pDst, iDstStride, iHeight); |
1055 | |   } |
1056 | | } |
1057 | | |
1058 | | // Luma MC entry point (AVX2 set): the low two bits of each MV component select the kernel. |
1059 | | void McLuma_avx2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1060 | |                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
1061 | |   static const PWelsMcWidthHeightFunc kpfLumaMc[4][4] = { // indexed [iMvX & 3][iMvY & 3] |
1062 | |     {McCopy_sse3, McHorVer01_avx2, McHorVer02_avx2, McHorVer03_avx2}, |
1063 | |     {McHorVer10_avx2, McHorVer11_avx2, McHorVer12_avx2, McHorVer13_avx2}, |
1064 | |     {McHorVer20_avx2, McHorVer21_avx2, McHorVer22_avx2, McHorVer23_avx2}, |
1065 | |     {McHorVer30_avx2, McHorVer31_avx2, McHorVer32_avx2, McHorVer33_avx2}, |
1066 | |   }; |
1067 | |   kpfLumaMc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
1068 | | } |
1069 | | |
1070 | | #endif //HAVE_AVX2 |
1071 | | |
1072 | | // Averaging dispatcher: table slot iWidth >> 4 selects the 8-wide MMX routine (slot 0) |
1073 | | // or the 16-wide SSE2 routine (slot 1). |
1074 | | void PixelAvg_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, |
1075 | |                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { |
1076 | |   static const PWelsSampleWidthAveragingFunc kpfAvgByWidth[2] = {PixelAvgWidthEq8_mmx, PixelAvgWidthEq16_sse2}; |
1077 | |   const int32_t kiSlot = iWidth >> 4; |
1078 | |   kpfAvgByWidth[kiSlot] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
1079 | | } |
1080 | | |
1081 | | #endif //X86_ASM |
1082 | | //***************************************************************************// |
1083 | | // NEON implementation // |
1084 | | //***************************************************************************// |
1085 | | #if defined(HAVE_NEON) |
1086 | | // Width dispatcher for the 17/9/5-wide McHorVer20 NEON kernels; any width other than 17 or 9 |
1087 | | // goes to the 5-wide kernel. |
1088 | | void McHorVer20Width5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1089 | |   switch (iWidth) { |
1090 | |   case 17: McHorVer20Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1091 | |   case 9:  McHorVer20Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1092 | |   default: McHorVer20Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; // i.e. iWidth == 5 |
1093 | |   } |
1094 | | } |
1095 | | // Dispatcher for the Height17/Height9/Height5 McHorVer02 NEON kernels. Selection is made on iWidth: |
1096 | | // 16 -> Height17, 8 -> Height9, anything else -> Height5. |
1097 | | void McHorVer02Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1098 | |   switch (iWidth) { |
1099 | |   case 16: McHorVer02Height17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1100 | |   case 8:  McHorVer02Height9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1101 | |   default: McHorVer02Height5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; // i.e. iWidth == 4 |
1102 | |   } |
1103 | | } |
1104 | | // Width dispatcher for the 17/9/5-wide McHorVer22 NEON kernels; any width other than 17 or 9 |
1105 | | // goes to the 5-wide kernel. |
1106 | | void McHorVer22Width5Or9Or17Height5Or9Or17_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1107 | |   switch (iWidth) { |
1108 | |   case 17: McHorVer22Width17_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1109 | |   case 9:  McHorVer22Width9_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1110 | |   default: McHorVer22Width5_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; // i.e. iWidth == 5 |
1111 | |   } |
1112 | | } |
1113 | | // Block copy selected by width: 16, 8 and 4 use the NEON routines; |
1114 | | // every other width is handed to the 2-wide C routine. |
1115 | | void McCopy_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1116 | |                   int32_t iWidth, int32_t iHeight) { |
1117 | |   switch (iWidth) { |
1118 | |   case 16: McCopyWidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1119 | |   case 8:  McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1120 | |   case 4:  McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1121 | |   default: McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1122 | |   } |
1123 | | } |
1124 | | // Width dispatcher for the McHorVer20 NEON kernels (16, 8 or 4); no kernel is called for any other width. |
1125 | | void McHorVer20_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1126 | |   switch (iWidth) { |
1127 | |   case 16: McHorVer20WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1128 | |   case 8:  McHorVer20WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1129 | |   case 4:  McHorVer20WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1130 | |   default: break; |
1131 | |   } |
1132 | | } |
1133 | | // Width dispatcher for the McHorVer02 NEON kernels (16, 8 or 4); no kernel is called for any other width. |
1134 | | void McHorVer02_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1135 | |   switch (iWidth) { |
1136 | |   case 16: McHorVer02WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1137 | |   case 8:  McHorVer02WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1138 | |   case 4:  McHorVer02WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1139 | |   default: break; |
1140 | |   } |
1141 | | } |
1142 | | // Width dispatcher for the McHorVer22 NEON kernels (16, 8 or 4); no kernel is called for any other width. |
1143 | | void McHorVer22_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1144 | |   switch (iWidth) { |
1145 | |   case 16: McHorVer22WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1146 | |   case 8:  McHorVer22WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1147 | |   case 4:  McHorVer22WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); break; |
1148 | |   default: break; |
1149 | |   } |
1150 | | } |
1151 | | |
1152 | | void McHorVer01_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1153 | | int32_t iWidth, int32_t iHeight) { |
1154 | | if (iWidth == 16) |
1155 | | McHorVer01WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1156 | | else if (iWidth == 8) |
1157 | | McHorVer01WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1158 | | else if (iWidth == 4) |
1159 | | McHorVer01WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1160 | | } |
1161 | | void McHorVer03_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1162 | | int32_t iWidth, int32_t iHeight) { |
1163 | | if (iWidth == 16) |
1164 | | McHorVer03WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1165 | | else if (iWidth == 8) |
1166 | | McHorVer03WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1167 | | else if (iWidth == 4) |
1168 | | McHorVer03WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1169 | | } |
1170 | | void McHorVer10_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1171 | | int32_t iWidth, int32_t iHeight) { |
1172 | | if (iWidth == 16) |
1173 | | McHorVer10WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1174 | | else if (iWidth == 8) |
1175 | | McHorVer10WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1176 | | else if (iWidth == 4) |
1177 | | McHorVer10WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1178 | | } |
1179 | | void McHorVer11_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1180 | | int32_t iWidth, int32_t iHeight) { |
1181 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1182 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1183 | | if (iWidth == 16) { |
1184 | | McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1185 | | McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1186 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1187 | | } else if (iWidth == 8) { |
1188 | | McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1189 | | McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1190 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1191 | | } else if (iWidth == 4) { |
1192 | | McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1193 | | McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1194 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1195 | | } |
1196 | | } |
1197 | | void McHorVer12_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1198 | | int32_t iWidth, int32_t iHeight) { |
1199 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1200 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1201 | | if (iWidth == 16) { |
1202 | | McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1203 | | McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1204 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); |
1205 | | } else if (iWidth == 8) { |
1206 | | McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1207 | | McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1208 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); |
1209 | | } else if (iWidth == 4) { |
1210 | | McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1211 | | McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1212 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); |
1213 | | } |
1214 | | } |
1215 | | void McHorVer13_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1216 | | int32_t iWidth, int32_t iHeight) { |
1217 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1218 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1219 | | if (iWidth == 16) { |
1220 | | McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1221 | | McHorVer02WidthEq16_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1222 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1223 | | } else if (iWidth == 8) { |
1224 | | McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1225 | | McHorVer02WidthEq8_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1226 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1227 | | } else if (iWidth == 4) { |
1228 | | McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1229 | | McHorVer02WidthEq4_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1230 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1231 | | } |
1232 | | } |
1233 | | void McHorVer21_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1234 | | int32_t iWidth, int32_t iHeight) { |
1235 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1236 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1237 | | if (iWidth == 16) { |
1238 | | McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1239 | | McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1240 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); |
1241 | | } else if (iWidth == 8) { |
1242 | | McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1243 | | McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1244 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); |
1245 | | } else if (iWidth == 4) { |
1246 | | McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1247 | | McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1248 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); |
1249 | | } |
1250 | | } |
1251 | | void McHorVer23_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1252 | | int32_t iWidth, int32_t iHeight) { |
1253 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1254 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1255 | | if (iWidth == 16) { |
1256 | | McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1257 | | McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1258 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); |
1259 | | } else if (iWidth == 8) { |
1260 | | McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1261 | | McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1262 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); |
1263 | | } else if (iWidth == 4) { |
1264 | | McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1265 | | McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1266 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pCtrTmp, iHeight); |
1267 | | } |
1268 | | } |
1269 | | void McHorVer30_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1270 | | int32_t iWidth, int32_t iHeight) { |
1271 | | if (iWidth == 16) |
1272 | | McHorVer30WidthEq16_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1273 | | else if (iWidth == 8) |
1274 | | McHorVer30WidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1275 | | else if (iWidth == 4) |
1276 | | McHorVer30WidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1277 | | } |
1278 | | void McHorVer31_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1279 | | int32_t iWidth, int32_t iHeight) { |
1280 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1281 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1282 | | if (iWidth == 16) { |
1283 | | McHorVer20WidthEq16_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1284 | | McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1285 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1286 | | } else if (iWidth == 8) { |
1287 | | McHorVer20WidthEq8_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1288 | | McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1289 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1290 | | } else if (iWidth == 4) { |
1291 | | McHorVer20WidthEq4_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1292 | | McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1293 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1294 | | } |
1295 | | } |
1296 | | void McHorVer32_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1297 | | int32_t iWidth, int32_t iHeight) { |
1298 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1299 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1300 | | if (iWidth == 16) { |
1301 | | McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1302 | | McHorVer22WidthEq16_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1303 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); |
1304 | | } else if (iWidth == 8) { |
1305 | | McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1306 | | McHorVer22WidthEq8_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1307 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); |
1308 | | } else if (iWidth == 4) { |
1309 | | McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1310 | | McHorVer22WidthEq4_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1311 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pVerTmp, pCtrTmp, iHeight); |
1312 | | } |
1313 | | } |
1314 | | void McHorVer33_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1315 | | int32_t iWidth, int32_t iHeight) { |
1316 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1317 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1318 | | if (iWidth == 16) { |
1319 | | McHorVer20WidthEq16_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1320 | | McHorVer02WidthEq16_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1321 | | PixelAvgWidthEq16_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1322 | | } else if (iWidth == 8) { |
1323 | | McHorVer20WidthEq8_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1324 | | McHorVer02WidthEq8_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1325 | | PixelAvgWidthEq8_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1326 | | } else if (iWidth == 4) { |
1327 | | McHorVer20WidthEq4_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1328 | | McHorVer02WidthEq4_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1329 | | PixelAvgWidthEq4_neon (pDst, iDstStride, pHorTmp, pVerTmp, iHeight); |
1330 | | } |
1331 | | } |
1332 | | |
1333 | | void McLuma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1334 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
1335 | | static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] |
1336 | | {McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon}, |
1337 | | {McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon}, |
1338 | | {McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon}, |
1339 | | {McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon}, |
1340 | | }; |
1341 | | // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); |
1342 | | pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
1343 | | } |
1344 | | void McChroma_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1345 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
1346 | | if (0 == iMvX && 0 == iMvY) { |
1347 | | if (8 == iWidth) |
1348 | | McCopyWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1349 | | else if (iWidth == 4) |
1350 | | McCopyWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1351 | | else //here iWidth == 2 |
1352 | | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1353 | | } else { |
1354 | | const int32_t kiD8x = iMvX & 0x07; |
1355 | | const int32_t kiD8y = iMvY & 0x07; |
1356 | | if (8 == iWidth) |
1357 | | McChromaWidthEq8_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); |
1358 | | else if (4 == iWidth) |
1359 | | McChromaWidthEq4_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); |
1360 | | else //here iWidth == 2 |
1361 | | McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); |
1362 | | } |
1363 | | } |
1364 | | void PixelAvg_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, |
1365 | | const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { |
1366 | | static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = { |
1367 | | PixStrideAvgWidthEq8_neon, |
1368 | | PixStrideAvgWidthEq16_neon |
1369 | | }; |
1370 | | kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
1371 | | } |
1372 | | #endif |
1373 | | #if defined(HAVE_NEON_AARCH64) |
1374 | | void McHorVer20Width5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1375 | | int32_t iWidth, int32_t iHeight) { |
1376 | | if (iWidth == 17) |
1377 | | McHorVer20Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1378 | | else if (iWidth == 9) |
1379 | | McHorVer20Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1380 | | else //if (iWidth == 5) |
1381 | | McHorVer20Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1382 | | } |
1383 | | void McHorVer02Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1384 | | int32_t iWidth, int32_t iHeight) { |
1385 | | if (iWidth == 16) |
1386 | | McHorVer02Height17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1387 | | else if (iWidth == 8) |
1388 | | McHorVer02Height9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1389 | | else //if (iWidth == 4) |
1390 | | McHorVer02Height5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1391 | | } |
1392 | | void McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
1393 | | int32_t iDstStride, |
1394 | | int32_t iWidth, int32_t iHeight) { |
1395 | | if (iWidth == 17) |
1396 | | McHorVer22Width17_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1397 | | else if (iWidth == 9) |
1398 | | McHorVer22Width9_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1399 | | else //if (iWidth == 5) |
1400 | | McHorVer22Width5_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1401 | | } |
1402 | | void McCopy_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1403 | | int32_t iWidth, int32_t iHeight) { |
1404 | | if (16 == iWidth) |
1405 | | McCopyWidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1406 | | else if (8 == iWidth) |
1407 | | McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1408 | | else if (4 == iWidth) |
1409 | | McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1410 | | else |
1411 | | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1412 | | } |
1413 | | void McHorVer20_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1414 | | int32_t iWidth, int32_t iHeight) { |
1415 | | if (iWidth == 16) |
1416 | | McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1417 | | else if (iWidth == 8) |
1418 | | McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1419 | | else if (iWidth == 4) |
1420 | | McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1421 | | } |
1422 | | void McHorVer02_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1423 | | int32_t iWidth, int32_t iHeight) { |
1424 | | if (iWidth == 16) |
1425 | | McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1426 | | else if (iWidth == 8) |
1427 | | McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1428 | | else if (iWidth == 4) |
1429 | | McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1430 | | } |
1431 | | void McHorVer22_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1432 | | int32_t iWidth, int32_t iHeight) { |
1433 | | if (iWidth == 16) |
1434 | | McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1435 | | else if (iWidth == 8) |
1436 | | McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1437 | | else if (iWidth == 4) |
1438 | | McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1439 | | } |
1440 | | |
1441 | | void McHorVer01_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1442 | | int32_t iWidth, int32_t iHeight) { |
1443 | | if (iWidth == 16) |
1444 | | McHorVer01WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1445 | | else if (iWidth == 8) |
1446 | | McHorVer01WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1447 | | else if (iWidth == 4) |
1448 | | McHorVer01WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1449 | | } |
1450 | | void McHorVer03_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1451 | | int32_t iWidth, int32_t iHeight) { |
1452 | | if (iWidth == 16) |
1453 | | McHorVer03WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1454 | | else if (iWidth == 8) |
1455 | | McHorVer03WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1456 | | else if (iWidth == 4) |
1457 | | McHorVer03WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1458 | | } |
1459 | | void McHorVer10_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1460 | | int32_t iWidth, int32_t iHeight) { |
1461 | | if (iWidth == 16) |
1462 | | McHorVer10WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1463 | | else if (iWidth == 8) |
1464 | | McHorVer10WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1465 | | else if (iWidth == 4) |
1466 | | McHorVer10WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1467 | | } |
1468 | | void McHorVer11_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1469 | | int32_t iWidth, int32_t iHeight) { |
1470 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1471 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1472 | | if (iWidth == 16) { |
1473 | | McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1474 | | McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1475 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1476 | | } else if (iWidth == 8) { |
1477 | | McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1478 | | McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1479 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1480 | | } else if (iWidth == 4) { |
1481 | | McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1482 | | McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1483 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1484 | | } |
1485 | | } |
1486 | | void McHorVer12_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1487 | | int32_t iWidth, int32_t iHeight) { |
1488 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1489 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1490 | | if (iWidth == 16) { |
1491 | | McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1492 | | McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1493 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
1494 | | } else if (iWidth == 8) { |
1495 | | McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1496 | | McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1497 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
1498 | | } else if (iWidth == 4) { |
1499 | | McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1500 | | McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1501 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
1502 | | } |
1503 | | } |
1504 | | void McHorVer13_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1505 | | int32_t iWidth, int32_t iHeight) { |
1506 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1507 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1508 | | if (iWidth == 16) { |
1509 | | McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1510 | | McHorVer02WidthEq16_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1511 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1512 | | } else if (iWidth == 8) { |
1513 | | McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1514 | | McHorVer02WidthEq8_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1515 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1516 | | } else if (iWidth == 4) { |
1517 | | McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1518 | | McHorVer02WidthEq4_AArch64_neon (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
1519 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1520 | | } |
1521 | | } |
1522 | | void McHorVer21_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1523 | | int32_t iWidth, int32_t iHeight) { |
1524 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1525 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1526 | | if (iWidth == 16) { |
1527 | | McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1528 | | McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1529 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
1530 | | } else if (iWidth == 8) { |
1531 | | McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1532 | | McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1533 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
1534 | | } else if (iWidth == 4) { |
1535 | | McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1536 | | McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1537 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
1538 | | } |
1539 | | } |
1540 | | void McHorVer23_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1541 | | int32_t iWidth, int32_t iHeight) { |
1542 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1543 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1544 | | if (iWidth == 16) { |
1545 | | McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1546 | | McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1547 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
1548 | | } else if (iWidth == 8) { |
1549 | | McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1550 | | McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1551 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
1552 | | } else if (iWidth == 4) { |
1553 | | McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1554 | | McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1555 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
1556 | | } |
1557 | | } |
1558 | | void McHorVer30_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1559 | | int32_t iWidth, int32_t iHeight) { |
1560 | | if (iWidth == 16) |
1561 | | McHorVer30WidthEq16_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1562 | | else if (iWidth == 8) |
1563 | | McHorVer30WidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1564 | | else if (iWidth == 4) |
1565 | | McHorVer30WidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1566 | | } |
1567 | | void McHorVer31_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1568 | | int32_t iWidth, int32_t iHeight) { |
1569 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1570 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1571 | | if (iWidth == 16) { |
1572 | | McHorVer20WidthEq16_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1573 | | McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1574 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1575 | | } else if (iWidth == 8) { |
1576 | | McHorVer20WidthEq8_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1577 | | McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1578 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1579 | | } else if (iWidth == 4) { |
1580 | | McHorVer20WidthEq4_AArch64_neon (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
1581 | | McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1582 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1583 | | } |
1584 | | } |
1585 | | void McHorVer32_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1586 | | int32_t iWidth, int32_t iHeight) { |
1587 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1588 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
1589 | | if (iWidth == 16) { |
1590 | | McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1591 | | McHorVer22WidthEq16_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1592 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
1593 | | } else if (iWidth == 8) { |
1594 | | McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1595 | | McHorVer22WidthEq8_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1596 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
1597 | | } else if (iWidth == 4) { |
1598 | | McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1599 | | McHorVer22WidthEq4_AArch64_neon (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
1600 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
1601 | | } |
1602 | | } |
1603 | | void McHorVer33_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1604 | | int32_t iWidth, int32_t iHeight) { |
1605 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
1606 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
1607 | | if (iWidth == 16) { |
1608 | | McHorVer20WidthEq16_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1609 | | McHorVer02WidthEq16_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1610 | | PixelAvgWidthEq16_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1611 | | } else if (iWidth == 8) { |
1612 | | McHorVer20WidthEq8_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1613 | | McHorVer02WidthEq8_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1614 | | PixelAvgWidthEq8_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1615 | | } else if (iWidth == 4) { |
1616 | | McHorVer20WidthEq4_AArch64_neon (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
1617 | | McHorVer02WidthEq4_AArch64_neon (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
1618 | | PixelAvgWidthEq4_AArch64_neon (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
1619 | | } |
1620 | | } |
1621 | | |
1622 | | void McLuma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1623 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
1624 | | static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] |
1625 | | {McCopy_AArch64_neon, McHorVer01_AArch64_neon, McHorVer02_AArch64_neon, McHorVer03_AArch64_neon}, |
1626 | | {McHorVer10_AArch64_neon, McHorVer11_AArch64_neon, McHorVer12_AArch64_neon, McHorVer13_AArch64_neon}, |
1627 | | {McHorVer20_AArch64_neon, McHorVer21_AArch64_neon, McHorVer22_AArch64_neon, McHorVer23_AArch64_neon}, |
1628 | | {McHorVer30_AArch64_neon, McHorVer31_AArch64_neon, McHorVer32_AArch64_neon, McHorVer33_AArch64_neon}, |
1629 | | }; |
1630 | | // pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2); |
1631 | | pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
1632 | | } |
1633 | | void McChroma_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
1634 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
1635 | | if (0 == iMvX && 0 == iMvY) { |
1636 | | if (8 == iWidth) |
1637 | | McCopyWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1638 | | else if (iWidth == 4) |
1639 | | McCopyWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1640 | | else //here iWidth == 2 |
1641 | | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
1642 | | } else { |
1643 | | const int32_t kiD8x = iMvX & 0x07; |
1644 | | const int32_t kiD8y = iMvY & 0x07; |
1645 | | if (8 == iWidth) |
1646 | | McChromaWidthEq8_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); |
1647 | | else if (4 == iWidth) |
1648 | | McChromaWidthEq4_AArch64_neon (pSrc, iSrcStride, pDst, iDstStride, (int32_t*) (g_kuiABCD[kiD8y][kiD8x]), iHeight); |
1649 | | else //here iWidth == 2 |
1650 | | McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight); |
1651 | | } |
1652 | | } |
1653 | | void PixelAvg_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, |
1654 | |                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { |
1655 | |   // iWidth >> 4 selects the kernel: slot 0 is the 8-wide routine, slot 1 the 16-wide routine. |
1656 | |   static const PWelsSampleWidthAveragingFunc kpfByWidth[2] = { PixStrideAvgWidthEq8_AArch64_neon, PixStrideAvgWidthEq16_AArch64_neon }; |
1657 | |   const int32_t kiSlot = iWidth >> 4; |
1658 | |   const PWelsSampleWidthAveragingFunc pfAverage = kpfByWidth[kiSlot]; |
1659 | |   pfAverage (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
1660 | | } |
1661 | | #endif |
1662 | | |
1663 | | #if defined(HAVE_MMI) |
// MMI_LOAD_8P(f0, f2, f4, r0)
// Assembly fragment: loads 8 bytes from the address in r0 into f0 with the
// gsldlc1/gsldrc1 (load-left / load-right) pair, then interleaves those bytes
// with f4: the low four bytes end up in f0 and the high four bytes in f2.
// f4 is expected to hold zero (the users in this file either clear it with xor
// or rely on FILTER_HV_W4/FILTER_HV_W8 having cleared it), so the net effect is
// widening 8 source bytes to eight 16-bit lanes spread over f0 and f2.
1664 | | #define MMI_LOAD_8P(f0, f2, f4, r0) \ |
1665 | | "gsldlc1 "#f0", 0x7("#r0") \n\t" \ |
1666 | | "gsldrc1 "#f0", 0x0("#r0") \n\t" \ |
1667 | | "punpckhbh "#f2", "#f0", "#f4" \n\t" \ |
1668 | | "punpcklbh "#f0", "#f0", "#f4" \n\t" |
1669 | | |
// FILTER_HV_W4(f0..f30, r0, r1, r2)
// Assembly fragment applying the 6-tap filter (1, -5, 20, 20, -5, 1) to six
// inputs held as 16-bit lanes in register pairs (low half / high half):
//   A = f0:f2, B = f4:f6, C = f8:f10, D = f12:f14, E = f16:f18, F = f20:f22.
// It computes (A + F + 20*(C + D) - 5*(B + E) + 16) >> 5 (arithmetic shift),
// packs the result to bytes with unsigned saturation and stores the low 4 bytes
// to the address in r0 using the gsswlc1/gsswrc1 (store-left / store-right) pair.
// Register usage:
//   - f0:f2 are overwritten with the result; f24, f26, f30 are scratch.
//   - f28 is scratch and is left zeroed on exit (callers use it as the zero
//     operand of the next MMI_LOAD_8P).
//   - f8 temporarily holds the constants 16, 2 and 5; its original value is
//     parked in integer register r2 and restored by the last instruction.
//   - r1 is an integer scratch register used to materialise those constants.
1670 | | #define FILTER_HV_W4(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \ |
1671 | | f20, f22, f24, f26, f28, f30, r0, r1, r2) \ |
1672 | | "paddh "#f0", "#f0", "#f20" \n\t" \ |
1673 | | "paddh "#f2", "#f2", "#f22" \n\t" \ |
1674 | | "mov.d "#f28", "#f8" \n\t" \ |
1675 | | "mov.d "#f30", "#f10" \n\t" \ |
1676 | | "mov.d "#f24", "#f4" \n\t" \ |
1677 | | "mov.d "#f26", "#f6" \n\t" \ |
1678 | | "dmfc1 "#r2", "#f8" \n\t" \ |
1679 | | "dli "#r1", 0x0010001000100010 \n\t" \ |
1680 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1681 | | "paddh "#f0", "#f0", "#f8" \n\t" \ |
1682 | | "paddh "#f2", "#f2", "#f8" \n\t" \ |
1683 | | "paddh "#f28", "#f28", "#f12" \n\t" \ |
1684 | | "paddh "#f30", "#f30", "#f14" \n\t" \ |
1685 | | "paddh "#f24", "#f24", "#f16" \n\t" \ |
1686 | | "paddh "#f26", "#f26", "#f18" \n\t" \ |
1687 | | "dli "#r1", 0x2 \n\t" \ |
1688 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1689 | | "psllh "#f28", "#f28", "#f8" \n\t" \ |
1690 | | "psllh "#f30", "#f30", "#f8" \n\t" \ |
1691 | | "psubh "#f28", "#f28", "#f24" \n\t" \ |
1692 | | "psubh "#f30", "#f30", "#f26" \n\t" \ |
1693 | | "paddh "#f0", "#f0", "#f28" \n\t" \ |
1694 | | "paddh "#f2", "#f2", "#f30" \n\t" \ |
1695 | | "psllh "#f28", "#f28", "#f8" \n\t" \ |
1696 | | "psllh "#f30", "#f30", "#f8" \n\t" \ |
1697 | | "paddh "#f0", "#f0", "#f28" \n\t" \ |
1698 | | "paddh "#f2", "#f2", "#f30" \n\t" \ |
1699 | | "dli "#r1", 0x5 \n\t" \ |
1700 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1701 | | "psrah "#f0", "#f0", "#f8" \n\t" \ |
1702 | | "psrah "#f2", "#f2", "#f8" \n\t" \ |
1703 | | "xor "#f28", "#f28", "#f28" \n\t" \ |
1704 | | "packushb "#f0", "#f0", "#f2" \n\t" \ |
1705 | | "gsswlc1 "#f0", 0x3("#r0") \n\t" \ |
1706 | | "gsswrc1 "#f0", 0x0("#r0") \n\t" \ |
1707 | | "dmtc1 "#r2", "#f8" \n\t" |
1708 | | |
// FILTER_HV_W8(f0..f30, r0, r1, r2)
// Same instruction sequence as FILTER_HV_W4 except for the store: the packed
// result is written as 8 bytes to the address in r0 using the gssdlc1/gssdrc1
// (store-left / store-right doubleword) pair.
// With inputs as 16-bit lanes in the pairs A = f0:f2, B = f4:f6, C = f8:f10,
// D = f12:f14, E = f16:f18, F = f20:f22 it computes
//   (A + F + 20*(C + D) - 5*(B + E) + 16) >> 5
// and packs to bytes with unsigned saturation.
// Register usage: f0:f2 receive the result; f24, f26, f30 are scratch; f28 is
// scratch and left zeroed; f8 is used for the constants 16, 2 and 5 and is
// saved to / restored from integer register r2; r1 is integer scratch.
1709 | | #define FILTER_HV_W8(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \ |
1710 | | f20, f22, f24, f26, f28, f30, r0, r1, r2) \ |
1711 | | "paddh "#f0", "#f0", "#f20" \n\t" \ |
1712 | | "paddh "#f2", "#f2", "#f22" \n\t" \ |
1713 | | "mov.d "#f28", "#f8" \n\t" \ |
1714 | | "mov.d "#f30", "#f10" \n\t" \ |
1715 | | "mov.d "#f24", "#f4" \n\t" \ |
1716 | | "mov.d "#f26", "#f6" \n\t" \ |
1717 | | "dmfc1 "#r2", "#f8" \n\t" \ |
1718 | | "dli "#r1", 0x0010001000100010 \n\t" \ |
1719 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1720 | | "paddh "#f0", "#f0", "#f8" \n\t" \ |
1721 | | "paddh "#f2", "#f2", "#f8" \n\t" \ |
1722 | | "paddh "#f28", "#f28", "#f12" \n\t" \ |
1723 | | "paddh "#f30", "#f30", "#f14" \n\t" \ |
1724 | | "paddh "#f24", "#f24", "#f16" \n\t" \ |
1725 | | "paddh "#f26", "#f26", "#f18" \n\t" \ |
1726 | | "dli "#r1", 0x2 \n\t" \ |
1727 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1728 | | "psllh "#f28", "#f28", "#f8" \n\t" \ |
1729 | | "psllh "#f30", "#f30", "#f8" \n\t" \ |
1730 | | "psubh "#f28", "#f28", "#f24" \n\t" \ |
1731 | | "psubh "#f30", "#f30", "#f26" \n\t" \ |
1732 | | "paddh "#f0", "#f0", "#f28" \n\t" \ |
1733 | | "paddh "#f2", "#f2", "#f30" \n\t" \ |
1734 | | "psllh "#f28", "#f28", "#f8" \n\t" \ |
1735 | | "psllh "#f30", "#f30", "#f8" \n\t" \ |
1736 | | "paddh "#f0", "#f0", "#f28" \n\t" \ |
1737 | | "paddh "#f2", "#f2", "#f30" \n\t" \ |
1738 | | "dli "#r1", 0x5 \n\t" \ |
1739 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1740 | | "psrah "#f0", "#f0", "#f8" \n\t" \ |
1741 | | "psrah "#f2", "#f2", "#f8" \n\t" \ |
1742 | | "xor "#f28", "#f28", "#f28" \n\t" \ |
1743 | | "packushb "#f0", "#f0", "#f2" \n\t" \ |
1744 | | "gssdlc1 "#f0", 0x7("#r0") \n\t" \ |
1745 | | "gssdrc1 "#f0", 0x0("#r0") \n\t" \ |
1746 | | "dmtc1 "#r2", "#f8" \n\t" |
1747 | | |
// FILTER_VER_ALIGN(f0..f30, r0, r1, r2, r3, r4)
// Assembly fragment that combines six 16-bit-lane inputs held in the pairs
//   A = f0:f2, B = f4:f6, C = f8:f10, D = f12:f14, E = f16:f18, F = f20:f22
// using staged arithmetic shifts instead of one wide multiply-accumulate:
//   t = ((A + F) - (B + E)) >> 2
//   t = (t + (C + D) - (B + E)) >> 2
//   out = ((C + D) + t + round) >> 6
// where `round` is whatever the caller placed in integer register r4 (it is
// moved into f8 and added to every lane; its value is not set by this macro).
// The result is packed to bytes with unsigned saturation and stored as 8 bytes
// with the indexed store gssdxc1 at address r0 + r1.
// NOTE(review): the macro name suggests the destination must be 8-byte aligned
// for gssdxc1 -- confirm against the Loongson ISA manual.
// Register usage: f0:f2, f24, f26, f28, f30 are overwritten (f28 holds the
// packed result); f8 carries the shift counts / rounding value and is saved to
// and restored from r3; r2 is integer scratch for the constants 2 and 6.
1748 | | #define FILTER_VER_ALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \ |
1749 | | f20, f22, f24, f26, f28, f30, r0, r1, r2, r3, r4) \ |
1750 | | "paddh "#f0", "#f0", "#f20" \n\t" \ |
1751 | | "paddh "#f2", "#f2", "#f22" \n\t" \ |
1752 | | "mov.d "#f24", "#f4" \n\t" \ |
1753 | | "mov.d "#f26", "#f6" \n\t" \ |
1754 | | "mov.d "#f28", "#f8" \n\t" \ |
1755 | | "mov.d "#f30", "#f10" \n\t" \ |
1756 | | "dli "#r2", 0x2 \n\t" \ |
1757 | | "paddh "#f24", "#f24", "#f16" \n\t" \ |
1758 | | "paddh "#f26", "#f26", "#f18" \n\t" \ |
1759 | | "dmfc1 "#r3", "#f8" \n\t" \ |
1760 | | "paddh "#f28", "#f28", "#f12" \n\t" \ |
1761 | | "paddh "#f30", "#f30", "#f14" \n\t" \ |
1762 | | "dmtc1 "#r2", "#f8" \n\t" \ |
1763 | | "psubh "#f0", "#f0", "#f24" \n\t" \ |
1764 | | "psubh "#f2", "#f2", "#f26" \n\t" \ |
1765 | | "psrah "#f0", "#f0", "#f8" \n\t" \ |
1766 | | "psrah "#f2", "#f2", "#f8" \n\t" \ |
1767 | | "paddh "#f0", "#f0", "#f28" \n\t" \ |
1768 | | "paddh "#f2", "#f2", "#f30" \n\t" \ |
1769 | | "psubh "#f0", "#f0", "#f24" \n\t" \ |
1770 | | "psubh "#f2", "#f2", "#f26" \n\t" \ |
1771 | | "psrah "#f0", "#f0", "#f8" \n\t" \ |
1772 | | "psrah "#f2", "#f2", "#f8" \n\t" \ |
1773 | | "dmtc1 "#r4", "#f8" \n\t" \ |
1774 | | "paddh "#f28", "#f28", "#f0" \n\t" \ |
1775 | | "paddh "#f30", "#f30", "#f2" \n\t" \ |
1776 | | "dli "#r2", 0x6 \n\t" \ |
1777 | | "paddh "#f28", "#f28", "#f8" \n\t" \ |
1778 | | "paddh "#f30", "#f30", "#f8" \n\t" \ |
1779 | | "dmtc1 "#r2", "#f8" \n\t" \ |
1780 | | "psrah "#f28", "#f28", "#f8" \n\t" \ |
1781 | | "psrah "#f30", "#f30", "#f8" \n\t" \ |
1782 | | "packushb "#f28", "#f28", "#f30" \n\t" \ |
1783 | | "gssdxc1 "#f28", 0x0("#r0", "#r1") \n\t" \ |
1784 | | "dmtc1 "#r3", "#f8" \n\t" |
1785 | | |
// FILTER_VER_UNALIGN(f0..f30, r0, r1, r2, r3)
// Same arithmetic as FILTER_VER_ALIGN, with a different register assignment and
// a store through the gssdlc1/gssdrc1 (store-left / store-right) pair at r0.
// With inputs as 16-bit lanes in the pairs A = f0:f2, B = f4:f6, C = f8:f10,
// D = f12:f14, E = f16:f18, F = f20:f22:
//   t = ((A + F) - (B + E)) >> 2
//   t = (t + (C + D) - (B + E)) >> 2
//   out = ((C + D) + t + round) >> 6
// where `round` is the value the caller placed in integer register r3.
// The result is packed to bytes with unsigned saturation and 8 bytes are stored.
// Register usage: f0:f2, f24, f26, f28, f30 are overwritten (f28 holds the
// packed result); f8 carries the shift counts / rounding value and is saved to
// and restored from r2; r1 is integer scratch for the constants 2 and 6.
1786 | | #define FILTER_VER_UNALIGN(f0, f2, f4, f6, f8, f10, f12, f14, f16, f18, \ |
1787 | | f20, f22, f24, f26, f28, f30, r0, r1, r2, r3) \ |
1788 | | "paddh "#f0", "#f0", "#f20" \n\t" \ |
1789 | | "paddh "#f2", "#f2", "#f22" \n\t" \ |
1790 | | "mov.d "#f24", "#f4" \n\t" \ |
1791 | | "mov.d "#f26", "#f6" \n\t" \ |
1792 | | "mov.d "#f28", "#f8" \n\t" \ |
1793 | | "mov.d "#f30", "#f10" \n\t" \ |
1794 | | "dli "#r1", 0x2 \n\t" \ |
1795 | | "paddh "#f24", "#f24", "#f16" \n\t" \ |
1796 | | "paddh "#f26", "#f26", "#f18" \n\t" \ |
1797 | | "dmfc1 "#r2", "#f8" \n\t" \ |
1798 | | "paddh "#f28", "#f28", "#f12" \n\t" \ |
1799 | | "paddh "#f30", "#f30", "#f14" \n\t" \ |
1800 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1801 | | "psubh "#f0", "#f0", "#f24" \n\t" \ |
1802 | | "psubh "#f2", "#f2", "#f26" \n\t" \ |
1803 | | "psrah "#f0", "#f0", "#f8" \n\t" \ |
1804 | | "psrah "#f2", "#f2", "#f8" \n\t" \ |
1805 | | "paddh "#f0", "#f0", "#f28" \n\t" \ |
1806 | | "paddh "#f2", "#f2", "#f30" \n\t" \ |
1807 | | "psubh "#f0", "#f0", "#f24" \n\t" \ |
1808 | | "psubh "#f2", "#f2", "#f26" \n\t" \ |
1809 | | "psrah "#f0", "#f0", "#f8" \n\t" \ |
1810 | | "psrah "#f2", "#f2", "#f8" \n\t" \ |
1811 | | "dmtc1 "#r3", "#f8" \n\t" \ |
1812 | | "paddh "#f28", "#f28", "#f0" \n\t" \ |
1813 | | "paddh "#f30", "#f30", "#f2" \n\t" \ |
1814 | | "dli "#r1", 0x6 \n\t" \ |
1815 | | "paddh "#f28", "#f28", "#f8" \n\t" \ |
1816 | | "paddh "#f30", "#f30", "#f8" \n\t" \ |
1817 | | "dmtc1 "#r1", "#f8" \n\t" \ |
1818 | | "psrah "#f28", "#f28", "#f8" \n\t" \ |
1819 | | "psrah "#f30", "#f30", "#f8" \n\t" \ |
1820 | | "packushb "#f28", "#f28", "#f30" \n\t" \ |
1821 | | "gssdlc1 "#f28", 0x7("#r0") \n\t" \ |
1822 | | "gssdrc1 "#f28", 0x0("#r0") \n\t" \ |
1823 | | "dmtc1 "#r2", "#f8" \n\t" |
1824 | | |
// Horizontal 6-tap filter (1, -5, 20, 20, -5, 1), rounded with (x + 16) >> 5 and
// saturated to bytes, producing 5 output pixels per row for iHeight rows.
//   pSrc / iSrcStride : source pointer and row stride; the code reads starting
//                       two bytes to the left of pSrc.
//   pDst / iDstStride : destination pointer and row stride; bytes 0..4 of each
//                       row are written (two overlapping 4-byte stores).
//   iWidth            : NOTE(review): declared as an asm operand but never
//                       referenced in the assembly text.
//   iHeight           : row count; the loop decrements and tests after the
//                       body, so it must be at least 1 on entry.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save/restore registers around the asm -- confirm at their definition.
1825 | | void McHorVer20Width5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, |
1826 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1827 | | BACKUP_REG; |
1828 | | __asm__ volatile ( |
1829 | | ".set arch=loongson3a \n\t" |
1830 | | "xor $f28, $f28, $f28 \n\t" |
// Step back two pixels so byte offset k below is tap k of the window for column 0.
1831 | | PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t" |
// Loop constants: $8 = shift count 2, $10 = 16 in every halfword, $11 = shift count 5.
1832 | | "dli $8, 0x2 \n\t" |
1833 | | "dli $10, 0x0010001000100010 \n\t" |
1834 | | "dli $11, 0x5 \n\t" |
// ---- one output row per iteration ----
1835 | | "1: \n\t" |
1836 | | "xor $f28, $f28, $f28 \n\t" |
// Six 8-byte loads at byte offsets 0, 5, 1, 4, 2, 3 (left/right load pairs):
// f0/f4 are the outer taps, f8/f12 the -5 taps, f16/f20 the 20 taps.
1837 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
1838 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
1839 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
1840 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
1841 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
1842 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
1843 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
1844 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
1845 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
1846 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
1847 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
1848 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
// Widen bytes to 16-bit lanes by interleaving with the zeroed $f28.
1849 | | "punpckhbh $f2, $f0, $f28 \n\t" |
1850 | | "punpckhbh $f6, $f4, $f28 \n\t" |
1851 | | "punpckhbh $f10, $f8, $f28 \n\t" |
1852 | | "punpckhbh $f14, $f12, $f28 \n\t" |
1853 | | "punpckhbh $f18, $f16, $f28 \n\t" |
1854 | | "punpckhbh $f22, $f20, $f28 \n\t" |
1855 | | "punpcklbh $f0, $f0, $f28 \n\t" |
1856 | | "punpcklbh $f4, $f4, $f28 \n\t" |
1857 | | "punpcklbh $f8, $f8, $f28 \n\t" |
1858 | | "punpcklbh $f12, $f12, $f28 \n\t" |
1859 | | "punpcklbh $f16, $f16, $f28 \n\t" |
1860 | | "punpcklbh $f20, $f20, $f28 \n\t" |
1861 | | |
// First pass: f0:f2 = (t0 + t5) + 20*(t2 + t3) - 5*(t1 + t4), built as
// s + 4*s with s = 4*(t2 + t3) - (t1 + t4).  $f12 is borrowed as the shift
// count, so its data is parked in $9 for the second pass.
1862 | | "mov.d $f28, $f8 \n\t" |
1863 | | "mov.d $f30, $f10 \n\t" |
1864 | | "paddh $f28, $f28, $f12 \n\t" |
1865 | | "paddh $f30, $f30, $f14 \n\t" |
1866 | | "mov.d $f24, $f16 \n\t" |
1867 | | "mov.d $f26, $f18 \n\t" |
1868 | | "paddh $f24, $f24, $f20 \n\t" |
1869 | | "paddh $f26, $f26, $f22 \n\t" |
1870 | | "dmfc1 $9, $f12 \n\t" |
1871 | | "dmtc1 $8, $f12 \n\t" |
1872 | | "psllh $f24, $f24, $f12 \n\t" |
1873 | | "psllh $f26, $f26, $f12 \n\t" |
1874 | | "psubh $f24, $f24, $f28 \n\t" |
1875 | | "psubh $f26, $f26, $f30 \n\t" |
1876 | | "paddh $f0, $f0, $f4 \n\t" |
1877 | | "paddh $f2, $f2, $f6 \n\t" |
1878 | | "paddh $f0, $f0, $f24 \n\t" |
1879 | | "paddh $f2, $f2, $f26 \n\t" |
1880 | | "psllh $f24, $f24, $f12 \n\t" |
1881 | | "psllh $f26, $f26, $f12 \n\t" |
1882 | | "paddh $f0, $f0, $f24 \n\t" |
1883 | | "paddh $f2, $f2, $f26 \n\t" |
1884 | | |
// Round (+16), arithmetic shift right by 5, pack to bytes with unsigned saturation.
1885 | | "dmtc1 $10, $f12 \n\t" |
1886 | | "paddh $f0, $f0, $f12 \n\t" |
1887 | | "paddh $f2, $f2, $f12 \n\t" |
1888 | | "dmtc1 $11, $f12 \n\t" |
1889 | | "psrah $f0, $f0, $f12 \n\t" |
1890 | | "psrah $f2, $f2, $f12 \n\t" |
1891 | | "packushb $f0, $f0, $f2 \n\t" |
1892 | | |
// Store 4 bytes to pDst[0..3].
1893 | | "gsswlc1 $f0, 0x3(%[pDst]) \n\t" |
1894 | | "gsswrc1 $f0, 0x0(%[pDst]) \n\t" |
1895 | | |
// Second pass: the same filter shifted right by one pixel (taps at byte offsets
// 1..6).  Only offset 6 is loaded; the other taps reuse the widened registers.
// $f12 gets its data back from $9 and $f24 becomes the shift count.
1896 | | "gsldlc1 $f0, 0xd(%[pSrc]) \n\t" |
1897 | | "xor $f28, $f28, $f28 \n\t" |
1898 | | "gsldrc1 $f0, 0x6(%[pSrc]) \n\t" |
1899 | | "punpckhbh $f2, $f0, $f28 \n\t" |
1900 | | "punpcklbh $f0, $f0, $f28 \n\t" |
1901 | | "dmtc1 $9, $f12 \n\t" |
1902 | | "dmtc1 $8, $f24 \n\t" |
1903 | | |
1904 | | "paddh $f16, $f16, $f4 \n\t" |
1905 | | "paddh $f18, $f18, $f6 \n\t" |
1906 | | "paddh $f20, $f20, $f12 \n\t" |
1907 | | "paddh $f22, $f22, $f14 \n\t" |
1908 | | "psllh $f20, $f20, $f24 \n\t" |
1909 | | "psllh $f22, $f22, $f24 \n\t" |
1910 | | "psubh $f20, $f20, $f16 \n\t" |
1911 | | "psubh $f22, $f22, $f18 \n\t" |
1912 | | "paddh $f8, $f8, $f0 \n\t" |
1913 | | "paddh $f10, $f10, $f2 \n\t" |
1914 | | "paddh $f8, $f8, $f20 \n\t" |
1915 | | "paddh $f10, $f10, $f22 \n\t" |
1916 | | "psllh $f20, $f20, $f24 \n\t" |
1917 | | "psllh $f22, $f22, $f24 \n\t" |
1918 | | "paddh $f8, $f8, $f20 \n\t" |
1919 | | "paddh $f10, $f10, $f22 \n\t" |
1920 | | |
1921 | | "dmtc1 $10, $f24 \n\t" |
1922 | | "paddh $f8, $f8, $f24 \n\t" |
1923 | | "paddh $f10, $f10, $f24 \n\t" |
1924 | | "dmtc1 $11, $f24 \n\t" |
1925 | | "psrah $f8, $f8, $f24 \n\t" |
1926 | | "psrah $f10, $f10, $f24 \n\t" |
1927 | | "packushb $f8, $f8, $f10 \n\t" |
// Store 4 bytes to pDst[1..4]; together with the first store this covers 5 columns.
1928 | | "gsswlc1 $f8, 0x4(%[pDst]) \n\t" |
1929 | | "gsswrc1 $f8, 0x1(%[pDst]) \n\t" |
1930 | | |
// Advance to the next row and loop until iHeight reaches zero.
1931 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
1932 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
1933 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
1934 | | "bnez %[iHeight], 1b \n\t" |
1935 | | : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst), |
1936 | | [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) |
1937 | | : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride) |
1938 | | : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", |
1939 | | "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", |
1940 | | "$f28", "$f30" |
1941 | | ); |
1942 | | RECOVER_REG; |
1943 | | } |
1944 | | |
// Horizontal 6-tap filter (1, -5, 20, 20, -5, 1), rounded with (x + 16) >> 5 and
// saturated to bytes, for rows that are 9 or 17 pixels wide.
//   pSrc / iSrcStride : source pointer and row stride; reads start two bytes to
//                       the left of pSrc.
//   pDst / iDstStride : destination pointer and row stride.
//   iWidth            : compared against 9; equal takes the 9-wide loop (label 1),
//                       anything else takes the 17-wide loop (label 2).
//   iHeight           : row count; both loops test after the body, so it must be
//                       at least 1 on entry.
// Each row is produced by overlapping stores: 4 + 8 bytes for width 9
// (pDst[0..3], pDst[1..8]) and 8 + 4 + 8 bytes for width 17
// (pDst[0..7], pDst[8..11], pDst[9..16]).
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save/restore registers around the asm -- confirm at their definition.
1945 | | void McHorVer20Width9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, |
1946 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
1947 | | BACKUP_REG; |
1948 | | __asm__ volatile ( |
1949 | | ".set arch=loongson3a \n\t" |
// Step back two pixels so byte offset k is tap k of the window for column 0.
1950 | | PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t" |
1951 | | "xor $f28, $f28, $f28 \n\t" |
// Constants: $8 = shift count 2, $10 = 16 per halfword, $11 = shift count 5.
// $9 holds 9 only for the width test below; afterwards it is reused as scratch.
1952 | | "dli $8, 0x2 \n\t" |
1953 | | "dli $9, 0x9 \n\t" |
1954 | | "dli $10, 0x0010001000100010 \n\t" |
1955 | | "dli $11, 0x5 \n\t" |
1956 | | "bne %[iWidth], $9, 2f \n\t" |
// ---- width 9: one row per iteration ----
1957 | | "1: \n\t" |
1958 | | "xor $f28, $f28, $f28 \n\t" |
// Loads at byte offsets 0, 5, 1, 4, 2, 3: outer taps f0/f4, -5 taps f8/f12, 20 taps f16/f20.
1959 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
1960 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
1961 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
1962 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
1963 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
1964 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
1965 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
1966 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
1967 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
1968 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
1969 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
1970 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
// Widen bytes to 16-bit lanes using the zeroed $f28.
1971 | | "punpckhbh $f2, $f0, $f28 \n\t" |
1972 | | "punpckhbh $f6, $f4, $f28 \n\t" |
1973 | | "punpckhbh $f10, $f8, $f28 \n\t" |
1974 | | "punpckhbh $f14, $f12, $f28 \n\t" |
1975 | | "punpckhbh $f18, $f16, $f28 \n\t" |
1976 | | "punpckhbh $f22, $f20, $f28 \n\t" |
1977 | | "punpcklbh $f0, $f0, $f28 \n\t" |
1978 | | "punpcklbh $f4, $f4, $f28 \n\t" |
1979 | | "punpcklbh $f8, $f8, $f28 \n\t" |
1980 | | "punpcklbh $f12, $f12, $f28 \n\t" |
1981 | | "punpcklbh $f16, $f16, $f28 \n\t" |
1982 | | "punpcklbh $f20, $f20, $f28 \n\t" |
1983 | | |
// First pass: f0:f2 = (t0 + t5) + 20*(t2 + t3) - 5*(t1 + t4).
// $f12 doubles as the shift count; its data is parked in $9.
1984 | | "mov.d $f28, $f8 \n\t" |
1985 | | "mov.d $f30, $f10 \n\t" |
1986 | | "paddh $f28, $f28, $f12 \n\t" |
1987 | | "paddh $f30, $f30, $f14 \n\t" |
1988 | | "mov.d $f24, $f16 \n\t" |
1989 | | "mov.d $f26, $f18 \n\t" |
1990 | | "paddh $f24, $f24, $f20 \n\t" |
1991 | | "paddh $f26, $f26, $f22 \n\t" |
1992 | | "dmfc1 $9, $f12 \n\t" |
1993 | | "dmtc1 $8, $f12 \n\t" |
1994 | | "psllh $f24, $f24, $f12 \n\t" |
1995 | | "psllh $f26, $f26, $f12 \n\t" |
1996 | | "psubh $f24, $f24, $f28 \n\t" |
1997 | | "psubh $f26, $f26, $f30 \n\t" |
1998 | | "paddh $f0, $f0, $f4 \n\t" |
1999 | | "paddh $f2, $f2, $f6 \n\t" |
2000 | | "paddh $f0, $f0, $f24 \n\t" |
2001 | | "paddh $f2, $f2, $f26 \n\t" |
2002 | | "psllh $f24, $f24, $f12 \n\t" |
2003 | | "psllh $f26, $f26, $f12 \n\t" |
2004 | | "paddh $f0, $f0, $f24 \n\t" |
2005 | | "paddh $f2, $f2, $f26 \n\t" |
2006 | | |
// Round (+16), shift right by 5, pack with unsigned saturation.
2007 | | "dmtc1 $10, $f12 \n\t" |
2008 | | "paddh $f0, $f0, $f12 \n\t" |
2009 | | "paddh $f2, $f2, $f12 \n\t" |
2010 | | "dmtc1 $11, $f12 \n\t" |
2011 | | "psrah $f0, $f0, $f12 \n\t" |
2012 | | "psrah $f2, $f2, $f12 \n\t" |
2013 | | "packushb $f0, $f0, $f2 \n\t" |
2014 | | |
// Store 4 bytes to pDst[0..3].
2015 | | "gsswlc1 $f0, 0x3(%[pDst]) \n\t" |
2016 | | "gsswrc1 $f0, 0x0(%[pDst]) \n\t" |
2017 | | |
// Second pass: window shifted right by one pixel (taps at offsets 1..6); only
// offset 6 is newly loaded.  $f12 is restored from $9, $f24 is the shift count.
2018 | | "gsldlc1 $f0, 0xd(%[pSrc]) \n\t" |
2019 | | "xor $f28, $f28, $f28 \n\t" |
2020 | | "gsldrc1 $f0, 0x6(%[pSrc]) \n\t" |
2021 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2022 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2023 | | "dmtc1 $9, $f12 \n\t" |
2024 | | "dmtc1 $8, $f24 \n\t" |
2025 | | |
2026 | | "paddh $f16, $f16, $f4 \n\t" |
2027 | | "paddh $f18, $f18, $f6 \n\t" |
2028 | | "paddh $f20, $f20, $f12 \n\t" |
2029 | | "paddh $f22, $f22, $f14 \n\t" |
2030 | | "psllh $f20, $f20, $f24 \n\t" |
2031 | | "psllh $f22, $f22, $f24 \n\t" |
2032 | | "psubh $f20, $f20, $f16 \n\t" |
2033 | | "psubh $f22, $f22, $f18 \n\t" |
2034 | | "paddh $f8, $f8, $f0 \n\t" |
2035 | | "paddh $f10, $f10, $f2 \n\t" |
2036 | | "paddh $f8, $f8, $f20 \n\t" |
2037 | | "paddh $f10, $f10, $f22 \n\t" |
2038 | | "psllh $f20, $f20, $f24 \n\t" |
2039 | | "psllh $f22, $f22, $f24 \n\t" |
2040 | | "paddh $f8, $f8, $f20 \n\t" |
2041 | | "paddh $f10, $f10, $f22 \n\t" |
2042 | | |
2043 | | "dmtc1 $10, $f24 \n\t" |
2044 | | "paddh $f8, $f8, $f24 \n\t" |
2045 | | "paddh $f10, $f10, $f24 \n\t" |
2046 | | "dmtc1 $11, $f24 \n\t" |
2047 | | "psrah $f8, $f8, $f24 \n\t" |
2048 | | "psrah $f10, $f10, $f24 \n\t" |
2049 | | "packushb $f8, $f8, $f10 \n\t" |
// Store 8 bytes to pDst[1..8]; with the first store this covers 9 columns.
2050 | | "gssdlc1 $f8, 0x8(%[pDst]) \n\t" |
2051 | | "gssdrc1 $f8, 0x1(%[pDst]) \n\t" |
2052 | | |
// Next row; when all rows are done, jump over the 17-wide loop.
2053 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2054 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2055 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2056 | | "bnez %[iHeight], 1b \n\t" |
2057 | | "j 3f \n\t" |
2058 | | |
// ---- width 17 (taken whenever iWidth != 9): one row per iteration ----
2059 | | "2: \n\t" |
2060 | | "xor $f28, $f28, $f28 \n\t" |
// Columns 0..7: loads at byte offsets 0, 5, 1, 4, 2, 3.
2061 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
2062 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
2063 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
2064 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
2065 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
2066 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
2067 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
2068 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
2069 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
2070 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
2071 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
2072 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
2073 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2074 | | "punpckhbh $f6, $f4, $f28 \n\t" |
2075 | | "punpckhbh $f10, $f8, $f28 \n\t" |
2076 | | "punpckhbh $f14, $f12, $f28 \n\t" |
2077 | | "punpckhbh $f18, $f16, $f28 \n\t" |
2078 | | "punpckhbh $f22, $f20, $f28 \n\t" |
2079 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2080 | | "punpcklbh $f4, $f4, $f28 \n\t" |
2081 | | "punpcklbh $f8, $f8, $f28 \n\t" |
2082 | | "punpcklbh $f12, $f12, $f28 \n\t" |
2083 | | "punpcklbh $f16, $f16, $f28 \n\t" |
2084 | | "punpcklbh $f20, $f20, $f28 \n\t" |
2085 | | |
// Same filter, computed in place; $f30 holds the shift counts / rounding
// constant here so that $f28 stays zero for the next group of unpacks.
2086 | | "dmtc1 $8, $f30 \n\t" |
2087 | | "paddh $f8, $f8, $f12 \n\t" |
2088 | | "paddh $f10, $f10, $f14 \n\t" |
2089 | | "paddh $f16, $f16, $f20 \n\t" |
2090 | | "paddh $f18, $f18, $f22 \n\t" |
2091 | | "psllh $f16, $f16, $f30 \n\t" |
2092 | | "psllh $f18, $f18, $f30 \n\t" |
2093 | | "psubh $f16, $f16, $f8 \n\t" |
2094 | | "psubh $f18, $f18, $f10 \n\t" |
2095 | | "paddh $f0, $f0, $f4 \n\t" |
2096 | | "paddh $f2, $f2, $f6 \n\t" |
2097 | | "paddh $f0, $f0, $f16 \n\t" |
2098 | | "paddh $f2, $f2, $f18 \n\t" |
2099 | | "psllh $f16, $f16, $f30 \n\t" |
2100 | | "psllh $f18, $f18, $f30 \n\t" |
2101 | | "paddh $f0, $f0, $f16 \n\t" |
2102 | | "paddh $f2, $f2, $f18 \n\t" |
2103 | | |
2104 | | "dmtc1 $10, $f30 \n\t" |
2105 | | "paddh $f0, $f0, $f30 \n\t" |
2106 | | "paddh $f2, $f2, $f30 \n\t" |
2107 | | "dmtc1 $11, $f30 \n\t" |
2108 | | "psrah $f0, $f0, $f30 \n\t" |
2109 | | "psrah $f2, $f2, $f30 \n\t" |
2110 | | "packushb $f0, $f0, $f2 \n\t" |
// Store 8 bytes to pDst[0..7].
2111 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
2112 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
2113 | | |
// Columns 8..: the same six-load pattern, 8 bytes further along the row.
2114 | | "gsldlc1 $f0, 15(%[pSrc]) \n\t" |
2115 | | "gsldlc1 $f4, 0x14(%[pSrc]) \n\t" |
2116 | | "gsldlc1 $f8, 0x10(%[pSrc]) \n\t" |
2117 | | "gsldlc1 $f12, 0x13(%[pSrc]) \n\t" |
2118 | | "gsldlc1 $f16, 0x11(%[pSrc]) \n\t" |
2119 | | "gsldlc1 $f20, 0x12(%[pSrc]) \n\t" |
2120 | | "gsldrc1 $f0, 8(%[pSrc]) \n\t" |
2121 | | "gsldrc1 $f4, 0xd(%[pSrc]) \n\t" |
2122 | | "gsldrc1 $f8, 0x9(%[pSrc]) \n\t" |
2123 | | "gsldrc1 $f12, 0xc(%[pSrc]) \n\t" |
2124 | | "gsldrc1 $f16, 0xa(%[pSrc]) \n\t" |
2125 | | "gsldrc1 $f20, 0xb(%[pSrc]) \n\t" |
2126 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2127 | | "punpckhbh $f6, $f4, $f28 \n\t" |
2128 | | "punpckhbh $f10, $f8, $f28 \n\t" |
2129 | | "punpckhbh $f14, $f12, $f28 \n\t" |
2130 | | "punpckhbh $f18, $f16, $f28 \n\t" |
2131 | | "punpckhbh $f22, $f20, $f28 \n\t" |
2132 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2133 | | "punpcklbh $f4, $f4, $f28 \n\t" |
2134 | | "punpcklbh $f8, $f8, $f28 \n\t" |
2135 | | "punpcklbh $f12, $f12, $f28 \n\t" |
2136 | | "punpcklbh $f16, $f16, $f28 \n\t" |
2137 | | "punpcklbh $f20, $f20, $f28 \n\t" |
2138 | | |
// Filter into f0:f2 while keeping the widened taps for the shifted pass below;
// $f12's data is parked in $9 while $f12 serves as the shift count.
2139 | | "mov.d $f28, $f8 \n\t" |
2140 | | "mov.d $f30, $f10 \n\t" |
2141 | | "paddh $f28, $f28, $f12 \n\t" |
2142 | | "paddh $f30, $f30, $f14 \n\t" |
2143 | | "mov.d $f24, $f16 \n\t" |
2144 | | "mov.d $f26, $f18 \n\t" |
2145 | | "paddh $f24, $f24, $f20 \n\t" |
2146 | | "paddh $f26, $f26, $f22 \n\t" |
2147 | | "dmfc1 $9, $f12 \n\t" |
2148 | | "dmtc1 $8, $f12 \n\t" |
2149 | | "psllh $f24, $f24, $f12 \n\t" |
2150 | | "psllh $f26, $f26, $f12 \n\t" |
2151 | | "psubh $f24, $f24, $f28 \n\t" |
2152 | | "psubh $f26, $f26, $f30 \n\t" |
2153 | | "paddh $f0, $f0, $f4 \n\t" |
2154 | | "paddh $f2, $f2, $f6 \n\t" |
2155 | | "paddh $f0, $f0, $f24 \n\t" |
2156 | | "paddh $f2, $f2, $f26 \n\t" |
2157 | | "psllh $f24, $f24, $f12 \n\t" |
2158 | | "psllh $f26, $f26, $f12 \n\t" |
2159 | | "paddh $f0, $f0, $f24 \n\t" |
2160 | | "paddh $f2, $f2, $f26 \n\t" |
2161 | | |
2162 | | "dmtc1 $10, $f30 \n\t" |
2163 | | "paddh $f0, $f0, $f30 \n\t" |
2164 | | "paddh $f2, $f2, $f30 \n\t" |
2165 | | "dmtc1 $11, $f30 \n\t" |
2166 | | "psrah $f0, $f0, $f30 \n\t" |
2167 | | "psrah $f2, $f2, $f30 \n\t" |
2168 | | "packushb $f0, $f0, $f2 \n\t" |
// Store only 4 bytes here, to pDst[8..11]; the rest comes from the shifted pass.
2169 | | "gsswlc1 $f0, 0xb(%[pDst]) \n\t" |
2170 | | "gsswrc1 $f0, 0x8(%[pDst]) \n\t" |
2171 | | |
// Shifted pass for columns 9..16 (taps at byte offsets 9..14); only offset 14
// is newly loaded.  $f12 is restored from $9 first.
// NOTE(review): $9 is then set to 0x20 and copied into $f30, but $f30 does not
// appear to be read again before label 2 overwrites it -- looks like dead code.
2172 | | "dmtc1 $9, $f12 \n\t" |
2173 | | "xor $f28, $f28, $f28 \n\t" |
2174 | | "dli $9, 0x20 \n\t" |
2175 | | "gsldlc1 $f0, 0x15(%[pSrc]) \n\t" |
2176 | | "dmtc1 $9, $f30 \n\t" |
2177 | | "gsldrc1 $f0, 0xE(%[pSrc]) \n\t" |
2178 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2179 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2180 | | "dmtc1 $8, $f24 \n\t" |
2181 | | |
2182 | | "paddh $f16, $f16, $f4 \n\t" |
2183 | | "paddh $f18, $f18, $f6 \n\t" |
2184 | | "paddh $f20, $f20, $f12 \n\t" |
2185 | | "paddh $f22, $f22, $f14 \n\t" |
2186 | | "psllh $f20, $f20, $f24 \n\t" |
2187 | | "psllh $f22, $f22, $f24 \n\t" |
2188 | | "psubh $f20, $f20, $f16 \n\t" |
2189 | | "psubh $f22, $f22, $f18 \n\t" |
2190 | | "paddh $f8, $f8, $f0 \n\t" |
2191 | | "paddh $f10, $f10, $f2 \n\t" |
2192 | | "paddh $f8, $f8, $f20 \n\t" |
2193 | | "paddh $f10, $f10, $f22 \n\t" |
2194 | | "psllh $f20, $f20, $f24 \n\t" |
2195 | | "psllh $f22, $f22, $f24 \n\t" |
2196 | | "paddh $f8, $f8, $f20 \n\t" |
2197 | | "paddh $f10, $f10, $f22 \n\t" |
2198 | | |
2199 | | "dmtc1 $10, $f24 \n\t" |
2200 | | "paddh $f8, $f8, $f24 \n\t" |
2201 | | "paddh $f10, $f10, $f24 \n\t" |
2202 | | "dmtc1 $11, $f24 \n\t" |
2203 | | "psrah $f8, $f8, $f24 \n\t" |
2204 | | "psrah $f10, $f10, $f24 \n\t" |
2205 | | "packushb $f8, $f8, $f10 \n\t" |
// Store 8 bytes to pDst[9..16]; the three stores together cover 17 columns.
2206 | | "gssdlc1 $f8, 0x10(%[pDst]) \n\t" |
2207 | | "gssdrc1 $f8, 0x9(%[pDst]) \n\t" |
2208 | | |
// Next row.
2209 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2210 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2211 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2212 | | "bnez %[iHeight], 2b \n\t" |
2213 | | "3: \n\t" |
2214 | | : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst), |
2215 | | [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) |
2216 | | : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride) |
2217 | | : "memory", "$8", "$9", "$10", "$11", "$f0", "$f2", "$f4", "$f6", "$f8", |
2218 | | "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", |
2219 | | "$f28", "$f30" |
2220 | | ); |
2221 | | RECOVER_REG; |
2222 | | } |
2223 | | |
2224 | | // Horizontal half-sample filter, i.e. the (2, 0) quarter-sample position, for block widths 5, 9 and 17. |
2225 | | static inline void McHorVer20Width5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride, |
2226 | |                                                uint8_t* pDst, int32_t iDstStride, |
2227 | |                                                int32_t iWidth, int32_t iHeight) { |
2228 | |   const bool kbWide = (9 == iWidth) || (17 == iWidth); // widths 9 and 17 share one routine |
2229 | |   if (!kbWide) { // every other width is handled as width 5 |
2230 | |     McHorVer20Width5_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
2231 | |   } else { McHorVer20Width9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); } |
2232 | | } |
2233 | | |
// Vertical 6-tap filter (1, -5, 20, 20, -5, 1), rounded with (x + 16) >> 5 and
// saturated to bytes (see FILTER_HV_W4), processed in strips 4 pixels wide.
//   pSrc / iSrcStride : source pointer and row stride; rows pSrc - 2*stride
//                       through pSrc + (iHeight + 2)*stride are read, 8 bytes
//                       per row per strip.
//   pDst / iDstStride : destination pointer and row stride; 4 bytes per row per
//                       strip are written.
//   iWidth            : shifted right by 2 to give the number of 4-pixel strips.
//   iHeight           : rows per strip.  The first output row is followed by no
//                       zero test, so iHeight must be at least 2.
// NOTE(review): $12/$13 keep the ORIGINAL pSrc/pDst and are never advanced, so
// restarting for a new strip always lands on column 4.  This is only correct
// for at most two strips (iWidth of 4 or 8) -- confirm callers respect that.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save/restore registers around the asm -- confirm at their definition.
2234 | | void McHorVer02Height5_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, |
2235 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
2236 | | BACKUP_REG; |
2237 | | __asm__ volatile ( |
2238 | | ".set arch=loongson3a \n\t" |
// Remember the entry values so a later strip can restart from them.
2239 | | "move $12, %[pSrc] \n\t" |
2240 | | "move $13, %[pDst] \n\t" |
2241 | | "move $14, %[iHeight] \n\t" |
2242 | | |
// iWidth = strip count; $10 = 2 * iSrcStride; start two rows above pSrc.
2243 | | "dsrl %[iWidth], %[iWidth], 0x2 \n\t" |
2244 | | PTR_ADDU "$10, %[iSrcStride], %[iSrcStride] \n\t" |
2245 | | PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t" |
2246 | | |
// ---- start of a strip: load the six rows -2..3 and emit output row 0 ----
2247 | | "1: \n\t" |
2248 | | "xor $f28, $f28, $f28 \n\t" |
2249 | | MMI_LOAD_8P($f0, $f2, $f28, %[pSrc]) |
2250 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2251 | | MMI_LOAD_8P($f4, $f6, $f28, $8) |
2252 | | |
2253 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2254 | | MMI_LOAD_8P($f8, $f10, $f28, %[pSrc]) |
2255 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2256 | | MMI_LOAD_8P($f12, $f14, $f28, $8) |
2257 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2258 | | MMI_LOAD_8P($f16, $f18, $f28, %[pSrc]) |
2259 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2260 | | MMI_LOAD_8P($f20, $f22, $f28, $8) |
2261 | | FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
2262 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9) |
2263 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
// Load the next row and slide the six-row window down by one (explicit moves).
2264 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2265 | | MMI_LOAD_8P($f24, $f26, $f28, %[pSrc]) |
2266 | | "mov.d $f0, $f4 \n\t" |
2267 | | "mov.d $f2, $f6 \n\t" |
2268 | | "mov.d $f4, $f8 \n\t" |
2269 | | "mov.d $f6, $f10 \n\t" |
2270 | | "mov.d $f8, $f12 \n\t" |
2271 | | "mov.d $f10, $f14 \n\t" |
2272 | | "mov.d $f12, $f16 \n\t" |
2273 | | "mov.d $f14, $f18 \n\t" |
2274 | | "mov.d $f16, $f20 \n\t" |
2275 | | "mov.d $f18, $f22 \n\t" |
2276 | | "mov.d $f20, $f24 \n\t" |
2277 | | "mov.d $f22, $f26 \n\t" |
2278 | | |
2279 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2280 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2281 | | |
// ---- unrolled row loop ----
// Each step emits one output row, then loads exactly one new source row.
// Instead of moving data, the register arguments of FILTER_HV_W4 rotate by one
// pair per step; the register the macro leaves zeroed is the zero operand of
// the following MMI_LOAD_8P.  After eight steps the assignment is back where
// it started and the code jumps to label 2 again.
2282 | | "2: \n\t" |
2283 | | FILTER_HV_W4($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
2284 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9) |
2285 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2286 | | "beqz %[iHeight], 3f \n\t" |
2287 | | |
2288 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2289 | | MMI_LOAD_8P($f24, $f26, $f28, %[pSrc]) |
2290 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2291 | | FILTER_HV_W4($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, |
2292 | | $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9) |
2293 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2294 | | "beqz %[iHeight], 3f \n\t" |
2295 | | |
2296 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2297 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2298 | | MMI_LOAD_8P($f28, $f30, $f0, $8) |
2299 | | FILTER_HV_W4($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, |
2300 | | $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9) |
2301 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2302 | | "beqz %[iHeight], 3f \n\t" |
2303 | | |
2304 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2305 | | MMI_LOAD_8P($f0, $f2, $f4, %[pSrc]) |
2306 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2307 | | FILTER_HV_W4($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, |
2308 | | $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9) |
2309 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2310 | | "beqz %[iHeight], 3f \n\t" |
2311 | | |
2312 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2313 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2314 | | MMI_LOAD_8P($f4, $f6, $f8, $8) |
2315 | | FILTER_HV_W4($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, |
2316 | | $f8, $f10, $f12, $f14, %[pDst], $8, $9) |
2317 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2318 | | "beqz %[iHeight], 3f \n\t" |
2319 | | |
2320 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2321 | | MMI_LOAD_8P($f8, $f10, $f12, %[pSrc]) |
2322 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2323 | | FILTER_HV_W4($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, |
2324 | | $f12, $f14, $f16, $f18, %[pDst], $8, $9) |
2325 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2326 | | "beqz %[iHeight], 3f \n\t" |
2327 | | |
2328 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2329 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2330 | | MMI_LOAD_8P($f12, $f14, $f16, $8) |
2331 | | FILTER_HV_W4($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, |
2332 | | $f16, $f18, $f20, $f22, %[pDst], $8, $9) |
2333 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2334 | | "beqz %[iHeight], 3f \n\t" |
2335 | | |
2336 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2337 | | MMI_LOAD_8P($f16, $f18, $f20, %[pSrc]) |
2338 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2339 | | FILTER_HV_W4($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, |
2340 | | $f20, $f22, $f24, $f26, %[pDst], $8, $9) |
2341 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2342 | | "beqz %[iHeight], 3f \n\t" |
2343 | | |
2344 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2345 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2346 | | MMI_LOAD_8P($f20, $f22, $f24, $8) |
2347 | | "j 2b \n\t" |
2348 | | |
// ---- strip finished: stop when none remain, otherwise restart at label 1
// from the saved entry pointers (minus two rows) moved 4 bytes to the right ----
2349 | | "3: \n\t" |
2350 | | PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" |
2351 | | "beqz %[iWidth], 4f \n\t" |
2352 | | "move %[pSrc], $12 \n\t" |
2353 | | "move %[pDst], $13 \n\t" |
2354 | | "move %[iHeight], $14 \n\t" |
2355 | | PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t" |
2356 | | PTR_ADDIU "%[pSrc], %[pSrc], 0x4 \n\t" |
2357 | | PTR_ADDIU "%[pDst], %[pDst], 0x4 \n\t" |
2358 | | "j 1b \n\t" |
2359 | | "4: \n\t" |
2360 | | : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst), |
2361 | | [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight) |
2362 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
2363 | | : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4", |
2364 | | "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", |
2365 | | "$f24", "$f26", "$f28", "$f30" |
2366 | | ); |
2367 | | RECOVER_REG; |
2368 | | } |
2369 | | |
/*
 * McHorVer02Height9Or17_mmi -- Loongson MMI inline-asm kernel that filters a
 * block in 8-pixel-wide columns, walking every column from top to bottom.
 * The filter arithmetic itself lives in the FILTER_HV_W8 / MMI_LOAD_8P macros,
 * which are defined outside this function (presumably the 6-tap vertical
 * half-sample filter -- confirm against the macro definitions).
 *
 *   pSrc, iSrcStride : source pixels and row pitch; reading starts two rows
 *                      above pSrc and six rows are loaded before the first
 *                      output row is produced.
 *   pDst, iDstStride : destination pixels and row pitch.
 *   iWidth           : block width; shifted right by 3 to obtain the number of
 *                      8-pixel columns. The saved start pointers are never
 *                      advanced, so a third column would repeat the second one;
 *                      the dispatcher in this file only passes 8 or 16.
 *   iHeight          : output rows per column. The counter is first tested
 *                      after the second row, so it must be at least 2.
 */
2370 | | void McHorVer02Height9Or17_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, |
2371 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
2372 | | BACKUP_REG; |
2373 | | __asm__ volatile ( |
2374 | | ".set arch=loongson3a \n\t" |
/* Keep the entry values of pSrc/pDst/iHeight so label 3 can restart from them. */
2375 | | "move $12, %[pSrc] \n\t" |
2376 | | "move $13, %[pDst] \n\t" |
2377 | | "move $14, %[iHeight] \n\t" |
2378 | | 
/* iWidth becomes the column count; $10 = 2 * iSrcStride; back pSrc up by two rows. */
2379 | | "dsrl %[iWidth], %[iWidth], 0x3 \n\t" |
2380 | | PTR_ADDU "$10, %[iSrcStride], %[iSrcStride] \n\t" |
2381 | | PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t" |
2382 | | 
/* Column prologue: $f28 = 0 and $f30 = 0x20 are prepared as macro operands,
   then six consecutive source rows are fetched through MMI_LOAD_8P. */
2383 | | "1: \n\t" |
2384 | | "dli $8, 0x20 \n\t" |
2385 | | "xor $f28, $f28, $f28 \n\t" |
2386 | | "dmtc1 $8, $f30 \n\t" |
2387 | | 
2388 | | MMI_LOAD_8P($f0, $f2, $f28, %[pSrc]) |
2389 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2390 | | MMI_LOAD_8P($f4, $f6, $f28, $8) |
2391 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2392 | | MMI_LOAD_8P($f8, $f10, $f28, %[pSrc]) |
2393 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2394 | | MMI_LOAD_8P($f12, $f14, $f28, $8) |
2395 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2396 | | MMI_LOAD_8P($f16, $f18, $f28, %[pSrc]) |
2397 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2398 | | MMI_LOAD_8P($f20, $f22, $f28, $8) |
2399 | | FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
2400 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9) |
/* First row done (iHeight is decremented without a zero test). Fetch the next
   source row and slide the six-row register window down by one row. */
2401 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2402 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2403 | | MMI_LOAD_8P($f24, $f26, $f28, %[pSrc]) |
2404 | | "mov.d $f0, $f4 \n\t" |
2405 | | "mov.d $f2, $f6 \n\t" |
2406 | | "mov.d $f4, $f8 \n\t" |
2407 | | "mov.d $f6, $f10 \n\t" |
2408 | | "mov.d $f8, $f12 \n\t" |
2409 | | "mov.d $f10, $f14 \n\t" |
2410 | | "mov.d $f12, $f16 \n\t" |
2411 | | "mov.d $f14, $f18 \n\t" |
2412 | | "mov.d $f16, $f20 \n\t" |
2413 | | "mov.d $f18, $f22 \n\t" |
2414 | | "mov.d $f20, $f24 \n\t" |
2415 | | "mov.d $f22, $f26 \n\t" |
2416 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2417 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2418 | | 
/* Unrolled steady state: eight filter steps per pass. Each step passes the
   register list rotated by one row (two registers) instead of copying,
   loads one new source row, and leaves through 3f once iHeight reaches zero. */
2419 | | "2: \n\t" |
2420 | | FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
2421 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9) |
/* NOTE(review): reloads $f8 from $9 -- presumably FILTER_HV_W8 parks $f8 there;
   confirm against the macro definition. */
2422 | | "dmtc1 $9, $f8 \n\t" |
2423 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2424 | | "beqz %[iHeight], 3f \n\t" |
2425 | | 
2426 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2427 | | MMI_LOAD_8P($f24, $f26, $f28, %[pSrc]) |
2428 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2429 | | FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, |
2430 | | $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9) |
2431 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2432 | | "beqz %[iHeight], 3f \n\t" |
2433 | | 
2434 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2435 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2436 | | MMI_LOAD_8P($f28, $f30, $f0, $8) |
2437 | | FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, |
2438 | | $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9) |
2439 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2440 | | "beqz %[iHeight], 3f \n\t" |
2441 | | 
2442 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2443 | | MMI_LOAD_8P($f0, $f2, $f4, %[pSrc]) |
2444 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2445 | | FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, |
2446 | | $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9) |
2447 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2448 | | "beqz %[iHeight], 3f \n\t" |
2449 | | 
2450 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2451 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2452 | | MMI_LOAD_8P($f4, $f6, $f8, $8) |
2453 | | FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, |
2454 | | $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9) |
2455 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2456 | | "beqz %[iHeight], 3f \n\t" |
2457 | | 
2458 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2459 | | MMI_LOAD_8P($f8, $f10, $f12, %[pSrc]) |
2460 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2461 | | FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, |
2462 | | $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9) |
2463 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2464 | | "beqz %[iHeight], 3f \n\t" |
2465 | | 
2466 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2467 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2468 | | MMI_LOAD_8P($f12, $f14, $f16, $8) |
2469 | | FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, |
2470 | | $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9) |
2471 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2472 | | "beqz %[iHeight], 3f \n\t" |
2473 | | 
2474 | | PTR_ADDU "%[pSrc], %[pSrc], $10 \n\t" |
2475 | | MMI_LOAD_8P($f16, $f18, $f20, %[pSrc]) |
2476 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2477 | | FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, |
2478 | | $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9) |
2479 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2480 | | "beqz %[iHeight], 3f \n\t" |
2481 | | 
2482 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2483 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
2484 | | MMI_LOAD_8P($f20, $f22, $f24, $8) |
2485 | | "j 2b \n\t" |
2486 | | 
/* Column finished: stop when no columns remain; otherwise restore the saved
   pointers and height, redo the two-row back-up and move 8 pixels right. */
2487 | | "3: \n\t" |
2488 | | PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" |
2489 | | "beqz %[iWidth], 4f \n\t" |
2490 | | 
2491 | | "move %[pSrc], $12 \n\t" |
2492 | | "move %[pDst], $13 \n\t" |
2493 | | "move %[iHeight], $14 \n\t" |
2494 | | PTR_SUBU "%[pSrc], %[pSrc], $10 \n\t" |
2495 | | PTR_ADDIU "%[pSrc], %[pSrc], 0x8 \n\t" |
2496 | | PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t" |
2497 | | "j 1b \n\t" |
2498 | | "4: \n\t" |
2499 | | : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst), |
2500 | | [iWidth]"+&r"(iWidth), [iHeight]"+&r"(iHeight) |
2501 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
2502 | | : "memory", "$8", "$9", "$10", "$12", "$13", "$14", "$f0", "$f2", "$f4", |
2503 | | "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", "$f22", |
2504 | | "$f24", "$f26", "$f28", "$f30" |
2505 | | ); |
2506 | | RECOVER_REG; |
2507 | | } |
2508 | | |
2509 | | //vertical filter to gain half sample, that is (0, 2) location in quarter sample |
2510 | | static inline void McHorVer02Height5Or9Or17_mmi(const uint8_t* pSrc, int32_t iSrcStride, |
2511 | | uint8_t* pDst, int32_t iDstStride, |
2512 | | int32_t iWidth, int32_t iHeight) { |
2513 | | if (iWidth == 16 || iWidth == 8) |
2514 | | McHorVer02Height9Or17_mmi(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight ); |
2515 | | else |
2516 | | McHorVer02Height5_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
2517 | | } |
2518 | | |
/*
 * McHorVer22HorFirst_mmi -- horizontal first pass (Loongson MMI inline asm).
 * For every row it writes 16-bit sums
 *     out[x] = (p[x] + p[x+5]) - 5 * (p[x+1] + p[x+4]) + 20 * (p[x+2] + p[x+3])
 * (no rounding, no shift) to pTap, where p[] are the source bytes of that row
 * starting at pSrc.
 *
 *   pSrc, iSrcStride : source pixels and row pitch; processing starts two rows
 *                      above the pSrc that is passed in.
 *   pTap, iTapStride : output buffer addressed as bytes (two bytes per output)
 *                      and its row pitch in bytes.
 *   iWidth           : 9 selects loop 1 (9 outputs per row); any other value
 *                      takes loop 2 (17 outputs per row).
 *   iHeight          : rows to process; tested after each row, so at least 1.
 *
 * NOTE(review): loop 2 stores with gssqc1, which presumably needs pTap to be
 * 16-byte aligned -- confirm against the Loongson instruction reference.
 */
2519 | | static inline void McHorVer22HorFirst_mmi(const uint8_t *pSrc, int32_t iSrcStride, |
2520 | | uint8_t * pTap, int32_t iTapStride, |
2521 | | int32_t iWidth, int32_t iHeight) { |
2522 | | BACKUP_REG; |
2523 | | __asm__ volatile ( |
2524 | | ".set arch=loongson3a \n\t" |
/* Back up two rows, then pick the loop: iWidth == 9 falls into 1, else jump to 2. */
2525 | | "dli $8, 0x9 \n\t" |
2526 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2527 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2528 | | "bne %[iWidth], $8, 2f \n\t" |
2529 | | 
/* Loop 1: load the 8-byte groups at byte offsets 0, 5, 1, 4, 2, 3 and widen
   each to 16 bit ($f28 is the zero operand for the unpack). */
2530 | | "1: \n\t" |
2531 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
2532 | | "xor $f28, $f28, $f28 \n\t" |
2533 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
2534 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2535 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
2536 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2537 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
2538 | | "punpckhbh $f6, $f4, $f28 \n\t" |
2539 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
2540 | | "punpcklbh $f4, $f4, $f28 \n\t" |
2541 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
2542 | | "punpckhbh $f10, $f8, $f28 \n\t" |
2543 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
2544 | | "punpcklbh $f8, $f8, $f28 \n\t" |
2545 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
2546 | | "punpckhbh $f14, $f12, $f28 \n\t" |
2547 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
2548 | | "punpcklbh $f12, $f12, $f28 \n\t" |
2549 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
2550 | | "punpckhbh $f18, $f16, $f28 \n\t" |
2551 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
2552 | | "punpcklbh $f16, $f16, $f28 \n\t" |
2553 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
2554 | | "punpckhbh $f22, $f20, $f28 \n\t" |
2555 | | "punpcklbh $f20, $f20, $f28 \n\t" |
2556 | | 
/* t = 4 * (p2 + p3) - (p1 + p4); result = (p0 + p5) + t + 4 * t.
   $f12 is borrowed as the shift count (2); its contents are parked in $9. */
2557 | | "mov.d $f28, $f8 \n\t" |
2558 | | "mov.d $f30, $f10 \n\t" |
2559 | | "paddh $f28, $f28, $f12 \n\t" |
2560 | | "paddh $f30, $f30, $f14 \n\t" |
2561 | | "mov.d $f24, $f16 \n\t" |
2562 | | "mov.d $f26, $f18 \n\t" |
2563 | | "paddh $f24, $f24, $f20 \n\t" |
2564 | | "paddh $f26, $f26, $f22 \n\t" |
2565 | | "dli $8, 0x2 \n\t" |
2566 | | "dmfc1 $9, $f12 \n\t" |
2567 | | "dmtc1 $8, $f12 \n\t" |
2568 | | "psllh $f24, $f24, $f12 \n\t" |
2569 | | "psllh $f26, $f26, $f12 \n\t" |
2570 | | "psubh $f24, $f24, $f28 \n\t" |
2571 | | "psubh $f26, $f26, $f30 \n\t" |
2572 | | "paddh $f0, $f0, $f4 \n\t" |
2573 | | "paddh $f2, $f2, $f6 \n\t" |
2574 | | "paddh $f0, $f0, $f24 \n\t" |
2575 | | "paddh $f2, $f2, $f26 \n\t" |
2576 | | "psllh $f24, $f24, $f12 \n\t" |
2577 | | "psllh $f26, $f26, $f12 \n\t" |
2578 | | "paddh $f0, $f0, $f24 \n\t" |
2579 | | "paddh $f2, $f2, $f26 \n\t" |
/* Only a 32-bit word of $f0 (outputs 0 and 1) is stored, to pTap bytes 0..3. */
2580 | | "gsswlc1 $f0, 0x3(%[pTap]) \n\t" |
2581 | | "gsswrc1 $f0, 0x0(%[pTap]) \n\t" |
2582 | | 
/* Same filter shifted one pixel right: the group at offset 6 joins the ones
   already loaded, $f12 gets its parked value back, and outputs 1..8 are
   written to pTap bytes 2..17. */
2583 | | "gsldlc1 $f0, 0xd(%[pSrc]) \n\t" |
2584 | | "xor $f28, $f28, $f28 \n\t" |
2585 | | "gsldrc1 $f0, 0x6(%[pSrc]) \n\t" |
2586 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2587 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2588 | | "dli $8, 0x2 \n\t" |
2589 | | "dmtc1 $9, $f12 \n\t" |
2590 | | "dmtc1 $8, $f24 \n\t" |
2591 | | 
2592 | | "paddh $f16, $f16, $f4 \n\t" |
2593 | | "paddh $f18, $f18, $f6 \n\t" |
2594 | | "paddh $f20, $f20, $f12 \n\t" |
2595 | | "paddh $f22, $f22, $f14 \n\t" |
2596 | | "psllh $f20, $f20, $f24 \n\t" |
2597 | | "psllh $f22, $f22, $f24 \n\t" |
2598 | | "psubh $f20, $f20, $f16 \n\t" |
2599 | | "psubh $f22, $f22, $f18 \n\t" |
2600 | | "paddh $f8, $f8, $f0 \n\t" |
2601 | | "paddh $f10, $f10, $f2 \n\t" |
2602 | | "paddh $f8, $f8, $f20 \n\t" |
2603 | | "paddh $f10, $f10, $f22 \n\t" |
2604 | | "psllh $f20, $f20, $f24 \n\t" |
2605 | | "psllh $f22, $f22, $f24 \n\t" |
2606 | | "paddh $f8, $f8, $f20 \n\t" |
2607 | | "paddh $f10, $f10, $f22 \n\t" |
2608 | | "gssdlc1 $f8, 0x9(%[pTap]) \n\t" |
2609 | | "gssdlc1 $f10, 0x11(%[pTap]) \n\t" |
2610 | | "gssdrc1 $f8, 0x2(%[pTap]) \n\t" |
2611 | | "gssdrc1 $f10, 0xa(%[pTap]) \n\t" |
2612 | | 
/* Next row; leave through 3 when all rows are done. */
2613 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2614 | | PTR_ADDU "%[pTap], %[pTap], %[iTapStride] \n\t" |
2615 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2616 | | "bnez %[iHeight], 1b \n\t" |
2617 | | "j 3f \n\t" |
2618 | | 
/* Loop 2 (17 outputs per row). Part one: outputs 0..7 from the groups at
   offsets 0, 5, 1, 4, 2, 3, stored with a single 16-byte gssqc1 at pTap.
   The shift count lives in $f30 here, so $f28 stays zero for part two. */
2619 | | "2: \n\t" |
2620 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
2621 | | "xor $f28, $f28, $f28 \n\t" |
2622 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
2623 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2624 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
2625 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2626 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
2627 | | "punpckhbh $f6, $f4, $f28 \n\t" |
2628 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
2629 | | "punpcklbh $f4, $f4, $f28 \n\t" |
2630 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
2631 | | "punpckhbh $f10, $f8, $f28 \n\t" |
2632 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
2633 | | "punpcklbh $f8, $f8, $f28 \n\t" |
2634 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
2635 | | "punpckhbh $f14, $f12, $f28 \n\t" |
2636 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
2637 | | "punpcklbh $f12, $f12, $f28 \n\t" |
2638 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
2639 | | "punpckhbh $f18, $f16, $f28 \n\t" |
2640 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
2641 | | "punpcklbh $f16, $f16, $f28 \n\t" |
2642 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
2643 | | "punpckhbh $f22, $f20, $f28 \n\t" |
2644 | | "dli $8, 0x2 \n\t" |
2645 | | "punpcklbh $f20, $f20, $f28 \n\t" |
2646 | | 
2647 | | "dmtc1 $8, $f30 \n\t" |
2648 | | "paddh $f8, $f8, $f12 \n\t" |
2649 | | "paddh $f10, $f10, $f14 \n\t" |
2650 | | "paddh $f16, $f16, $f20 \n\t" |
2651 | | "paddh $f18, $f18, $f22 \n\t" |
2652 | | "psllh $f16, $f16, $f30 \n\t" |
2653 | | "psllh $f18, $f18, $f30 \n\t" |
2654 | | "psubh $f16, $f16, $f8 \n\t" |
2655 | | "psubh $f18, $f18, $f10 \n\t" |
2656 | | "paddh $f0, $f0, $f4 \n\t" |
2657 | | "paddh $f2, $f2, $f6 \n\t" |
2658 | | "paddh $f0, $f0, $f16 \n\t" |
2659 | | "paddh $f2, $f2, $f18 \n\t" |
2660 | | "psllh $f16, $f16, $f30 \n\t" |
2661 | | "psllh $f18, $f18, $f30 \n\t" |
2662 | | "paddh $f0, $f0, $f16 \n\t" |
2663 | | "paddh $f2, $f2, $f18 \n\t" |
2664 | | "gssqc1 $f2, $f0, 0x0(%[pTap]) \n\t" |
2665 | | 
/* Part two: the same six groups shifted by 8 bytes (offsets 8, 13, 9, 12, 10, 11). */
2666 | | "gsldlc1 $f0, 15(%[pSrc]) \n\t" |
2667 | | "gsldrc1 $f0, 8(%[pSrc]) \n\t" |
2668 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2669 | | "gsldlc1 $f4, 0x14(%[pSrc]) \n\t" |
2670 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2671 | | "gsldrc1 $f4, 0xd(%[pSrc]) \n\t" |
2672 | | "punpckhbh $f6, $f4, $f28 \n\t" |
2673 | | "gsldlc1 $f8, 0x10(%[pSrc]) \n\t" |
2674 | | "punpcklbh $f4, $f4, $f28 \n\t" |
2675 | | "gsldrc1 $f8, 0x9(%[pSrc]) \n\t" |
2676 | | "punpckhbh $f10, $f8, $f28 \n\t" |
2677 | | "gsldlc1 $f12, 0x13(%[pSrc]) \n\t" |
2678 | | "punpcklbh $f8, $f8, $f28 \n\t" |
2679 | | "gsldrc1 $f12, 0xc(%[pSrc]) \n\t" |
2680 | | "punpckhbh $f14, $f12, $f28 \n\t" |
2681 | | "gsldlc1 $f16, 0x11(%[pSrc]) \n\t" |
2682 | | "punpcklbh $f12, $f12, $f28 \n\t" |
2683 | | "gsldrc1 $f16, 0xa(%[pSrc]) \n\t" |
2684 | | "punpckhbh $f18, $f16, $f28 \n\t" |
2685 | | "gsldlc1 $f20, 0x12(%[pSrc]) \n\t" |
2686 | | "punpcklbh $f16, $f16, $f28 \n\t" |
2687 | | "gsldrc1 $f20, 0xb(%[pSrc]) \n\t" |
2688 | | "punpckhbh $f22, $f20, $f28 \n\t" |
2689 | | "punpcklbh $f20, $f20, $f28 \n\t" |
2690 | | 
2691 | | "mov.d $f28, $f8 \n\t" |
2692 | | "mov.d $f30, $f10 \n\t" |
2693 | | "paddh $f28, $f28, $f12 \n\t" |
2694 | | "paddh $f30, $f30, $f14 \n\t" |
2695 | | "mov.d $f24, $f16 \n\t" |
2696 | | "mov.d $f26, $f18 \n\t" |
2697 | | "dli $8, 0x2 \n\t" |
2698 | | "paddh $f24, $f24, $f20 \n\t" |
2699 | | "paddh $f26, $f26, $f22 \n\t" |
2700 | | "dmfc1 $9, $f12 \n\t" |
2701 | | "dmtc1 $8, $f12 \n\t" |
2702 | | "psllh $f24, $f24, $f12 \n\t" |
2703 | | "psllh $f26, $f26, $f12 \n\t" |
2704 | | "psubh $f24, $f24, $f28 \n\t" |
2705 | | "psubh $f26, $f26, $f30 \n\t" |
2706 | | "paddh $f0, $f0, $f4 \n\t" |
2707 | | "paddh $f2, $f2, $f6 \n\t" |
2708 | | "paddh $f0, $f0, $f24 \n\t" |
2709 | | "paddh $f2, $f2, $f26 \n\t" |
2710 | | "psllh $f24, $f24, $f12 \n\t" |
2711 | | "psllh $f26, $f26, $f12 \n\t" |
2712 | | "paddh $f0, $f0, $f24 \n\t" |
2713 | | "paddh $f2, $f2, $f26 \n\t" |
/* Only a 32-bit word (outputs 8 and 9) is stored, to pTap bytes 0x10..0x13. */
2714 | | "gsswlc1 $f0, 0x13(%[pTap]) \n\t" |
2715 | | "gsswrc1 $f0, 0x10(%[pTap]) \n\t" |
2716 | | 
/* Part three: add the group at offset 0xE and write outputs 9..16 to pTap
   bytes 0x12..0x21. */
2717 | | "gsldlc1 $f0, 0x15(%[pSrc]) \n\t" |
2718 | | "xor $f28, $f28, $f28 \n\t" |
2719 | | "gsldrc1 $f0, 0xE(%[pSrc]) \n\t" |
2720 | | "punpckhbh $f2, $f0, $f28 \n\t" |
2721 | | "punpcklbh $f0, $f0, $f28 \n\t" |
2722 | | "dli $8, 0x2 \n\t" |
2723 | | "dmtc1 $9, $f12 \n\t" |
2724 | | "dmtc1 $8, $f24 \n\t" |
2725 | | 
2726 | | "paddh $f16, $f16, $f4 \n\t" |
2727 | | "paddh $f18, $f18, $f6 \n\t" |
2728 | | "paddh $f20, $f20, $f12 \n\t" |
2729 | | "paddh $f22, $f22, $f14 \n\t" |
2730 | | "psllh $f20, $f20, $f24 \n\t" |
2731 | | "psllh $f22, $f22, $f24 \n\t" |
2732 | | "psubh $f20, $f20, $f16 \n\t" |
2733 | | "psubh $f22, $f22, $f18 \n\t" |
2734 | | "paddh $f8, $f8, $f0 \n\t" |
2735 | | "paddh $f10, $f10, $f2 \n\t" |
2736 | | "paddh $f8, $f8, $f20 \n\t" |
2737 | | "paddh $f10, $f10, $f22 \n\t" |
2738 | | "psllh $f20, $f20, $f24 \n\t" |
2739 | | "psllh $f22, $f22, $f24 \n\t" |
2740 | | "paddh $f8, $f8, $f20 \n\t" |
2741 | | "paddh $f10, $f10, $f22 \n\t" |
2742 | | "gssdlc1 $f8, 0x19(%[pTap]) \n\t" |
2743 | | "gssdlc1 $f10, 0x21(%[pTap]) \n\t" |
2744 | | "gssdrc1 $f8, 0x12(%[pTap]) \n\t" |
2745 | | "gssdrc1 $f10, 0x1a(%[pTap]) \n\t" |
2746 | | 
2747 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
2748 | | PTR_ADDU "%[pTap], %[pTap], %[iTapStride] \n\t" |
2749 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2750 | | "bnez %[iHeight], 2b \n\t" |
2751 | | "3: \n\t" |
2752 | | : [pSrc]"+&r"(pSrc), [pTap]"+&r"(pTap), [iWidth]"+&r"(iWidth), |
2753 | | [iHeight]"+&r"(iHeight) |
2754 | | : [iSrcStride]"r"(iSrcStride), [iTapStride]"r"(iTapStride) |
2755 | | : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
2756 | | "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
2757 | | ); |
2758 | | RECOVER_REG; |
2759 | | } |
2760 | | |
/*
 * McHorVer22Width8VerLastAlign_mmi -- vertical last pass over the tap buffer
 * (Loongson MMI inline asm), processed in columns of 8 outputs. Rows are
 * fetched with 16-byte gslqc1 loads; the arithmetic and the store to pDst are
 * done by the FILTER_VER_ALIGN macro, which is defined outside this function.
 *
 *   pTap, iTapStride : tap buffer addressed as bytes and its row pitch in
 *                      bytes; a column step is 0x10 bytes. NOTE(review):
 *                      gslqc1 presumably requires 16-byte aligned addresses --
 *                      confirm against the Loongson instruction reference.
 *   pDst, iDstStride : destination pixels and row pitch; a column step is 8.
 *   iWidth           : shifted right by 3 to obtain the number of columns. The
 *                      saved start pointers are never advanced, so a third
 *                      column would repeat the second one.
 *   iHeight          : output rows per column. The counter is first tested
 *                      after the second row, so it must be at least 2.
 */
2761 | | static inline void McHorVer22Width8VerLastAlign_mmi(const uint8_t *pTap, |
2762 | | int32_t iTapStride, uint8_t * pDst, int32_t iDstStride, |
2763 | | int32_t iWidth, int32_t iHeight) { |
2764 | | BACKUP_REG; |
2765 | | __asm__ volatile ( |
2766 | | ".set arch=loongson3a \n\t" |
/* Save the entry values for the column restart; $13 = 2 * iTapStride,
   $14 = 2 * iDstStride, $15 = 0x20 replicated in four 16-bit lanes (macro operand). */
2767 | | "move $10, %[pTap] \n\t" |
2768 | | "move $11, %[pDst] \n\t" |
2769 | | "move $12, %[iHeight] \n\t" |
2770 | | "dsrl %[iWidth], 0x3 \n\t" |
2771 | | PTR_ADDU "$13, %[iTapStride], %[iTapStride] \n\t" |
2772 | | PTR_ADDU "$14, %[iDstStride], %[iDstStride] \n\t" |
2773 | | "dli $15, 0x0020002000200020 \n\t" |
2774 | | 
/* Column prologue: load six consecutive tap rows into $f0..$f22. */
2775 | | "4: \n\t" |
2776 | | "gslqc1 $f2, $f0, 0x0(%[pTap]) \n\t" |
2777 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2778 | | "gslqc1 $f6, $f4, 0x0($8) \n\t" |
2779 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2780 | | "gslqc1 $f10, $f8, 0x0(%[pTap]) \n\t" |
2781 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2782 | | "gslqc1 $f14, $f12, 0x0($8) \n\t" |
2783 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2784 | | "gslqc1 $f18, $f16, 0x0(%[pTap]) \n\t" |
2785 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2786 | | "gslqc1 $f22, $f20, 0x0($8) \n\t" |
2787 | | 
2788 | | FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
2789 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15) |
2790 | | 
/* First row done (iHeight is decremented without a zero test). Fetch the next
   tap row and slide the six-row register window down by one row. */
2791 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2792 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2793 | | "gslqc1 $f26, $f24, 0x0(%[pTap]) \n\t" |
2794 | | "mov.d $f0, $f4 \n\t" |
2795 | | "mov.d $f2, $f6 \n\t" |
2796 | | "mov.d $f4, $f8 \n\t" |
2797 | | "mov.d $f6, $f10 \n\t" |
2798 | | "mov.d $f8, $f12 \n\t" |
2799 | | "mov.d $f10, $f14 \n\t" |
2800 | | "mov.d $f12, $f16 \n\t" |
2801 | | "mov.d $f14, $f18 \n\t" |
2802 | | "mov.d $f16, $f20 \n\t" |
2803 | | "mov.d $f18, $f22 \n\t" |
2804 | | "mov.d $f20, $f24 \n\t" |
2805 | | "mov.d $f22, $f26 \n\t" |
2806 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2807 | | PTR_SUBU "%[pTap], %[pTap], %[iTapStride] \n\t" |
2808 | | 
/* Unrolled steady state: eight filter steps per pass with the register list
   rotated by one row per step. Steps alternate the extra operand between $0
   and iDstStride (presumably an offset added to pDst for the store -- confirm
   against the macro), and pDst advances by 2 * iDstStride after every second step. */
2809 | | "5: \n\t" |
2810 | | FILTER_VER_ALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
2811 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $0, $8, $9, $15) |
2812 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2813 | | "beqz %[iHeight], 6f \n\t" |
2814 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2815 | | "gslqc1 $f26, $f24, 0x0(%[pTap]) \n\t" |
2816 | | 
2817 | | FILTER_VER_ALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, |
2818 | | $f26, $f28, $f30, $f0, $f2, %[pDst], %[iDstStride], $8, $9, $15) |
2819 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2820 | | "beqz %[iHeight], 6f \n\t" |
2821 | | PTR_ADDU "%[pDst], %[pDst], $14 \n\t" |
2822 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2823 | | "gslqc1 $f30, $f28, 0x0($8) \n\t" |
2824 | | 
2825 | | FILTER_VER_ALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, |
2826 | | $f30, $f0, $f2, $f4, $f6, %[pDst], $0, $8, $9, $15) |
2827 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2828 | | "beqz %[iHeight], 6f \n\t" |
2829 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2830 | | "gslqc1 $f2, $f0, 0x0(%[pTap]) \n\t" |
2831 | | 
2832 | | FILTER_VER_ALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, |
2833 | | $f2, $f4, $f6, $f8, $f10, %[pDst], %[iDstStride], $8, $9, $15) |
2834 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2835 | | "beqz %[iHeight], 6f \n\t" |
2836 | | PTR_ADDU "%[pDst], %[pDst], $14 \n\t" |
2837 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2838 | | "gslqc1 $f6, $f4, 0x0($8) \n\t" |
2839 | | 
2840 | | FILTER_VER_ALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, |
2841 | | $f6, $f8, $f10, $f12, $f14, %[pDst], $0, $8, $9, $15) |
2842 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2843 | | "beqz %[iHeight], 6f \n\t" |
2844 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2845 | | "gslqc1 $f10, $f8, 0x0(%[pTap]) \n\t" |
2846 | | 
2847 | | FILTER_VER_ALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, |
2848 | | $f10, $f12, $f14, $f16, $f18, %[pDst], %[iDstStride], $8, $9, $15) |
2849 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2850 | | "beqz %[iHeight], 6f \n\t" |
2851 | | PTR_ADDU "%[pDst], %[pDst], $14 \n\t" |
2852 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2853 | | "gslqc1 $f14, $f12, 0x0($8) \n\t" |
2854 | | 
2855 | | FILTER_VER_ALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, |
2856 | | $f14, $f16, $f18, $f20, $f22, %[pDst], $0, $8, $9, $15) |
2857 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2858 | | "beqz %[iHeight], 6f \n\t" |
2859 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2860 | | "gslqc1 $f18, $f16, 0x0(%[pTap]) \n\t" |
2861 | | 
2862 | | FILTER_VER_ALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, |
2863 | | $f18, $f20, $f22, $f24, $f26, %[pDst], %[iDstStride], $8, $9, $15) |
2864 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2865 | | "beqz %[iHeight], 6f \n\t" |
2866 | | PTR_ADDU "%[pDst], %[pDst], $14 \n\t" |
2867 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2868 | | "gslqc1 $f22, $f20, 0x0($8) \n\t" |
2869 | | "j 5b \n\t" |
2870 | | 
/* Column finished: stop when no columns remain; otherwise restore the saved
   pointers and height and step 0x10 bytes in pTap / 8 bytes in pDst. */
2871 | | "6: \n\t" |
2872 | | PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" |
2873 | | "beqz %[iWidth], 7f \n\t" |
2874 | | "move %[pTap], $10 \n\t" |
2875 | | "move %[pDst], $11 \n\t" |
2876 | | "move %[iHeight], $12 \n\t" |
2877 | | PTR_ADDIU "%[pTap], %[pTap], 0x10 \n\t" |
2878 | | PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t" |
2879 | | "j 4b \n\t" |
2880 | | "7: \n\t" |
2881 | | : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst), |
2882 | | [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) |
2883 | | : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride) |
2884 | | : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$f0", |
2885 | | "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", |
2886 | | "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
2887 | | ); |
2888 | | RECOVER_REG; |
2889 | | } |
2890 | | |
/*
 * McHorVer22Width8VerLastUnAlign_mmi -- vertical last pass over the tap buffer
 * (Loongson MMI inline asm), processed in columns of 8 outputs. Same structure
 * as the aligned variant, but every 16-byte tap row is fetched with two
 * gsldlc1/gsldrc1 pairs (8 bytes each) instead of one gslqc1. The arithmetic
 * and the store to pDst are done by the FILTER_VER_UNALIGN macro, which is
 * defined outside this function.
 *
 *   pTap, iTapStride : tap buffer addressed as bytes and its row pitch in
 *                      bytes; a column step is 0x10 bytes.
 *   pDst, iDstStride : destination pixels and row pitch; pDst advances by
 *                      iDstStride after every filter step, a column step is 8.
 *   iWidth           : shifted right by 3 to obtain the number of columns. The
 *                      saved start pointers are never advanced, so a third
 *                      column would repeat the second one.
 *   iHeight          : output rows per column. The counter is first tested
 *                      after the second row, so it must be at least 2.
 */
2891 | | static inline void McHorVer22Width8VerLastUnAlign_mmi(const uint8_t *pTap, |
2892 | | int32_t iTapStride, uint8_t * pDst, int32_t iDstStride, |
2893 | | int32_t iWidth, int32_t iHeight) { |
2894 | | BACKUP_REG; |
2895 | | __asm__ volatile ( |
2896 | | ".set arch=loongson3a \n\t" |
/* Save the entry values for the column restart; $13 = 2 * iTapStride,
   $14 = 0x20 replicated in four 16-bit lanes (macro operand). */
2897 | | "move $10, %[pTap] \n\t" |
2898 | | "move $11, %[pDst] \n\t" |
2899 | | "move $12, %[iHeight] \n\t" |
2900 | | "dsrl %[iWidth], 0x3 \n\t" |
2901 | | PTR_ADDU "$13, %[iTapStride], %[iTapStride] \n\t" |
2902 | | "dli $14, 0x0020002000200020 \n\t" |
2903 | | 
/* Column prologue: load six consecutive tap rows (two rows per group of
   loads, $8 addressing the odd row) into $f0..$f22. */
2904 | | "4: \n\t" |
2905 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2906 | | "gsldlc1 $f0, 0x7(%[pTap]) \n\t" |
2907 | | "gsldlc1 $f2, 0xF(%[pTap]) \n\t" |
2908 | | "gsldlc1 $f4, 0x7($8) \n\t" |
2909 | | "gsldlc1 $f6, 0xF($8) \n\t" |
2910 | | "gsldrc1 $f0, 0x0(%[pTap]) \n\t" |
2911 | | "gsldrc1 $f2, 0x8(%[pTap]) \n\t" |
2912 | | "gsldrc1 $f4, 0x0($8) \n\t" |
2913 | | "gsldrc1 $f6, 0x8($8) \n\t" |
2914 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2915 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2916 | | "gsldlc1 $f8, 0x7(%[pTap]) \n\t" |
2917 | | "gsldlc1 $f10, 0xF(%[pTap]) \n\t" |
2918 | | "gsldlc1 $f12, 0x7($8) \n\t" |
2919 | | "gsldlc1 $f14, 0xF($8) \n\t" |
2920 | | "gsldrc1 $f8, 0x0(%[pTap]) \n\t" |
2921 | | "gsldrc1 $f10, 0x8(%[pTap]) \n\t" |
2922 | | "gsldrc1 $f12, 0x0($8) \n\t" |
2923 | | "gsldrc1 $f14, 0x8($8) \n\t" |
2924 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2925 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2926 | | "gsldlc1 $f16, 0x7(%[pTap]) \n\t" |
2927 | | "gsldlc1 $f18, 0xF(%[pTap]) \n\t" |
2928 | | "gsldlc1 $f20, 0x7($8) \n\t" |
2929 | | "gsldlc1 $f22, 0xF($8) \n\t" |
2930 | | "gsldrc1 $f16, 0x0(%[pTap]) \n\t" |
2931 | | "gsldrc1 $f18, 0x8(%[pTap]) \n\t" |
2932 | | "gsldrc1 $f20, 0x0($8) \n\t" |
2933 | | "gsldrc1 $f22, 0x8($8) \n\t" |
2934 | | 
2935 | | FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, |
2936 | | $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14) |
/* First row done (iHeight is decremented without a zero test). Fetch the next
   tap row and slide the six-row register window down by one row. */
2937 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2938 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2939 | | "gsldlc1 $f24, 0x7(%[pTap]) \n\t" |
2940 | | "gsldlc1 $f26, 0xF(%[pTap]) \n\t" |
2941 | | "gsldrc1 $f24, 0x0(%[pTap]) \n\t" |
2942 | | "gsldrc1 $f26, 0x8(%[pTap]) \n\t" |
2943 | | "mov.d $f0, $f4 \n\t" |
2944 | | "mov.d $f2, $f6 \n\t" |
2945 | | "mov.d $f4, $f8 \n\t" |
2946 | | "mov.d $f6, $f10 \n\t" |
2947 | | "mov.d $f8, $f12 \n\t" |
2948 | | "mov.d $f10, $f14 \n\t" |
2949 | | "mov.d $f12, $f16 \n\t" |
2950 | | "mov.d $f14, $f18 \n\t" |
2951 | | "mov.d $f16, $f20 \n\t" |
2952 | | "mov.d $f18, $f22 \n\t" |
2953 | | "mov.d $f20, $f24 \n\t" |
2954 | | "mov.d $f22, $f26 \n\t" |
2955 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2956 | | PTR_SUBU "%[pTap], %[pTap], %[iTapStride] \n\t" |
2957 | | 
/* Unrolled steady state: eight filter steps per pass with the register list
   rotated by one row per step; each step loads one new tap row, advances pDst
   by one row and leaves through 6f once iHeight reaches zero. */
2958 | | "5: \n\t" |
2959 | | FILTER_VER_UNALIGN($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, |
2960 | | $f20, $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9, $14) |
2961 | | 
2962 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2963 | | "beqz %[iHeight], 6f \n\t" |
2964 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2965 | | "gsldlc1 $f24, 0x7(%[pTap]) \n\t" |
2966 | | "gsldlc1 $f26, 0xF(%[pTap]) \n\t" |
2967 | | "gsldrc1 $f24, 0x0(%[pTap]) \n\t" |
2968 | | "gsldrc1 $f26, 0x8(%[pTap]) \n\t" |
2969 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2970 | | 
2971 | | FILTER_VER_UNALIGN($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, |
2972 | | $f24, $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9, $14) |
2973 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2974 | | "beqz %[iHeight], 6f \n\t" |
2975 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2976 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2977 | | "gsldlc1 $f28, 0x7($8) \n\t" |
2978 | | "gsldlc1 $f30, 0xF($8) \n\t" |
2979 | | "gsldrc1 $f28, 0x0($8) \n\t" |
2980 | | "gsldrc1 $f30, 0x8($8) \n\t" |
2981 | | 
2982 | | FILTER_VER_UNALIGN($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, |
2983 | | $f28, $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9, $14) |
2984 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2985 | | "beqz %[iHeight], 6f \n\t" |
2986 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
2987 | | "gsldlc1 $f0, 0x7(%[pTap]) \n\t" |
2988 | | "gsldlc1 $f2, 0xF(%[pTap]) \n\t" |
2989 | | "gsldrc1 $f0, 0x0(%[pTap]) \n\t" |
2990 | | "gsldrc1 $f2, 0x8(%[pTap]) \n\t" |
2991 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2992 | | 
2993 | | FILTER_VER_UNALIGN($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, |
2994 | | $f30, $f0, $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9, $14) |
2995 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
2996 | | "beqz %[iHeight], 6f \n\t" |
2997 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
2998 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
2999 | | "gsldlc1 $f4, 0x7($8) \n\t" |
3000 | | "gsldlc1 $f6, 0xF($8) \n\t" |
3001 | | "gsldrc1 $f4, 0x0($8) \n\t" |
3002 | | "gsldrc1 $f6, 0x8($8) \n\t" |
3003 | | 
3004 | | FILTER_VER_UNALIGN($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, |
3005 | | $f4, $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9, $14) |
3006 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3007 | | "beqz %[iHeight], 6f \n\t" |
3008 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
3009 | | "gsldlc1 $f8, 0x7(%[pTap]) \n\t" |
3010 | | "gsldlc1 $f10, 0xF(%[pTap]) \n\t" |
3011 | | "gsldrc1 $f8, 0x0(%[pTap]) \n\t" |
3012 | | "gsldrc1 $f10, 0x8(%[pTap]) \n\t" |
3013 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3014 | | 
3015 | | FILTER_VER_UNALIGN($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, |
3016 | | $f8, $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9, $14) |
3017 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3018 | | "beqz %[iHeight], 6f \n\t" |
3019 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3020 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
3021 | | "gsldlc1 $f12, 0x7($8) \n\t" |
3022 | | "gsldlc1 $f14, 0xF($8) \n\t" |
3023 | | "gsldrc1 $f12, 0x0($8) \n\t" |
3024 | | "gsldrc1 $f14, 0x8($8) \n\t" |
3025 | | 
3026 | | FILTER_VER_UNALIGN($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, |
3027 | | $f12, $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9, $14) |
3028 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3029 | | "beqz %[iHeight], 6f \n\t" |
3030 | | PTR_ADDU "%[pTap], %[pTap], $13 \n\t" |
3031 | | "gsldlc1 $f16, 0x7(%[pTap]) \n\t" |
3032 | | "gsldlc1 $f18, 0xF(%[pTap]) \n\t" |
3033 | | "gsldrc1 $f16, 0x0(%[pTap]) \n\t" |
3034 | | "gsldrc1 $f18, 0x8(%[pTap]) \n\t" |
3035 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3036 | | 
3037 | | FILTER_VER_UNALIGN($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, |
3038 | | $f16, $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9, $14) |
3039 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3040 | | "beqz %[iHeight], 6f \n\t" |
3041 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3042 | | PTR_ADDU "$8, %[pTap], %[iTapStride] \n\t" |
3043 | | "gsldlc1 $f20, 0x7($8) \n\t" |
3044 | | "gsldlc1 $f22, 0xF($8) \n\t" |
3045 | | "gsldrc1 $f20, 0x0($8) \n\t" |
3046 | | "gsldrc1 $f22, 0x8($8) \n\t" |
3047 | | "j 5b \n\t" |
3048 | | 
/* Column finished: stop when no columns remain; otherwise restore the saved
   pointers and height and step 0x10 bytes in pTap / 8 bytes in pDst. */
3049 | | "6: \n\t" |
3050 | | PTR_ADDIU "%[iWidth], %[iWidth], -0x1 \n\t" |
3051 | | "beqz %[iWidth], 7f \n\t" |
3052 | | "move %[pTap], $10 \n\t" |
3053 | | "move %[pDst], $11 \n\t" |
3054 | | "move %[iHeight], $12 \n\t" |
3055 | | PTR_ADDIU "%[pTap], %[pTap], 0x10 \n\t" |
3056 | | PTR_ADDIU "%[pDst], %[pDst], 0x8 \n\t" |
3057 | | "j 4b \n\t" |
3058 | | 
3059 | | "7: \n\t" |
3060 | | : [pTap]"+&r"((unsigned char *)pTap), [pDst]"+&r"((unsigned char *)pDst), |
3061 | | [iWidth]"+&r"((int)iWidth), [iHeight]"+&r"((int)iHeight) |
3062 | | : [iTapStride]"r"((int)iTapStride), [iDstStride]"r"((int)iDstStride) |
3063 | | : "memory", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "$f2", |
3064 | | "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", "$f16", "$f18", "$f20", |
3065 | | "$f22", "$f24", "$f26", "$f28", "$f30" |
3066 | | ); |
3067 | | RECOVER_REG; |
3068 | | } |
3069 | | |
3070 | | //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample |
3071 | | static inline void McHorVer22Width5Or9Or17Height5Or9Or17_mmi(const uint8_t* pSrc, |
3072 | | int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
3073 | | int32_t iWidth, int32_t iHeight) { |
3074 | | ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16) |
3075 | | |
3076 | | if (iWidth == 17 || iWidth == 9){ |
3077 | | int32_t tmp1 = 2 * (iWidth - 8); |
3078 | | McHorVer22HorFirst_mmi(pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5); |
3079 | | |
3080 | | McHorVer22Width8VerLastAlign_mmi((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight); |
3081 | | |
3082 | | McHorVer22Width8VerLastUnAlign_mmi((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, |
3083 | | iDstStride, 8, iHeight); |
3084 | | } else { |
3085 | | int16_t iTmp[17 + 5]; |
3086 | | int32_t i, j, k; |
3087 | | |
3088 | | for (i = 0; i < iHeight; i++) { |
3089 | | for (j = 0; j < iWidth + 5; j++) { |
3090 | | iTmp[j] = FilterInput8bitWithStride_c (pSrc - 2 + j, iSrcStride); |
3091 | | } |
3092 | | for (k = 0; k < iWidth; k++) { |
3093 | | pDst[k] = WelsClip1 ((HorFilterInput16bit_c (&iTmp[k]) + 512) >> 10); |
3094 | | } |
3095 | | pSrc += iSrcStride; |
3096 | | pDst += iDstStride; |
3097 | | } |
3098 | | } |
3099 | | } |
3100 | | |
3101 | | void McCopyWidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, |
3102 | | uint8_t *pDst, int iDstStride, int iHeight) { |
3103 | | __asm__ volatile ( |
3104 | | ".set arch=loongson3a \n\t" |
3105 | | "1: \n\t" |
3106 | | "lwl $8, 0x3(%[pSrc]) \n\t" |
3107 | | "lwr $8, 0x0(%[pSrc]) \n\t" |
3108 | | "swl $8, 0x3(%[pDst]) \n\t" |
3109 | | "swr $8, 0x0(%[pDst]) \n\t" |
3110 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3111 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3112 | | PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t" |
3113 | | "bnez %[iHeight], 1b \n\t" |
3114 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3115 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3116 | | : "memory", "$8" |
3117 | | ); |
3118 | | } |
3119 | | |
3120 | | void McCopyWidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, |
3121 | | uint8_t *pDst, int iDstStride, int iHeight) { |
3122 | | __asm__ volatile ( |
3123 | | ".set arch=loongson3a \n\t" |
3124 | | "1: \n\t" |
3125 | | "ldl $8, 0x7(%[pSrc]) \n\t" |
3126 | | "ldr $8, 0x0(%[pSrc]) \n\t" |
3127 | | "sdl $8, 0x7(%[pDst]) \n\t" |
3128 | | "sdr $8, 0x0(%[pDst]) \n\t" |
3129 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3130 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3131 | | PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t" |
3132 | | "bnez %[iHeight], 1b \n\t" |
3133 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3134 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3135 | | : "memory", "$8" |
3136 | | ); |
3137 | | } |
3138 | | |
// Copies a block that is 16 bytes wide and iHeight rows tall from pSrc to pDst
// with inline assembly (assembled with arch=loongson3a).
//   pSrc / iSrcStride : source pointer and the byte step added after each row
//   pDst / iDstStride : destination pointer and the byte step added after each row
//   iHeight           : number of rows to copy
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3139 | | void McCopyWidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, |
3140 | | uint8_t *pDst, int iDstStride, int iHeight) { |
3141 | | __asm__ volatile ( |
3142 | | ".set arch=loongson3a \n\t" |
3143 | | "1: \n\t" |
// Each row is moved as two unaligned 64-bit doublewords: bytes 0..7 go
// through $8 and bytes 8..15 go through $9.
3144 | | "ldl $8, 0x7(%[pSrc]) \n\t" |
3145 | | "ldl $9, 0xF(%[pSrc]) \n\t" |
3146 | | "ldr $8, 0x0(%[pSrc]) \n\t" |
3147 | | "ldr $9, 0x8(%[pSrc]) \n\t" |
3148 | | "sdl $8, 0x7(%[pDst]) \n\t" |
3149 | | "sdl $9, 0xF(%[pDst]) \n\t" |
3150 | | "sdr $8, 0x0(%[pDst]) \n\t" |
3151 | | "sdr $9, 0x8(%[pDst]) \n\t" |
// Step to the next row and loop until the row counter reaches zero.
3152 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3153 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3154 | | PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t" |
3155 | | "bnez %[iHeight], 1b \n\t" |
3156 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3157 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3158 | | : "memory", "$8", "$9" |
3159 | | ); |
3160 | | } |
3161 | | |
3162 | | static inline void McCopy_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
3163 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
3164 | | if (iWidth == 16) |
3165 | | McCopyWidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3166 | | else if (iWidth == 8) |
3167 | | McCopyWidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3168 | | else if (iWidth == 4) |
3169 | | McCopyWidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3170 | | else |
3171 | | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3172 | | } |
3173 | | |
// Weighted 2x2 chroma interpolation for a 4-pixel-wide block, written as
// Loongson MMI inline assembly.
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : destination pointer and row step in bytes
//   pABCD             : four byte weights (called A, B, C, D below), read once
//   iHeight           : number of output rows
// Per output pixel the code evaluates
//   (A*s[x] + B*s[x+1] + C*s[x+stride] + D*s[x+stride+1] + 32) >> 6
// in 16-bit lanes and packs the result to unsigned bytes.
// After the weights are loaded, the pABCD register is reused as the pointer to
// the row below the current one.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3174 | | void McChromaWidthEq4_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, |
3175 | | int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) { |
3176 | | __asm__ volatile ( |
3177 | | ".set arch=loongson3a \n\t" |
// Load the weights and broadcast each one into four halfword lanes:
// $f6 = A, $f10 = B, $f8 = C, $f12 = D. $f14 is the zero register used
// for byte-to-halfword widening.
3178 | | "gsldlc1 $f6, 0x7(%[pABCD]) \n\t" |
3179 | | "gsldrc1 $f6, 0x0(%[pABCD]) \n\t" |
3180 | | "xor $f14, $f14, $f14 \n\t" |
3181 | | "punpcklbh $f6, $f6, $f6 \n\t" |
3182 | | "mov.d $f8, $f6 \n\t" |
3183 | | "punpcklhw $f6, $f6, $f6 \n\t" |
3184 | | "punpckhhw $f8, $f8, $f8 \n\t" |
3185 | | "mov.d $f10, $f6 \n\t" |
3186 | | "punpcklbh $f6, $f6, $f14 \n\t" |
3187 | | "punpckhbh $f10, $f10, $f14 \n\t" |
3188 | | |
3189 | | "mov.d $f12, $f8 \n\t" |
3190 | | "punpcklbh $f8, $f8, $f14 \n\t" |
3191 | | "punpckhbh $f12, $f12, $f14 \n\t" |
// pABCD now points at the second source row. The first row is preloaded at
// x ($f0) and x+1 ($f2); $f16 holds the shift count 6 and $f18 the rounding
// term 32 in every halfword lane.
3192 | | PTR_ADDU "%[pABCD], %[pSrc], %[iSrcStride] \n\t" |
3193 | | "dli $8, 0x6 \n\t" |
3194 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
3195 | | "gsldlc1 $f2, 0x8(%[pSrc]) \n\t" |
3196 | | "dmtc1 $8, $f16 \n\t" |
3197 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
3198 | | "gsldrc1 $f2, 0x1(%[pSrc]) \n\t" |
3199 | | "dli $8, 0x0020002000200020 \n\t" |
3200 | | "punpcklbh $f0, $f0, $f14 \n\t" |
3201 | | "punpcklbh $f2, $f2, $f14 \n\t" |
3202 | | |
3203 | | "dmtc1 $8, $f18 \n\t" |
3204 | | "1: \n\t" |
// Upper row contribution: A*s[x] + B*s[x+1].
3205 | | "pmullh $f0, $f0, $f6 \n\t" |
3206 | | "pmullh $f2, $f2, $f10 \n\t" |
3207 | | "paddh $f0, $f0, $f2 \n\t" |
3208 | | |
// Lower row at x: a widened copy is kept in $f4 for the next iteration.
3209 | | "gsldlc1 $f2, 0x7(%[pABCD]) \n\t" |
3210 | | "gsldrc1 $f2, 0x0(%[pABCD]) \n\t" |
3211 | | "punpcklbh $f2, $f2, $f14 \n\t" |
3212 | | "mov.d $f4, $f2 \n\t" |
3213 | | "pmullh $f2, $f2, $f8 \n\t" |
3214 | | "paddh $f0, $f0, $f2 \n\t" |
// Lower row at x+1: $f14 temporarily holds the widened copy, which is moved
// back into $f2 for the next iteration before $f14 is zeroed again.
3215 | | "gsldlc1 $f2, 0x8(%[pABCD]) \n\t" |
3216 | | "gsldrc1 $f2, 0x1(%[pABCD]) \n\t" |
3217 | | "punpcklbh $f2, $f2, $f14 \n\t" |
3218 | | "mov.d $f14, $f2 \n\t" |
3219 | | "pmullh $f2, $f2, $f12 \n\t" |
3220 | | "paddh $f0, $f0, $f2 \n\t" |
3221 | | "mov.d $f2, $f14 \n\t" |
// Round, shift, pack to bytes and store four output pixels.
3222 | | "paddh $f0, $f0, $f18 \n\t" |
3223 | | "psrlh $f0, $f0, $f16 \n\t" |
3224 | | "xor $f14, $f14, $f14 \n\t" |
3225 | | "packushb $f0, $f0, $f14 \n\t" |
3226 | | "gsswlc1 $f0, 0x3(%[pDst]) \n\t" |
3227 | | "gsswrc1 $f0, 0x0(%[pDst]) \n\t" |
// The lower row becomes the upper row of the next output line.
3228 | | "mov.d $f0, $f4 \n\t" |
3229 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3230 | | PTR_ADDU "%[pABCD], %[pABCD], %[iSrcStride] \n\t" |
3231 | | PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t" |
3232 | | "bnez %[iHeight], 1b \n\t" |
3233 | | : [pSrc]"+&r"((unsigned char *)pSrc), [pDst]"+&r"((unsigned char *)pDst), |
3234 | | [pABCD]"+&r"((unsigned char *)pABCD), [iHeight]"+&r"((int)iHeight) |
3235 | | : [iSrcStride]"r"((int)iSrcStride), [iDstStride]"r"((int)iDstStride) |
3236 | | : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3237 | | "$f14", "$f16", "$f18" |
3238 | | ); |
3239 | | } |
3240 | | |
// Weighted 2x2 chroma interpolation for an 8-pixel-wide block, written as
// Loongson MMI inline assembly.
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : destination pointer and row step in bytes
//   pABCD             : four byte weights (called A, B, C, D below), read once
//   iHeight           : number of output rows
// Per output pixel the code evaluates
//   (A*s[x] + B*s[x+1] + C*s[x+stride] + D*s[x+stride+1] + 32) >> 6
// in 16-bit lanes; each 8-pixel row occupies a low/high register pair.
// After the weights are loaded, the pABCD register is reused as the pointer to
// the row below the current one.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore floating-point registers around the asm -- TODO confirm.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3241 | | void McChromaWidthEq8_mmi(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, |
3242 | | int32_t iDstStride, const uint8_t *pABCD, int32_t iHeight) { |
3243 | | BACKUP_REG; |
3244 | | __asm__ volatile ( |
3245 | | ".set arch=loongson3a \n\t" |
// Load the weights and broadcast them to halfword lanes. After this setup
// ($f12,$f14) = A, ($f20,$f22) = B, ($f16,$f18) = C, ($f24,$f26) = D and
// $f28 is the zero register used for widening.
3246 | | "gsldlc1 $f12, 0x7(%[pABCD]) \n\t" |
3247 | | "xor $f28, $f28, $f28 \n\t" |
3248 | | "gsldrc1 $f12, 0x0(%[pABCD]) \n\t" |
3249 | | "punpcklbh $f12, $f12, $f12 \n\t" |
3250 | | "punpckhhw $f14, $f12, $f12 \n\t" |
3251 | | "punpcklhw $f12, $f12, $f12 \n\t" |
3252 | | |
3253 | | "mov.d $f16, $f14 \n\t" |
3254 | | "punpckhwd $f14, $f12, $f12 \n\t" |
3255 | | "punpcklwd $f12, $f12, $f12 \n\t" |
3256 | | "punpckhwd $f18, $f16, $f16 \n\t" |
3257 | | "punpcklwd $f16, $f16, $f16 \n\t" |
3258 | | "mov.d $f20, $f14 \n\t" |
3259 | | "mov.d $f24, $f18 \n\t" |
3260 | | |
3261 | | "punpckhbh $f14, $f12, $f28 \n\t" |
3262 | | "punpcklbh $f12, $f12, $f28 \n\t" |
3263 | | "punpckhbh $f22, $f20, $f28 \n\t" |
3264 | | "punpcklbh $f20, $f20, $f28 \n\t" |
3265 | | "punpckhbh $f18, $f16, $f28 \n\t" |
3266 | | "punpcklbh $f16, $f16, $f28 \n\t" |
3267 | | "punpckhbh $f26, $f24, $f28 \n\t" |
3268 | | "punpcklbh $f24, $f24, $f28 \n\t" |
3269 | | |
// pABCD now points at the second source row. The first row is preloaded and
// widened: ($f0,$f2) = s[x], ($f4,$f6) = s[x+1].
3270 | | PTR_ADDU "%[pABCD], %[pSrc], %[iSrcStride] \n\t" |
3271 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
3272 | | "gsldlc1 $f4, 0x8(%[pSrc]) \n\t" |
3273 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
3274 | | "gsldrc1 $f4, 0x1(%[pSrc]) \n\t" |
3275 | | "punpckhbh $f2, $f0, $f28 \n\t" |
3276 | | "punpcklbh $f0, $f0, $f28 \n\t" |
3277 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3278 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3279 | | "1: \n\t" |
// NOTE(review): the value written to $f30 here is overwritten by the
// mov.d into $f30 further down before anything reads it.
3280 | | "dli $8, 0x20 \n\t" |
3281 | | "dmtc1 $8, $f30 \n\t" |
3282 | | |
// Upper row contribution: A*s[x] + B*s[x+1].
3283 | | "pmullh $f0, $f0, $f12 \n\t" |
3284 | | "pmullh $f2, $f2, $f14 \n\t" |
3285 | | "pmullh $f4, $f4, $f20 \n\t" |
3286 | | "pmullh $f6, $f6, $f22 \n\t" |
3287 | | "paddh $f0, $f0, $f4 \n\t" |
3288 | | "paddh $f2, $f2, $f6 \n\t" |
3289 | | |
// Lower row at x: widened copies are kept in ($f8,$f10) for the next
// iteration, then weighted by C.
3290 | | "gsldlc1 $f4, 0x7(%[pABCD]) \n\t" |
3291 | | "gsldrc1 $f4, 0x0(%[pABCD]) \n\t" |
3292 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3293 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3294 | | "mov.d $f8, $f4 \n\t" |
3295 | | "mov.d $f10, $f6 \n\t" |
3296 | | "pmullh $f4, $f4, $f16 \n\t" |
3297 | | "pmullh $f6, $f6, $f18 \n\t" |
3298 | | "paddh $f0, $f0, $f4 \n\t" |
3299 | | "paddh $f2, $f2, $f6 \n\t" |
3300 | | |
// Lower row at x+1: ($f28,$f30) temporarily hold the widened copies, which
// are moved back into ($f4,$f6) for the next iteration after weighting by D.
3301 | | "gsldlc1 $f4, 0x8(%[pABCD]) \n\t" |
3302 | | "gsldrc1 $f4, 0x1(%[pABCD]) \n\t" |
3303 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3304 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3305 | | "mov.d $f28, $f4 \n\t" |
3306 | | "mov.d $f30, $f6 \n\t" |
3307 | | "pmullh $f4, $f4, $f24 \n\t" |
3308 | | "pmullh $f6, $f6, $f26 \n\t" |
3309 | | "paddh $f0, $f0, $f4 \n\t" |
3310 | | "paddh $f2, $f2, $f6 \n\t" |
3311 | | "mov.d $f4, $f28 \n\t" |
3312 | | "mov.d $f6, $f30 \n\t" |
3313 | | |
// $f20 (low half of weight B) is parked in $9 so that $f20 can carry the
// rounding term 32 and then the shift count 6; it is restored below.
3314 | | "dli $8, 0x0020002000200020 \n\t" |
3315 | | "dmfc1 $9, $f20 \n\t" |
3316 | | "dmtc1 $8, $f20 \n\t" |
3317 | | "dli $8, 0x6 \n\t" |
3318 | | "paddh $f0, $f0, $f20 \n\t" |
3319 | | "paddh $f2, $f2, $f20 \n\t" |
3320 | | "dmtc1 $8, $f20 \n\t" |
3321 | | "psrlh $f0, $f0, $f20 \n\t" |
3322 | | "psrlh $f2, $f2, $f20 \n\t" |
3323 | | |
// Re-zero $f28, pack both halves to bytes and store eight output pixels.
3324 | | "xor $f28, $f28, $f28 \n\t" |
3325 | | "packushb $f0, $f0, $f2 \n\t" |
3326 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3327 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
3328 | | |
// The lower row becomes the upper row of the next output line.
3329 | | "mov.d $f0, $f8 \n\t" |
3330 | | "mov.d $f2, $f10 \n\t" |
3331 | | "dmtc1 $9, $f20 \n\t" |
3332 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3333 | | PTR_ADDU "%[pABCD], %[pABCD], %[iSrcStride] \n\t" |
3334 | | |
3335 | | PTR_ADDIU "%[iHeight], %[iHeight], -1 \n\t" |
3336 | | "bnez %[iHeight], 1b \n\t" |
3337 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [pABCD]"+&r"(pABCD), |
3338 | | [iHeight]"+&r"(iHeight) |
3339 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3340 | | : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3341 | | "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
3342 | | ); |
3343 | | RECOVER_REG; |
3344 | | } |
3345 | | |
3346 | | void McChroma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
3347 | | int32_t iDstStride, int16_t iMvX, int16_t iMvY, |
3348 | | int32_t iWidth, int32_t iHeight) { |
3349 | | static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = { |
3350 | | McChromaWidthEq4_mmi, |
3351 | | McChromaWidthEq8_mmi |
3352 | | }; |
3353 | | const int32_t kiD8x = iMvX & 0x07; |
3354 | | const int32_t kiD8y = iMvY & 0x07; |
3355 | | if (kiD8x == 0 && kiD8y == 0) { |
3356 | | McCopy_mmi (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
3357 | | return; |
3358 | | } |
3359 | | if (iWidth != 2) { |
3360 | | kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, |
3361 | | g_kuiABCD[kiD8y][kiD8x], iHeight); |
3362 | | } else |
3363 | | McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, |
3364 | | iWidth, iHeight); |
3365 | | } |
3366 | | |
// Horizontal 6-tap interpolation for an 8-pixel-wide block, written as
// Loongson MMI inline assembly.
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : destination pointer and row step in bytes
//   iHeight           : number of rows
// pSrc is first moved two bytes to the left. For every row six unaligned
// 8-byte windows at byte offsets 0..5 (p0..p5) are widened to 16-bit lanes and
// combined as
//   ((p0 + p5) - 5*(p1 + p4) + 20*(p2 + p3) + 16) >> 5
// then packed to unsigned bytes and stored.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore floating-point registers around the asm -- TODO confirm.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3367 | | void McHorVer20WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst, |
3368 | | int iDstStride, int iHeight) { |
3369 | | BACKUP_REG; |
3370 | | __asm__ volatile ( |
3371 | | ".set arch=loongson3a \n\t" |
// Constants: $f28 = 0 (widening), $f24 = 16 in each halfword (rounding),
// $f26 = 2 (left shift count), $f30 = 5 (final right shift count).
3372 | | PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t" |
3373 | | "xor $f28, $f28, $f28 \n\t" |
3374 | | "dli $8, 0x0010001000100010 \n\t" |
3375 | | "dmtc1 $8, $f24 \n\t" |
3376 | | "dli $8, 0x2 \n\t" |
3377 | | "dmtc1 $8, $f26 \n\t" |
3378 | | "dli $8, 0x5 \n\t" |
3379 | | "dmtc1 $8, $f30 \n\t" |
3380 | | "1: \n\t" |
// Load the six windows: $f0 = p0, $f4 = p5, $f8 = p1, $f12 = p4,
// $f16 = p2, $f20 = p3.
3381 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
3382 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
3383 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
3384 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
3385 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
3386 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
3387 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
3388 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
3389 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
3390 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
3391 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
3392 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
// Widen bytes to halfwords; the odd-numbered partner register of each pair
// receives the upper four pixels.
3393 | | "punpckhbh $f2, $f0, $f28 \n\t" |
3394 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3395 | | "punpckhbh $f10, $f8, $f28 \n\t" |
3396 | | "punpckhbh $f14, $f12, $f28 \n\t" |
3397 | | "punpckhbh $f18, $f16, $f28 \n\t" |
3398 | | "punpckhbh $f22, $f20, $f28 \n\t" |
3399 | | "punpcklbh $f0, $f0, $f28 \n\t" |
3400 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3401 | | "punpcklbh $f8, $f8, $f28 \n\t" |
3402 | | "punpcklbh $f12, $f12, $f28 \n\t" |
3403 | | "punpcklbh $f16, $f16, $f28 \n\t" |
3404 | | "punpcklbh $f20, $f20, $f28 \n\t" |
// t = 4*(p2+p3) - (p1+p4); result = (p0+p5) + t + 4*t.
3405 | | "paddh $f8, $f8, $f12 \n\t" |
3406 | | "paddh $f10, $f10, $f14 \n\t" |
3407 | | "paddh $f16, $f16, $f20 \n\t" |
3408 | | "paddh $f18, $f18, $f22 \n\t" |
3409 | | "psllh $f16, $f16, $f26 \n\t" |
3410 | | "psllh $f18, $f18, $f26 \n\t" |
3411 | | "psubh $f16, $f16, $f8 \n\t" |
3412 | | "psubh $f18, $f18, $f10 \n\t" |
3413 | | "paddh $f0, $f0, $f4 \n\t" |
3414 | | "paddh $f2, $f2, $f6 \n\t" |
3415 | | "paddh $f0, $f0, $f16 \n\t" |
3416 | | "paddh $f2, $f2, $f18 \n\t" |
3417 | | "psllh $f16, $f16, $f26 \n\t" |
3418 | | "psllh $f18, $f18, $f26 \n\t" |
3419 | | "paddh $f0, $f0, $f16 \n\t" |
3420 | | "paddh $f2, $f2, $f18 \n\t" |
// Round, arithmetic shift right by 5, pack to bytes and store 8 pixels.
3421 | | "paddh $f0, $f0, $f24 \n\t" |
3422 | | "paddh $f2, $f2, $f24 \n\t" |
3423 | | "psrah $f0, $f0, $f30 \n\t" |
3424 | | "psrah $f2, $f2, $f30 \n\t" |
3425 | | "packushb $f0, $f0, $f2 \n\t" |
3426 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3427 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
3428 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3429 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3430 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3431 | | "bnez %[iHeight], 1b \n\t" |
3432 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3433 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3434 | | : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3435 | | "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
3436 | | ); |
3437 | | RECOVER_REG; |
3438 | | } |
3439 | | |
// Horizontal 6-tap interpolation for a 16-pixel-wide block, written as
// Loongson MMI inline assembly.
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : destination pointer and row step in bytes
//   iHeight           : number of rows
// pSrc is first moved two bytes to the left. Each row is processed as two
// 8-pixel halves; for each half six unaligned 8-byte windows at consecutive
// byte offsets (p0..p5) are widened to 16-bit lanes and combined as
//   ((p0 + p5) - 5*(p1 + p4) + 20*(p2 + p3) + 16) >> 5
// then packed to unsigned bytes and stored.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore floating-point registers around the asm -- TODO confirm.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3440 | | void McHorVer20WidthEq16_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst, |
3441 | | int iDstStride, int iHeight) { |
3442 | | BACKUP_REG; |
3443 | | __asm__ volatile ( |
3444 | | ".set arch=loongson3a \n\t" |
// Constants: $f24 = 16 in each halfword (rounding), $f26 = 2 (left shift
// count), $f30 = 5 (final right shift count).
3445 | | PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t" |
3446 | | "dli $8, 0x0010001000100010 \n\t" |
3447 | | "dmtc1 $8, $f24 \n\t" |
3448 | | "dli $8, 0x2 \n\t" |
3449 | | "dmtc1 $8, $f26 \n\t" |
3450 | | "dli $8, 0x5 \n\t" |
3451 | | "dmtc1 $8, $f30 \n\t" |
3452 | | "1: \n\t" |
// Left half (output pixels 0..7): windows at byte offsets 0..5.
// $f28 is zeroed on every row and used for widening.
3453 | | "xor $f28, $f28, $f28 \n\t" |
3454 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
3455 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
3456 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
3457 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
3458 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
3459 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
3460 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
3461 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
3462 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
3463 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
3464 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
3465 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
3466 | | "punpckhbh $f2, $f0, $f28 \n\t" |
3467 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3468 | | "punpckhbh $f10, $f8, $f28 \n\t" |
3469 | | "punpckhbh $f14, $f12, $f28 \n\t" |
3470 | | "punpckhbh $f18, $f16, $f28 \n\t" |
3471 | | "punpckhbh $f22, $f20, $f28 \n\t" |
3472 | | "punpcklbh $f0, $f0, $f28 \n\t" |
3473 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3474 | | "punpcklbh $f8, $f8, $f28 \n\t" |
3475 | | "punpcklbh $f12, $f12, $f28 \n\t" |
3476 | | "punpcklbh $f16, $f16, $f28 \n\t" |
3477 | | "punpcklbh $f20, $f20, $f28 \n\t" |
// t = 4*(p2+p3) - (p1+p4); result = (p0+p5) + t + 4*t.
3478 | | "paddh $f8, $f8, $f12 \n\t" |
3479 | | "paddh $f10, $f10, $f14 \n\t" |
3480 | | "paddh $f16, $f16, $f20 \n\t" |
3481 | | "paddh $f18, $f18, $f22 \n\t" |
3482 | | "psllh $f16, $f16, $f26 \n\t" |
3483 | | "psllh $f18, $f18, $f26 \n\t" |
3484 | | "psubh $f16, $f16, $f8 \n\t" |
3485 | | "psubh $f18, $f18, $f10 \n\t" |
3486 | | "paddh $f0, $f0, $f4 \n\t" |
3487 | | "paddh $f2, $f2, $f6 \n\t" |
3488 | | "paddh $f0, $f0, $f16 \n\t" |
3489 | | "paddh $f2, $f2, $f18 \n\t" |
3490 | | "psllh $f16, $f16, $f26 \n\t" |
3491 | | "psllh $f18, $f18, $f26 \n\t" |
3492 | | "paddh $f0, $f0, $f16 \n\t" |
3493 | | "paddh $f2, $f2, $f18 \n\t" |
3494 | | "paddh $f0, $f0, $f24 \n\t" |
3495 | | "paddh $f2, $f2, $f24 \n\t" |
3496 | | "psrah $f0, $f0, $f30 \n\t" |
3497 | | "psrah $f2, $f2, $f30 \n\t" |
3498 | | "packushb $f0, $f0, $f2 \n\t" |
3499 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3500 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
// Right half (output pixels 8..15): the same computation with every source
// and destination offset increased by 8.
3501 | | "gsldlc1 $f0, 0xF(%[pSrc]) \n\t" |
3502 | | "gsldlc1 $f4, 0x14(%[pSrc]) \n\t" |
3503 | | "gsldlc1 $f8, 0x10(%[pSrc]) \n\t" |
3504 | | "gsldlc1 $f12, 0x13(%[pSrc]) \n\t" |
3505 | | "gsldlc1 $f16, 0x11(%[pSrc]) \n\t" |
3506 | | "gsldlc1 $f20, 0x12(%[pSrc]) \n\t" |
3507 | | "gsldrc1 $f0, 0x8(%[pSrc]) \n\t" |
3508 | | "gsldrc1 $f4, 0xd(%[pSrc]) \n\t" |
3509 | | "gsldrc1 $f8, 0x9(%[pSrc]) \n\t" |
3510 | | "gsldrc1 $f12, 0xc(%[pSrc]) \n\t" |
3511 | | "gsldrc1 $f16, 0xa(%[pSrc]) \n\t" |
3512 | | "gsldrc1 $f20, 0xb(%[pSrc]) \n\t" |
3513 | | "punpckhbh $f2, $f0, $f28 \n\t" |
3514 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3515 | | "punpckhbh $f10, $f8, $f28 \n\t" |
3516 | | "punpckhbh $f14, $f12, $f28 \n\t" |
3517 | | "punpckhbh $f18, $f16, $f28 \n\t" |
3518 | | "punpckhbh $f22, $f20, $f28 \n\t" |
3519 | | "punpcklbh $f0, $f0, $f28 \n\t" |
3520 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3521 | | "punpcklbh $f8, $f8, $f28 \n\t" |
3522 | | "punpcklbh $f12, $f12, $f28 \n\t" |
3523 | | "punpcklbh $f16, $f16, $f28 \n\t" |
3524 | | "punpcklbh $f20, $f20, $f28 \n\t" |
3525 | | "paddh $f8, $f8, $f12 \n\t" |
3526 | | "paddh $f10, $f10, $f14 \n\t" |
3527 | | "paddh $f16, $f16, $f20 \n\t" |
3528 | | "paddh $f18, $f18, $f22 \n\t" |
3529 | | "psllh $f16, $f16, $f26 \n\t" |
3530 | | "psllh $f18, $f18, $f26 \n\t" |
3531 | | "psubh $f16, $f16, $f8 \n\t" |
3532 | | "psubh $f18, $f18, $f10 \n\t" |
3533 | | "paddh $f0, $f0, $f4 \n\t" |
3534 | | "paddh $f2, $f2, $f6 \n\t" |
3535 | | "paddh $f0, $f0, $f16 \n\t" |
3536 | | "paddh $f2, $f2, $f18 \n\t" |
3537 | | "psllh $f16, $f16, $f26 \n\t" |
3538 | | "psllh $f18, $f18, $f26 \n\t" |
3539 | | "paddh $f0, $f0, $f16 \n\t" |
3540 | | "paddh $f2, $f2, $f18 \n\t" |
3541 | | "paddh $f0, $f0, $f24 \n\t" |
3542 | | "paddh $f2, $f2, $f24 \n\t" |
3543 | | "psrah $f0, $f0, $f30 \n\t" |
3544 | | "psrah $f2, $f2, $f30 \n\t" |
3545 | | "packushb $f0, $f0, $f2 \n\t" |
3546 | | "gssdlc1 $f0, 0xF(%[pDst]) \n\t" |
3547 | | "gssdrc1 $f0, 0x8(%[pDst]) \n\t" |
// Step to the next row and loop until the row counter reaches zero.
3548 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3549 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3550 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3551 | | "bnez %[iHeight], 1b \n\t" |
3552 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3553 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3554 | | : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3555 | | "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
3556 | | ); |
3557 | | RECOVER_REG; |
3558 | | } |
3559 | | |
// Horizontal 6-tap interpolation for a 4-pixel-wide block, written as
// Loongson MMI inline assembly.
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : destination pointer and row step in bytes
//   iHeight           : number of rows
// pSrc is first moved two bytes to the left. For every row six unaligned
// windows at byte offsets 0..5 (p0..p5) are widened to 16-bit lanes (low four
// pixels only) and combined as
//   ((p0 + p5) - 5*(p1 + p4) + 20*(p2 + p3) + 16) >> 5
// then packed to unsigned bytes; four bytes are stored per row.
// The one-time setup used to be preceded by a second, unreferenced numeric
// label 1; it has been dropped so that the only label 1 is the loop head, as
// in the 8- and 16-wide variants. The backward branch already resolved to the
// loop head, so behaviour is unchanged.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3560 | | void McHorVer20WidthEq4_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst, |
3561 | | int iDstStride, int iHeight) { |
3562 | | __asm__ volatile ( |
3563 | | ".set arch=loongson3a \n\t" |
// One-time setup: $f14 = 0 (widening), $f12 = 16 in each halfword (rounding).
3565 | | PTR_ADDIU "%[pSrc], %[pSrc], -0x2 \n\t" |
3566 | | "xor $f14, $f14, $f14 \n\t" |
3567 | | "dli $8, 0x0010001000100010 \n\t" |
3568 | | "dmtc1 $8, $f12 \n\t" |
// Loop head: one output row per iteration.
3569 | | "1: \n\t" |
// Load the six windows: $f0 = p0, $f2 = p5, $f4 = p1, $f6 = p4,
// $f8 = p2, $f10 = p3.
3570 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
3571 | | "gsldlc1 $f2, 0xc(%[pSrc]) \n\t" |
3572 | | "gsldlc1 $f4, 0x8(%[pSrc]) \n\t" |
3573 | | "gsldlc1 $f6, 0xb(%[pSrc]) \n\t" |
3574 | | "gsldlc1 $f8, 0x9(%[pSrc]) \n\t" |
3575 | | "gsldlc1 $f10, 0xa(%[pSrc]) \n\t" |
3576 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
3577 | | "gsldrc1 $f2, 0x5(%[pSrc]) \n\t" |
3578 | | "gsldrc1 $f4, 0x1(%[pSrc]) \n\t" |
3579 | | "gsldrc1 $f6, 0x4(%[pSrc]) \n\t" |
3580 | | "gsldrc1 $f8, 0x2(%[pSrc]) \n\t" |
3581 | | "gsldrc1 $f10, 0x3(%[pSrc]) \n\t" |
// $f16 first carries the left shift count 2 and later the right shift count 5.
3582 | | "dli $8, 0x2 \n\t" |
3583 | | "punpcklbh $f0, $f0, $f14 \n\t" |
3584 | | "punpcklbh $f2, $f2, $f14 \n\t" |
3585 | | "punpcklbh $f4, $f4, $f14 \n\t" |
3586 | | "punpcklbh $f6, $f6, $f14 \n\t" |
3587 | | "punpcklbh $f8, $f8, $f14 \n\t" |
3588 | | "punpcklbh $f10, $f10, $f14 \n\t" |
3589 | | "dmtc1 $8, $f16 \n\t" |
// t = 4*(p2+p3) - (p1+p4); result = (p0+p5) + t + 4*t.
3590 | | "paddh $f4, $f4, $f6 \n\t" |
3591 | | "paddh $f8, $f8, $f10 \n\t" |
3592 | | "psllh $f8, $f8, $f16 \n\t" |
3593 | | "psubh $f8, $f8, $f4 \n\t" |
3594 | | "paddh $f0, $f0, $f2 \n\t" |
3595 | | "paddh $f0, $f0, $f8 \n\t" |
3596 | | "dli $8, 0x5 \n\t" |
3597 | | "psllh $f8, $f8, $f16 \n\t" |
3598 | | "paddh $f0, $f0, $f8 \n\t" |
// Round, arithmetic shift right by 5, pack to bytes and store 4 pixels.
3599 | | "paddh $f0, $f0, $f12 \n\t" |
3600 | | "dmtc1 $8, $f16 \n\t" |
3601 | | "psrah $f0, $f0, $f16 \n\t" |
3602 | | "packushb $f0, $f0, $f14 \n\t" |
3603 | | "gsswlc1 $f0, 0x3(%[pDst]) \n\t" |
3604 | | "gsswrc1 $f0, 0x0(%[pDst]) \n\t" |
3605 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3606 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3607 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3608 | | "bnez %[iHeight], 1b \n\t" |
3609 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3610 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3611 | | : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3612 | | "$f14", "$f16" |
3613 | | ); |
3614 | | } |
3615 | | |
3616 | | static inline void McHorVer20_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
3617 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
3618 | | if (iWidth == 16) |
3619 | | McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3620 | | else if (iWidth == 8) |
3621 | | McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3622 | | else |
3623 | | McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3624 | | } |
3625 | | |
// Vertical interpolation for an 8-pixel-wide block, written as Loongson MMI
// inline assembly built from the MMI_LOAD_8P and FILTER_HV_W8 macros (both
// defined outside this chunk).
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : destination pointer and row step in bytes
//   iHeight           : number of output rows
// pSrc is first moved two rows up and six consecutive rows are loaded into the
// register pairs ($f0,$f2) .. ($f20,$f22). The loop body is unrolled eight
// times: every step runs FILTER_HV_W8 on the current register window with
// pDst as its output operand, decrements iHeight, leaves through label 2 when
// it reaches zero, and otherwise loads one more row into the pair that has
// just become free, so the register roles rotate by one row per step.
// Presumably MMI_LOAD_8P(lo, hi, zero, ptr) loads eight pixels and widens them
// to 16-bit lanes, and FILTER_HV_W8 applies the 6-tap filter and stores one
// row -- TODO confirm against the macro definitions.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore floating-point registers around the asm -- TODO confirm.
// NOTE(review): the first filter step runs before iHeight is tested, so
// iHeight is assumed to be at least 1 -- confirm against the callers.
3626 | | void McHorVer02WidthEq8_mmi(const uint8_t *pSrc, int iSrcStride, uint8_t *pDst, |
3627 | | int iDstStride, int iHeight) { |
3628 | | BACKUP_REG; |
3629 | | __asm__ volatile ( |
3630 | | ".set arch=loongson3a \n\t" |
// Preload: start two rows above the block and fetch six rows. $8 is used
// as a scratch pointer for the odd rows.
3631 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3632 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3633 | | "xor $f28, $f28, $f28 \n\t" |
3634 | | MMI_LOAD_8P($f0, $f2, $f28, %[pSrc]) |
3635 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3636 | | MMI_LOAD_8P($f4, $f6, $f28, $8) |
3637 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3638 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3639 | | MMI_LOAD_8P($f8, $f10, $f28, %[pSrc]) |
3640 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3641 | | MMI_LOAD_8P($f12, $f14, $f28, $8) |
3642 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3643 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3644 | | MMI_LOAD_8P($f16, $f18, $f28, %[pSrc]) |
3645 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3646 | | MMI_LOAD_8P($f20, $f22, $f28, $8) |
3647 | | |
// Unrolled loop, step 1 of 8.
3648 | | "1: \n\t" |
3649 | | FILTER_HV_W8($f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, |
3650 | | $f22, $f24, $f26, $f28, $f30, %[pDst], $8, $9) |
3651 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3652 | | "beqz %[iHeight], 2f \n\t" |
// Step 2: pSrc advances by two rows, the new row is loaded from pSrc.
3653 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3654 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3655 | | MMI_LOAD_8P($f24, $f26, $f28, %[pSrc]) |
3656 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3657 | | FILTER_HV_W8($f4, $f6, $f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, |
3658 | | $f26, $f28, $f30, $f0, $f2, %[pDst], $8, $9) |
3659 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3660 | | "beqz %[iHeight], 2f \n\t" |
// Step 3: the new row is loaded from pSrc + iSrcStride via $8.
3661 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3662 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3663 | | MMI_LOAD_8P($f28, $f30, $f0, $8) |
3664 | | FILTER_HV_W8($f8, $f10, $f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, |
3665 | | $f30, $f0, $f2, $f4, $f6, %[pDst], $8, $9) |
3666 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3667 | | "beqz %[iHeight], 2f \n\t" |
// Step 4.
3668 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3669 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3670 | | MMI_LOAD_8P($f0, $f2, $f4, %[pSrc]) |
3671 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3672 | | FILTER_HV_W8($f12, $f14, $f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, |
3673 | | $f2, $f4, $f6, $f8, $f10, %[pDst], $8, $9) |
3674 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3675 | | "beqz %[iHeight], 2f \n\t" |
// Step 5.
3676 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3677 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3678 | | MMI_LOAD_8P($f4, $f6, $f8, $8) |
3679 | | FILTER_HV_W8($f16, $f18, $f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, |
3680 | | $f6, $f8, $f10, $f12, $f14, %[pDst], $8, $9) |
3681 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3682 | | "beqz %[iHeight], 2f \n\t" |
// Step 6.
3683 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3684 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3685 | | MMI_LOAD_8P($f8, $f10, $f12, %[pSrc]) |
3686 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3687 | | FILTER_HV_W8($f20, $f22, $f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, |
3688 | | $f10, $f12, $f14, $f16, $f18, %[pDst], $8, $9) |
3689 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3690 | | "beqz %[iHeight], 2f \n\t" |
// Step 7.
3691 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3692 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3693 | | MMI_LOAD_8P($f12, $f14, $f16, $8) |
3694 | | FILTER_HV_W8($f24, $f26, $f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, |
3695 | | $f14, $f16, $f18, $f20, $f22, %[pDst], $8, $9) |
3696 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3697 | | "beqz %[iHeight], 2f \n\t" |
// Step 8.
3698 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3699 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3700 | | MMI_LOAD_8P($f16, $f18, $f20, %[pSrc]) |
3701 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3702 | | FILTER_HV_W8($f28, $f30, $f0, $f2, $f4, $f6, $f8, $f10, $f12, $f14, $f16, |
3703 | | $f18, $f20, $f22, $f24, $f26, %[pDst], $8, $9) |
3704 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3705 | | "beqz %[iHeight], 2f \n\t" |
// Load the row needed by step 1 and restart the unrolled sequence.
3706 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3707 | | PTR_ADDU "$8, %[pSrc], %[iSrcStride] \n\t" |
3708 | | MMI_LOAD_8P($f20, $f22, $f24, $8) |
3709 | | "j 1b \n\t" |
3710 | | "2: \n\t" |
3711 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3712 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3713 | | : "memory", "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3714 | | "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
3715 | | ); |
3716 | | RECOVER_REG; |
3717 | | } |
3718 | | |
3719 | | static inline void McHorVer02WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride, |
3720 | | uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { |
3721 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3722 | | McHorVer02WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); |
3723 | | } |
3724 | | |
3725 | | static inline void McHorVer02_mmi(const uint8_t* pSrc, int32_t iSrcStride, |
3726 | | uint8_t* pDst, int32_t iDstStride, int32_t iWidth, |
3727 | | int32_t iHeight) { |
3728 | | if (iWidth == 16) |
3729 | | McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3730 | | else if (iWidth == 8) |
3731 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3732 | | else |
3733 | | McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); |
3734 | | } |
3735 | | |
// Horizontal first pass of the two-dimensional (centre position) filter for
// an 8-pixel-wide block, written as Loongson MMI inline assembly.
//   pSrc / iSrcStride : source pointer and row step in bytes
//   pDst / iDstStride : output pointer and row step in bytes; each row
//                       receives 16 bytes (eight 16-bit intermediate values)
//   iHeight           : number of rows to produce
// pSrc is first moved two rows up. For every row six unaligned 8-byte windows
// at byte offsets 0..5 (p0..p5) are widened to 16-bit lanes and combined as
//   (p0 + p5) - 5*(p1 + p4) + 20*(p2 + p3)
// The sums are stored as-is: no rounding, shift or packing happens here.
// BACKUP_REG / RECOVER_REG are defined outside this chunk; presumably they
// save and restore floating-point registers around the asm -- TODO confirm.
// NOTE(review): iSrcStride is declared int16_t here while the caller in this
// file passes an int32_t stride -- confirm that strides always fit in 16 bits.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3736 | | void McHorVer22Width8HorFirst_mmi(const uint8_t *pSrc, int16_t iSrcStride, |
3737 | | uint8_t *pDst, int32_t iDstStride, int32_t iHeight) { |
3738 | | BACKUP_REG; |
3739 | | __asm__ volatile ( |
3740 | | ".set arch=loongson3a \n\t" |
// Setup: $f28 = 0 (widening), pSrc -= 2 rows, $f30 = 2 (left shift count).
3741 | | "xor $f28, $f28, $f28 \n\t" |
3742 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3743 | | PTR_SUBU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3744 | | "dli $8, 0x2 \n\t" |
3745 | | "dmtc1 $8, $f30 \n\t" |
3746 | | "1: \n\t" |
// Load the six windows: $f0 = p0, $f4 = p5, $f8 = p1, $f12 = p4,
// $f16 = p2, $f20 = p3.
3747 | | "xor $f28, $f28, $f28 \n\t" |
3748 | | "gsldlc1 $f0, 0x7(%[pSrc]) \n\t" |
3749 | | "gsldlc1 $f4, 0xc(%[pSrc]) \n\t" |
3750 | | "gsldlc1 $f8, 0x8(%[pSrc]) \n\t" |
3751 | | "gsldlc1 $f12, 0xb(%[pSrc]) \n\t" |
3752 | | "gsldlc1 $f16, 0x9(%[pSrc]) \n\t" |
3753 | | "gsldlc1 $f20, 0xa(%[pSrc]) \n\t" |
3754 | | "gsldrc1 $f0, 0x0(%[pSrc]) \n\t" |
3755 | | "gsldrc1 $f4, 0x5(%[pSrc]) \n\t" |
3756 | | "gsldrc1 $f8, 0x1(%[pSrc]) \n\t" |
3757 | | "gsldrc1 $f12, 0x4(%[pSrc]) \n\t" |
3758 | | "gsldrc1 $f16, 0x2(%[pSrc]) \n\t" |
3759 | | "gsldrc1 $f20, 0x3(%[pSrc]) \n\t" |
3760 | | "punpckhbh $f2, $f0, $f28 \n\t" |
3761 | | "punpckhbh $f6, $f4, $f28 \n\t" |
3762 | | "punpckhbh $f10, $f8, $f28 \n\t" |
3763 | | "punpckhbh $f14, $f12, $f28 \n\t" |
3764 | | "punpckhbh $f18, $f16, $f28 \n\t" |
3765 | | "punpckhbh $f22, $f20, $f28 \n\t" |
3766 | | "punpcklbh $f0, $f0, $f28 \n\t" |
3767 | | "punpcklbh $f4, $f4, $f28 \n\t" |
3768 | | "punpcklbh $f8, $f8, $f28 \n\t" |
3769 | | "punpcklbh $f12, $f12, $f28 \n\t" |
3770 | | "punpcklbh $f16, $f16, $f28 \n\t" |
3771 | | "punpcklbh $f20, $f20, $f28 \n\t" |
// t = 4*(p2+p3) - (p1+p4); result = (p0+p5) + t + 4*t.
3772 | | "paddh $f8, $f8, $f12 \n\t" |
3773 | | "paddh $f10, $f10, $f14 \n\t" |
3774 | | "paddh $f16, $f16, $f20 \n\t" |
3775 | | "paddh $f18, $f18, $f22 \n\t" |
3776 | | "psllh $f16, $f16, $f30 \n\t" |
3777 | | "psllh $f18, $f18, $f30 \n\t" |
3778 | | "psubh $f16, $f16, $f8 \n\t" |
3779 | | "psubh $f18, $f18, $f10 \n\t" |
3780 | | "paddh $f0, $f0, $f4 \n\t" |
3781 | | "paddh $f2, $f2, $f6 \n\t" |
3782 | | "paddh $f0, $f0, $f16 \n\t" |
3783 | | "paddh $f2, $f2, $f18 \n\t" |
3784 | | "psllh $f16, $f16, $f30 \n\t" |
3785 | | "psllh $f18, $f18, $f30 \n\t" |
3786 | | "paddh $f0, $f0, $f16 \n\t" |
3787 | | "paddh $f2, $f2, $f18 \n\t" |
// Store the eight 16-bit sums: low four at bytes 0..7, high four at 8..15.
3788 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3789 | | "gssdlc1 $f2, 0xF(%[pDst]) \n\t" |
3790 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
3791 | | "gssdrc1 $f2, 0x8(%[pDst]) \n\t" |
3792 | | PTR_ADDU "%[pSrc], %[pSrc], %[iSrcStride] \n\t" |
3793 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3794 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3795 | | "bnez %[iHeight], 1b \n\t" |
3796 | | : [pSrc]"+&r"(pSrc), [pDst]"+&r"(pDst), [iHeight]"+&r"(iHeight) |
3797 | | : [iSrcStride]"r"(iSrcStride), [iDstStride]"r"(iDstStride) |
3798 | | : "memory", "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", |
3799 | | "$f14", "$f16", "$f18", "$f20", "$f22", "$f24", "$f26", "$f28", "$f30" |
3800 | | ); |
3801 | | RECOVER_REG; |
3802 | | } |
3803 | | |
3804 | | static inline void McHorVer22WidthEq8_mmi(const uint8_t* pSrc, int32_t iSrcStride, |
3805 | | uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { |
3806 | | ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16) |
3807 | | McHorVer22Width8HorFirst_mmi (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5); |
3808 | | McHorVer22Width8VerLastAlign_mmi ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight); |
3809 | | } |
3810 | | |
3811 | | static inline void McHorVer22WidthEq16_mmi(const uint8_t* pSrc, int32_t iSrcStride, |
3812 | | uint8_t* pDst, int32_t iDstStride, int32_t iHeight) { |
3813 | | McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3814 | | McHorVer22WidthEq8_mmi (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); |
3815 | | } |
3816 | | |
3817 | | static inline void McHorVer22_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
3818 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
3819 | | if (iWidth == 16) |
3820 | | McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3821 | | else if (iWidth == 8) |
3822 | | McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
3823 | | else |
3824 | | McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); |
3825 | | } |
3826 | | |
// Averages two 4-pixel-wide blocks byte by byte (pavgb) and writes the result
// to pDst, one row per loop iteration, using Loongson MMI inline assembly.
//   pDst  / iDstStride  : destination pointer and row step in bytes
//   pSrcA / iSrcAStride : first source pointer and row step in bytes
//   pSrcB / iSrcBStride : second source pointer and row step in bytes
//   iHeight             : number of rows
// NOTE(review): each source row is loaded as 8 bytes (offsets 0x0..0x7) while
// only 4 bytes are stored, so 4 bytes past the block are read from each
// source -- confirm the buffers allow this.
// NOTE(review): the clobber list names $8, $9 and $10 although the asm body
// does not reference them.
// NOTE(review): the row body executes before iHeight is tested, so iHeight is
// assumed to be at least 1 -- confirm against the callers.
3827 | | void PixelAvgWidthEq4_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA, |
3828 | | int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) { |
3829 | | __asm__ volatile ( |
3830 | | ".set arch=loongson3a \n\t" |
3831 | | "1: \n\t" |
// $f0 = row of B, $f2 = row of A; average and store the low 4 bytes.
3832 | | "gsldlc1 $f0, 0x7(%[pSrcB]) \n\t" |
3833 | | "gsldlc1 $f2, 0x7(%[pSrcA]) \n\t" |
3834 | | "gsldrc1 $f0, 0x0(%[pSrcB]) \n\t" |
3835 | | "gsldrc1 $f2, 0x0(%[pSrcA]) \n\t" |
3836 | | "pavgb $f0, $f0, $f2 \n\t" |
3837 | | "gsswlc1 $f0, 0x3(%[pDst]) \n\t" |
3838 | | "gsswrc1 $f0, 0x0(%[pDst]) \n\t" |
// Step all three pointers to the next row and loop until iHeight is zero.
3839 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x1 \n\t" |
3840 | | PTR_ADDU "%[pDst], %[pDst], %[iDstStride] \n\t" |
3841 | | PTR_ADDU "%[pSrcA], %[pSrcA], %[iSrcAStride] \n\t" |
3842 | | PTR_ADDU "%[pSrcB], %[pSrcB], %[iSrcBStride] \n\t" |
3843 | | "bnez %[iHeight], 1b \n\t" |
3844 | | : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA), |
3845 | | [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight) |
3846 | | : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride), |
3847 | | [iSrcBStride]"r"((int)iSrcBStride) |
3848 | | : "memory", "$8", "$9", "$10", "$f0", "$f2" |
3849 | | ); |
3850 | | } |
3851 | | |
// Averages two 8-pixel-wide blocks byte by byte (pavgb) and writes the result
// to pDst, using Loongson MMI inline assembly. Two rows are processed per loop
// iteration and iHeight is decremented by 2.
//   pDst  / iDstStride  : destination pointer and row step in bytes
//   pSrcA / iSrcAStride : first source pointer and row step in bytes
//   pSrcB / iSrcBStride : second source pointer and row step in bytes
//   iHeight             : number of rows
// NOTE(review): because the counter drops by 2 and the loop ends only on
// exactly zero, iHeight is assumed to be even and at least 2 -- confirm
// against the callers.
3852 | | void PixelAvgWidthEq8_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA, |
3853 | | int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) { |
3854 | | __asm__ volatile ( |
3855 | | ".set arch=loongson3a \n\t" |
3856 | | "1: \n\t" |
// First row of the pair: average A and B and store 8 bytes at pDst.
// $8 and $9 are set to the second row of A and B in between.
3857 | | "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t" |
3858 | | "gsldlc1 $f2, 0x7(%[pSrcB]) \n\t" |
3859 | | "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t" |
3860 | | "gsldrc1 $f2, 0x0(%[pSrcB]) \n\t" |
3861 | | "pavgb $f0, $f0, $f2 \n\t" |
3862 | | PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t" |
3863 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3864 | | PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t" |
3865 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
// Second row of the pair: read through $8 / $9, store through $10
// (pDst + iDstStride).
3866 | | "gsldlc1 $f0, 0x7($8) \n\t" |
3867 | | "gsldlc1 $f2, 0x7($9) \n\t" |
3868 | | "gsldrc1 $f0, 0x0($8) \n\t" |
3869 | | "gsldrc1 $f2, 0x0($9) \n\t" |
3870 | | "pavgb $f0, $f0, $f2 \n\t" |
3871 | | PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t" |
3872 | | "gssdlc1 $f0, 0x7($10) \n\t" |
// Advance all three pointers past the second row of the pair.
3873 | | PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t" |
3874 | | "gssdrc1 $f0, 0x0($10) \n\t" |
3875 | | PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t" |
3876 | | PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t" |
3877 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x2 \n\t" |
3878 | | "bnez %[iHeight], 1b \n\t" |
3879 | | : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA), |
3880 | | [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight) |
3881 | | : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride), |
3882 | | [iSrcBStride]"r"((int)iSrcBStride) |
3883 | | : "memory", "$8", "$9", "$10", "$f0", "$f2" |
3884 | | ); |
3885 | | } |
3886 | | |
// PixelAvgWidthEq16_mmi: byte-wise average (pavgb) of two 16-byte-wide blocks,
// written to pDst. Each row is handled as two 8-byte halves
// ($f0/$f4 for bytes 0..7, $f2/$f6 for bytes 8..15).
// The loop body is unrolled over four rows: rows 0 and 2 are addressed through
// pSrcA / pSrcB / pDst, rows 1 and 3 through the temporaries
// $8 = pSrcA + iSrcAStride, $9 = pSrcB + iSrcBStride, $10 = pDst + iDstStride.
// iHeight is reduced by 4 per pass and the loop ends only when it is exactly 0,
// so the caller must pass a positive multiple of 4.
// Every 8-byte access is issued as a left/right pair (gsldlc1+gsldrc1 for
// loads, gssdlc1+gssdrc1 for stores).
// NOTE(review): presumably this pairing is what allows unaligned addresses --
// confirm against the Loongson instruction documentation.
3887 | | void PixelAvgWidthEq16_mmi(uint8_t *pDst, int iDstStride, const uint8_t *pSrcA, |
3888 | | int iSrcAStride, const uint8_t *pSrcB, int iSrcBStride, int iHeight ) { |
3889 | | __asm__ volatile ( |
3890 | | ".set arch=loongson3a \n\t" |
3891 | | "1: \n\t" |
3892 | | "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t" |
3893 | | "gsldlc1 $f2, 0xF(%[pSrcA]) \n\t" |
3894 | | "gsldlc1 $f4, 0x7(%[pSrcB]) \n\t" |
3895 | | "gsldlc1 $f6, 0xF(%[pSrcB]) \n\t" |
3896 | | "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t" |
3897 | | "gsldrc1 $f2, 0x8(%[pSrcA]) \n\t" |
3898 | | "gsldrc1 $f4, 0x0(%[pSrcB]) \n\t" |
3899 | | "gsldrc1 $f6, 0x8(%[pSrcB]) \n\t" |
3900 | | "pavgb $f0, $f0, $f4 \n\t" |
3901 | | "pavgb $f2, $f2, $f6 \n\t" |
3902 | | PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t" |
3903 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3904 | | "gssdlc1 $f2, 0xF(%[pDst]) \n\t" |
3905 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
3906 | | "gssdrc1 $f2, 0x8(%[pDst]) \n\t" |
3907 | | PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t" |
3908 | | "gsldlc1 $f0, 0x7($8) \n\t" |
3909 | | "gsldlc1 $f2, 0xF($8) \n\t" |
3910 | | "gsldrc1 $f0, 0x0($8) \n\t" |
3911 | | "gsldrc1 $f2, 0x8($8) \n\t" |
3912 | | PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t" |
3913 | | "gsldlc1 $f4, 0x7($9) \n\t" |
3914 | | "gsldlc1 $f6, 0xF($9) \n\t" |
3915 | | "gsldrc1 $f4, 0x0($9) \n\t" |
3916 | | "gsldrc1 $f6, 0x8($9) \n\t" |
3917 | | "pavgb $f0, $f0, $f4 \n\t" |
3918 | | "pavgb $f2, $f2, $f6 \n\t" |
3919 | | "gssdlc1 $f0, 0x7($10) \n\t" |
3920 | | "gssdlc1 $f2, 0xF($10) \n\t" |
3921 | | "gssdrc1 $f0, 0x0($10) \n\t" |
3922 | | "gssdrc1 $f2, 0x8($10) \n\t" |
3923 | | |
3924 | | PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t" |
3925 | | PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t" |
3926 | | PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t" |
3927 | | "gsldlc1 $f0, 0x7(%[pSrcA]) \n\t" |
3928 | | "gsldlc1 $f2, 0xF(%[pSrcA]) \n\t" |
3929 | | "gsldlc1 $f4, 0x7(%[pSrcB]) \n\t" |
3930 | | "gsldlc1 $f6, 0xF(%[pSrcB]) \n\t" |
3931 | | "gsldrc1 $f0, 0x0(%[pSrcA]) \n\t" |
3932 | | "gsldrc1 $f2, 0x8(%[pSrcA]) \n\t" |
3933 | | "gsldrc1 $f4, 0x0(%[pSrcB]) \n\t" |
3934 | | "gsldrc1 $f6, 0x8(%[pSrcB]) \n\t" |
3935 | | "pavgb $f0, $f0, $f4 \n\t" |
3936 | | "pavgb $f2, $f2, $f6 \n\t" |
3937 | | PTR_ADDU "$8, %[pSrcA], %[iSrcAStride] \n\t" |
3938 | | PTR_ADDU "$9, %[pSrcB], %[iSrcBStride] \n\t" |
3939 | | "gssdlc1 $f0, 0x7(%[pDst]) \n\t" |
3940 | | "gssdlc1 $f2, 0xF(%[pDst]) \n\t" |
3941 | | "gssdrc1 $f0, 0x0(%[pDst]) \n\t" |
3942 | | "gssdrc1 $f2, 0x8(%[pDst]) \n\t" |
3943 | | "gsldlc1 $f0, 0x7($8) \n\t" |
3944 | | "gsldlc1 $f2, 0xF($8) \n\t" |
3945 | | "gsldlc1 $f4, 0x7($9) \n\t" |
3946 | | "gsldlc1 $f6, 0xF($9) \n\t" |
3947 | | "gsldrc1 $f0, 0x0($8) \n\t" |
3948 | | "gsldrc1 $f2, 0x8($8) \n\t" |
3949 | | "gsldrc1 $f4, 0x0($9) \n\t" |
3950 | | "gsldrc1 $f6, 0x8($9) \n\t" |
3951 | | PTR_ADDU "$10, %[pDst], %[iDstStride] \n\t" |
3952 | | "pavgb $f0, $f0, $f4 \n\t" |
3953 | | "pavgb $f2, $f2, $f6 \n\t" |
3954 | | "gssdlc1 $f0, 0x7($10) \n\t" |
3955 | | "gssdlc1 $f2, 0xF($10) \n\t" |
3956 | | "gssdrc1 $f0, 0x0($10) \n\t" |
3957 | | "gssdrc1 $f2, 0x8($10) \n\t" |
3958 | | PTR_ADDU "%[pSrcA], $8, %[iSrcAStride] \n\t" |
3959 | | PTR_ADDU "%[pSrcB], $9, %[iSrcBStride] \n\t" |
3960 | | PTR_ADDU "%[pDst], $10, %[iDstStride] \n\t" |
3961 | | PTR_ADDIU "%[iHeight], %[iHeight], -0x4 \n\t" |
3962 | | "bnez %[iHeight], 1b \n\t" |
3963 | | : [pDst]"+&r"((unsigned char *)pDst), [pSrcA]"+&r"((unsigned char *)pSrcA), |
3964 | | [pSrcB]"+&r"((unsigned char *)pSrcB), [iHeight]"+&r"((int)iHeight) |
3965 | | : [iDstStride]"r"((int)iDstStride), [iSrcAStride]"r"((int)iSrcAStride), |
3966 | | [iSrcBStride]"r"((int)iSrcBStride) |
3967 | | : "memory", "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6" |
3968 | | ); |
3969 | | } |
3970 | | |
3971 | | static inline void McHorVer01_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
3972 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
3973 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
3974 | | if (iWidth == 16) { |
3975 | | McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
3976 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
3977 | | } else if (iWidth == 8) { |
3978 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
3979 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
3980 | | } else { |
3981 | | McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); |
3982 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
3983 | | } |
3984 | | } |
3985 | | |
3986 | | static inline void McHorVer03_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
3987 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
3988 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
3989 | | if (iWidth == 16) { |
3990 | | McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
3991 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
3992 | | } else if (iWidth == 8) { |
3993 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
3994 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
3995 | | } else { |
3996 | | McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); |
3997 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
3998 | | } |
3999 | | } |
4000 | | |
4001 | | static inline void McHorVer10_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4002 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4003 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
4004 | | if (iWidth == 16) { |
4005 | | McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
4006 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4007 | | } else if (iWidth == 8) { |
4008 | | McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
4009 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4010 | | } else { |
4011 | | McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pTmp, 16, iHeight); |
4012 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4013 | | } |
4014 | | } |
4015 | | |
4016 | | static inline void McHorVer11_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4017 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4018 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4019 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4020 | | if (iWidth == 16) { |
4021 | | McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4022 | | McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4023 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4024 | | } else if (iWidth == 8) { |
4025 | | McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4026 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4027 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4028 | | } else { |
4029 | | McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4030 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
4031 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4032 | | } |
4033 | | } |
4034 | | |
4035 | | static inline void McHorVer12_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4036 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4037 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4038 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4039 | | if (iWidth == 16) { |
4040 | | McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4041 | | McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4042 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4043 | | } else if (iWidth == 8) { |
4044 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4045 | | McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4046 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4047 | | } else { |
4048 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
4049 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4050 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4051 | | } |
4052 | | } |
4053 | | static inline void McHorVer13_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4054 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4055 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4056 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4057 | | if (iWidth == 16) { |
4058 | | McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4059 | | McHorVer02WidthEq16_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4060 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4061 | | } else if (iWidth == 8) { |
4062 | | McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4063 | | McHorVer02WidthEq8_mmi (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4064 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4065 | | } else { |
4066 | | McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4067 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4 , iHeight); |
4068 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4069 | | } |
4070 | | } |
4071 | | static inline void McHorVer21_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4072 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4073 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4074 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4075 | | if (iWidth == 16) { |
4076 | | McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4077 | | McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4078 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4079 | | } else if (iWidth == 8) { |
4080 | | McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4081 | | McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4082 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4083 | | } else { |
4084 | | McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4085 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4086 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4087 | | } |
4088 | | } |
4089 | | |
4090 | | static inline void McHorVer23_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4091 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4092 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4093 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4094 | | if (iWidth == 16) { |
4095 | | McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4096 | | McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4097 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4098 | | } else if (iWidth == 8) { |
4099 | | McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4100 | | McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4101 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4102 | | } else { |
4103 | | McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4104 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4105 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4106 | | } |
4107 | | } |
4108 | | static inline void McHorVer30_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4109 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4110 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4111 | | if (iWidth == 16) { |
4112 | | McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4113 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); |
4114 | | } else if (iWidth == 8) { |
4115 | | McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4116 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); |
4117 | | } else { |
4118 | | McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4119 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); |
4120 | | } |
4121 | | } |
4122 | | static inline void McHorVer31_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4123 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4124 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4125 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4126 | | if (iWidth == 16) { |
4127 | | McHorVer20WidthEq16_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4128 | | McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4129 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4130 | | } else if (iWidth == 8) { |
4131 | | McHorVer20WidthEq8_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4132 | | McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4133 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4134 | | } else { |
4135 | | McHorVer20WidthEq4_mmi (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4136 | | McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); |
4137 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4138 | | } |
4139 | | } |
4140 | | static inline void McHorVer32_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4141 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4142 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4143 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4144 | | if (iWidth == 16) { |
4145 | | McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4146 | | McHorVer22WidthEq16_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4147 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4148 | | } else if (iWidth == 8) { |
4149 | | McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4150 | | McHorVer22WidthEq8_mmi (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4151 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4152 | | } else { |
4153 | | McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); |
4154 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4155 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4156 | | } |
4157 | | } |
4158 | | static inline void McHorVer33_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4159 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4160 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4161 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4162 | | if (iWidth == 16) { |
4163 | | McHorVer20WidthEq16_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4164 | | McHorVer02WidthEq16_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4165 | | PixelAvgWidthEq16_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4166 | | } else if (iWidth == 8) { |
4167 | | McHorVer20WidthEq8_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4168 | | McHorVer02WidthEq8_mmi (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4169 | | PixelAvgWidthEq8_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4170 | | } else { |
4171 | | McHorVer20WidthEq4_mmi (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4172 | | McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); |
4173 | | PixelAvgWidthEq4_mmi (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4174 | | } |
4175 | | } |
4176 | | |
4177 | | void McLuma_mmi(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, |
4178 | | int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) { |
4179 | | static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] |
4180 | | {McCopy_mmi, McHorVer01_mmi, McHorVer02_mmi, McHorVer03_mmi}, |
4181 | | {McHorVer10_mmi, McHorVer11_mmi, McHorVer12_mmi, McHorVer13_mmi}, |
4182 | | {McHorVer20_mmi, McHorVer21_mmi, McHorVer22_mmi, McHorVer23_mmi}, |
4183 | | {McHorVer30_mmi, McHorVer31_mmi, McHorVer32_mmi, McHorVer33_mmi}, |
4184 | | }; |
4185 | | |
4186 | | pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
4187 | | } |
4188 | | |
4189 | | void PixelAvg_mmi(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride, |
4190 | | const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight) { |
4191 | | static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = { |
4192 | | PixelAvgWidthEq8_mmi, |
4193 | | PixelAvgWidthEq16_mmi |
4194 | | }; |
4195 | | kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
4196 | | } |
4197 | | #endif//HAVE_MMI |
4198 | | |
4199 | | #if defined(HAVE_LSX) |
4200 | | static inline void McCopy_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4201 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4202 | | if (iWidth == 16) |
4203 | | McCopyWidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4204 | | else if (iWidth == 8) |
4205 | | McCopyWidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4206 | | else if (iWidth == 4) |
4207 | | McCopyWidthEq4_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4208 | | else |
4209 | | McCopyWidthEq2_c (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4210 | | } |
4211 | | |
4212 | | void McChroma_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4213 | | int32_t iDstStride, int16_t iMvX, int16_t iMvY, |
4214 | | int32_t iWidth, int32_t iHeight) { |
4215 | | static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = { |
4216 | | McChromaWidthEq4_lsx, |
4217 | | McChromaWidthEq8_lsx |
4218 | | }; |
4219 | | const int32_t kiD8x = iMvX & 0x07; |
4220 | | const int32_t kiD8y = iMvY & 0x07; |
4221 | | if (kiD8x == 0 && kiD8y == 0) { |
4222 | | McCopy_lsx (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
4223 | | return; |
4224 | | } |
4225 | | if (iWidth != 2) { |
4226 | | kpMcChromaWidthFuncs[iWidth >> 3] (pSrc, iSrcStride, pDst, iDstStride, |
4227 | | g_kuiABCD[kiD8y][kiD8x], iHeight); |
4228 | | } else |
4229 | | McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, |
4230 | | iWidth, iHeight); |
4231 | | } |
4232 | | |
4233 | | void PixelAvg_lsx(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, |
4234 | | int32_t iSrcAStride, const uint8_t* pSrcB, int32_t iSrcBStride, |
4235 | | int32_t iWidth, int32_t iHeight) { |
4236 | | static const PWelsSampleWidthAveragingFunc kpfFuncs[2] = { |
4237 | | PixelAvgWidthEq8_lsx, |
4238 | | PixelAvgWidthEq16_lsx |
4239 | | }; |
4240 | | kpfFuncs[iWidth >> 4] (pDst, iDstStride, pSrcA, iSrcAStride, pSrcB, iSrcBStride, iHeight); |
4241 | | } |
4242 | | |
4243 | | static inline void McHorVer01_lsx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4244 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4245 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
4246 | | if (iWidth == 16) { |
4247 | | McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4248 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4249 | | } else if (iWidth == 8) { |
4250 | | McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4251 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4252 | | } else { |
4253 | | McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); |
4254 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4255 | | } |
4256 | | } |
4257 | | |
4258 | | static inline void McHorVer02_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4259 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4260 | | if (iWidth == 16) |
4261 | | McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4262 | | else if (iWidth == 8) |
4263 | | McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4264 | | else |
4265 | | McHorVer02_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); |
4266 | | } |
4267 | | |
4268 | | static inline void McHorVer03_lsx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4269 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4270 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
4271 | | if (iWidth == 16) { |
4272 | | McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4273 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
4274 | | } else if (iWidth == 8) { |
4275 | | McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4276 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
4277 | | } else { |
4278 | | McHorVer02_c (pSrc, iSrcStride, pTmp, 16, 4, iHeight); |
4279 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc + iSrcStride, iSrcStride, pTmp, 16, iHeight); |
4280 | | } |
4281 | | } |
4282 | | |
4283 | | static inline void McHorVer10_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4284 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4285 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pTmp, 256, 16); |
4286 | | if (iWidth == 16) { |
4287 | | McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4288 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4289 | | } else if (iWidth == 8) { |
4290 | | McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4291 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4292 | | } else { |
4293 | | McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pTmp, 16, iHeight); |
4294 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight); |
4295 | | } |
4296 | | } |
4297 | | |
4298 | | static inline void McHorVer11_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4299 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4300 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4301 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4302 | | if (iWidth == 16) { |
4303 | | McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4304 | | McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4305 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4306 | | } else if (iWidth == 8) { |
4307 | | McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4308 | | McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4309 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4310 | | } else { |
4311 | | McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4312 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
4313 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4314 | | } |
4315 | | } |
4316 | | |
4317 | | static inline void McHorVer22WidthEq16_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4318 | | int32_t iDstStride, int32_t iHeight) { |
4319 | | McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4320 | | McHorVer22WidthEq8_lsx (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight); |
4321 | | } |
4322 | | |
4323 | | static inline void McHorVer12_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4324 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4325 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4326 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4327 | | if (iWidth == 16) { |
4328 | | McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4329 | | McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4330 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4331 | | } else if (iWidth == 8) { |
4332 | | McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4333 | | McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4334 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4335 | | } else { |
4336 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
4337 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4338 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4339 | | } |
4340 | | } |
4341 | | |
4342 | | static inline void McHorVer13_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4343 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4344 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4345 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4346 | | if (iWidth == 16) { |
4347 | | McHorVer20WidthEq16_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4348 | | McHorVer02WidthEq16_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4349 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4350 | | } else if (iWidth == 8) { |
4351 | | McHorVer20WidthEq8_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4352 | | McHorVer02WidthEq8_lsx (pSrc, iSrcStride, pVerTmp, 16, iHeight); |
4353 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4354 | | } else { |
4355 | | McHorVer20WidthEq4_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4356 | | McHorVer02_c (pSrc, iSrcStride, pVerTmp, 16, 4, iHeight); |
4357 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4358 | | } |
4359 | | } |
4360 | | |
4361 | | static inline void McHorVer20_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4362 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4363 | | if (iWidth == 16) |
4364 | | McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4365 | | else if (iWidth == 8) |
4366 | | McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4367 | | else |
4368 | | McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4369 | | } |
4370 | | |
4371 | | static inline void McHorVer21_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4372 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4373 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4374 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4375 | | if (iWidth == 16) { |
4376 | | McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4377 | | McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4378 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4379 | | } else if (iWidth == 8) { |
4380 | | McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4381 | | McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4382 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4383 | | } else { |
4384 | | McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4385 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4386 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4387 | | } |
4388 | | } |
4389 | | |
4390 | | static inline void McHorVer22_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4391 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4392 | | if (iWidth == 16) |
4393 | | McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4394 | | else if (iWidth == 8) |
4395 | | McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4396 | | else |
4397 | | McHorVer22_c (pSrc, iSrcStride, pDst, iDstStride, 4, iHeight); |
4398 | | } |
4399 | | |
4400 | | static inline void McHorVer23_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4401 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4402 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4403 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4404 | | if (iWidth == 16) { |
4405 | | McHorVer20WidthEq16_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4406 | | McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4407 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4408 | | } else if (iWidth == 8) { |
4409 | | McHorVer20WidthEq8_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4410 | | McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4411 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4412 | | } else { |
4413 | | McHorVer20WidthEq4_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4414 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4415 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight); |
4416 | | } |
4417 | | } |
4418 | | |
4419 | | static inline void McHorVer30_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4420 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4421 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4422 | | if (iWidth == 16) { |
4423 | | McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4424 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); |
4425 | | } else if (iWidth == 8) { |
4426 | | McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4427 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); |
4428 | | } else { |
4429 | | McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4430 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pSrc + 1, iSrcStride, pHorTmp, 16, iHeight); |
4431 | | } |
4432 | | } |
4433 | | |
4434 | | static inline void McHorVer31_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4435 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4436 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4437 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4438 | | if (iWidth == 16) { |
4439 | | McHorVer20WidthEq16_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4440 | | McHorVer02WidthEq16_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4441 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4442 | | } else if (iWidth == 8) { |
4443 | | McHorVer20WidthEq8_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4444 | | McHorVer02WidthEq8_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4445 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4446 | | } else { |
4447 | | McHorVer20WidthEq4_lsx (pSrc, iSrcStride, pHorTmp, 16, iHeight); |
4448 | | McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); |
4449 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4450 | | } |
4451 | | } |
4452 | | |
4453 | | static inline void McHorVer32_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4454 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4455 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4456 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pCtrTmp, 256, 16); |
4457 | | if (iWidth == 16) { |
4458 | | McHorVer02WidthEq16_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4459 | | McHorVer22WidthEq16_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4460 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4461 | | } else if (iWidth == 8) { |
4462 | | McHorVer02WidthEq8_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4463 | | McHorVer22WidthEq8_lsx (pSrc, iSrcStride, pCtrTmp, 16, iHeight); |
4464 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4465 | | } else { |
4466 | | McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); |
4467 | | McHorVer22_c (pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight); |
4468 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight); |
4469 | | } |
4470 | | } |
4471 | | |
4472 | | static inline void McHorVer33_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4473 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4474 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pHorTmp, 256, 16); |
4475 | | ENFORCE_STACK_ALIGN_1D (uint8_t, pVerTmp, 256, 16); |
4476 | | if (iWidth == 16) { |
4477 | | McHorVer20WidthEq16_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4478 | | McHorVer02WidthEq16_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4479 | | PixelAvgWidthEq16_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4480 | | } else if (iWidth == 8) { |
4481 | | McHorVer20WidthEq8_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4482 | | McHorVer02WidthEq8_lsx (pSrc + 1, iSrcStride, pVerTmp, 16, iHeight); |
4483 | | PixelAvgWidthEq8_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4484 | | } else { |
4485 | | McHorVer20WidthEq4_lsx (pSrc + iSrcStride, iSrcStride, pHorTmp, 16, iHeight); |
4486 | | McHorVer02_c (pSrc + 1, iSrcStride, pVerTmp, 16, 4, iHeight); |
4487 | | PixelAvgWidthEq4_lsx (pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight); |
4488 | | } |
4489 | | } |
4490 | | |
4491 | | void McLuma_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4492 | | int32_t iDstStride, int16_t iMvX, int16_t iMvY, |
4493 | | int32_t iWidth, int32_t iHeight) { |
4494 | | static const PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = { //[x][y] |
4495 | | {McCopy_lsx, McHorVer01_lsx, McHorVer02_lsx, McHorVer03_lsx}, |
4496 | | {McHorVer10_lsx, McHorVer11_lsx, McHorVer12_lsx, McHorVer13_lsx}, |
4497 | | {McHorVer20_lsx, McHorVer21_lsx, McHorVer22_lsx, McHorVer23_lsx}, |
4498 | | {McHorVer30_lsx, McHorVer31_lsx, McHorVer32_lsx, McHorVer33_lsx}, |
4499 | | }; |
4500 | | pWelsMcFunc[iMvX & 0x03][iMvY & 0x03] (pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight); |
4501 | | } |
4502 | | |
4503 | | static inline void McHorVer20Width5Or9Or17_lsx(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4504 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4505 | | if (iWidth == 17) { |
4506 | | McHorVer20WidthEq17_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4507 | | } else if (iWidth == 9) { |
4508 | | McHorVer20WidthEq9_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4509 | | } else {//if (iWidth == 5) |
4510 | | McHorVer20WidthEq5_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4511 | | } |
4512 | | } |
4513 | | |
4514 | | void McHorVer22Width5Or9Or17_lsx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, |
4515 | | int32_t iDstStride, int32_t iWidth, int32_t iHeight) { |
4516 | | if (iWidth == 17) { |
4517 | | McHorVer22WidthEq17_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4518 | | } else if (iWidth == 9) { |
4519 | | McHorVer22WidthEq9_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4520 | | } else { |
4521 | | McHorVer22WidthEq5_lsx(pSrc, iSrcStride, pDst, iDstStride, iHeight); |
4522 | | } |
4523 | | } |
4524 | | #endif//HAVE_LSX |
4525 | | |
4526 | | } // anon ns. |
4527 | | |
4528 | 0 | void WelsCommon::InitMcFunc (SMcFunc* pMcFuncs, uint32_t uiCpuFlag) { |
4529 | 0 | pMcFuncs->pfLumaHalfpelHor = McHorVer20_c; |
4530 | 0 | pMcFuncs->pfLumaHalfpelVer = McHorVer02_c; |
4531 | 0 | pMcFuncs->pfLumaHalfpelCen = McHorVer22_c; |
4532 | 0 | pMcFuncs->pfSampleAveraging = PixelAvg_c; |
4533 | 0 | pMcFuncs->pMcChromaFunc = McChroma_c; |
4534 | 0 | pMcFuncs->pMcLumaFunc = McLuma_c; |
4535 | |
|
4536 | | #if defined (X86_ASM) |
4537 | | if (uiCpuFlag & WELS_CPU_SSE2) { |
4538 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_sse2; |
4539 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_sse2; |
4540 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_sse2; |
4541 | | pMcFuncs->pfSampleAveraging = PixelAvg_sse2; |
4542 | | pMcFuncs->pMcChromaFunc = McChroma_sse2; |
4543 | | pMcFuncs->pMcLumaFunc = McLuma_sse2; |
4544 | | } |
4545 | | |
4546 | | if (uiCpuFlag & WELS_CPU_SSSE3) { |
4547 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_ssse3; |
4548 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02_ssse3; |
4549 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17_ssse3; |
4550 | | pMcFuncs->pMcChromaFunc = McChroma_ssse3; |
4551 | | pMcFuncs->pMcLumaFunc = McLuma_ssse3; |
4552 | | } |
4553 | | #ifdef HAVE_AVX2 |
4554 | | if (uiCpuFlag & WELS_CPU_AVX2) { |
4555 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_avx2; |
4556 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02_avx2; |
4557 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17_avx2; |
4558 | | pMcFuncs->pMcLumaFunc = McLuma_avx2; |
4559 | | } |
4560 | | #endif |
4561 | | #endif //(X86_ASM) |
4562 | |
|
4563 | | #if defined(HAVE_NEON) |
4564 | | if (uiCpuFlag & WELS_CPU_NEON) { |
4565 | | pMcFuncs->pMcLumaFunc = McLuma_neon; |
4566 | | pMcFuncs->pMcChromaFunc = McChroma_neon; |
4567 | | pMcFuncs->pfSampleAveraging = PixelAvg_neon; |
4568 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_neon;//iWidth+1:4/8/16 |
4569 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_neon;//heigh+1:4/8/16 |
4570 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_neon;//iWidth+1/heigh+1 |
4571 | | } |
4572 | | #endif |
4573 | | #if defined(HAVE_NEON_AARCH64) |
4574 | | if (uiCpuFlag & WELS_CPU_NEON) { |
4575 | | pMcFuncs->pMcLumaFunc = McLuma_AArch64_neon; |
4576 | | pMcFuncs->pMcChromaFunc = McChroma_AArch64_neon; |
4577 | | pMcFuncs->pfSampleAveraging = PixelAvg_AArch64_neon; |
4578 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_AArch64_neon;//iWidth+1:4/8/16 |
4579 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_AArch64_neon;//heigh+1:4/8/16 |
4580 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_AArch64_neon;//iWidth+1/heigh+1 |
4581 | | } |
4582 | | #endif |
4583 | |
|
4584 | | #if defined(HAVE_MMI) |
4585 | | if (uiCpuFlag & WELS_CPU_MMI) { |
4586 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_mmi; |
4587 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02Height5Or9Or17_mmi; |
4588 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17Height5Or9Or17_mmi; |
4589 | | pMcFuncs->pfSampleAveraging = PixelAvg_mmi; |
4590 | | pMcFuncs->pMcChromaFunc = McChroma_mmi; |
4591 | | pMcFuncs->pMcLumaFunc = McLuma_mmi; |
4592 | | } |
4593 | | #endif//HAVE_MMI |
4594 | |
|
4595 | | #if defined(HAVE_LSX) |
4596 | | if (uiCpuFlag & WELS_CPU_LSX) { |
4597 | | pMcFuncs->pMcChromaFunc = McChroma_lsx; |
4598 | | pMcFuncs->pfSampleAveraging = PixelAvg_lsx; |
4599 | | pMcFuncs->pMcLumaFunc = McLuma_lsx; |
4600 | | pMcFuncs->pfLumaHalfpelVer = McHorVer02_lsx; |
4601 | | pMcFuncs->pfLumaHalfpelHor = McHorVer20Width5Or9Or17_lsx; |
4602 | | pMcFuncs->pfLumaHalfpelCen = McHorVer22Width5Or9Or17_lsx; |
4603 | | } |
4604 | | #endif//HAVE_LSX |
4605 | 0 | } |