/src/FreeRDP/libfreerdp/primitives/neon/prim_YUV_neon.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /**  | 
2  |  |  * FreeRDP: A Remote Desktop Protocol Implementation  | 
3  |  |  * Optimized YUV/RGB conversion operations  | 
4  |  |  *  | 
5  |  |  * Copyright 2014 Thomas Erbesdobler  | 
6  |  |  * Copyright 2016-2017 Armin Novak <armin.novak@thincast.com>  | 
7  |  |  * Copyright 2016-2017 Norbert Federa <norbert.federa@thincast.com>  | 
8  |  |  * Copyright 2016-2017 Thincast Technologies GmbH  | 
9  |  |  *  | 
10  |  |  * Licensed under the Apache License, Version 2.0 (the "License");  | 
11  |  |  * you may not use this file except in compliance with the License.  | 
12  |  |  * You may obtain a copy of the License at  | 
13  |  |  *  | 
14  |  |  *     http://www.apache.org/licenses/LICENSE-2.0  | 
15  |  |  *  | 
16  |  |  * Unless required by applicable law or agreed to in writing, software  | 
17  |  |  * distributed under the License is distributed on an "AS IS" BASIS,  | 
18  |  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  | 
19  |  |  * See the License for the specific language governing permissions and  | 
20  |  |  * limitations under the License.  | 
21  |  |  */  | 
22  |  |  | 
23  |  | #include <freerdp/config.h>  | 
24  |  |  | 
25  |  | #include <winpr/sysinfo.h>  | 
26  |  | #include <winpr/crt.h>  | 
27  |  | #include <freerdp/types.h>  | 
28  |  | #include <freerdp/primitives.h>  | 
29  |  |  | 
30  |  | #include "prim_internal.h"  | 
31  |  | #include "prim_YUV.h"  | 
32  |  |  | 
33  |  | #if defined(NEON_ENABLED)  | 
34  |  | #include <arm_neon.h>  | 
35  |  |  | 
/* Primitives table obtained from primitives_get_generic() in primitives_init_YUV_neon().
 * The NEON RGB converters call through it for destination formats they do not handle. */
static primitives_t* generic = NULL;
37  |  |  | 
38  |  | static INLINE uint8x8_t neon_YUV2R(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,  | 
39  |  |                                    int16x4_t Eh, int16x4_t El)  | 
40  |  | { | 
41  |  |   /* R = (256 * Y + 403 * (V - 128)) >> 8 */  | 
42  |  |   const int16x4_t c403 = vdup_n_s16(403);  | 
43  |  |   const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);  | 
44  |  |   const int32x4_t CEl = vmlal_s16(Cl, El, c403);  | 
45  |  |   const int32x4_t Rh = vrshrq_n_s32(CEh, 8);  | 
46  |  |   const int32x4_t Rl = vrshrq_n_s32(CEl, 8);  | 
47  |  |   const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));  | 
48  |  |   return vqmovun_s16(R);  | 
49  |  | }  | 
50  |  |  | 
51  |  | static INLINE uint8x8_t neon_YUV2G(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,  | 
52  |  |                                    int16x4_t Eh, int16x4_t El)  | 
53  |  | { | 
54  |  |   /* G = (256L * Y -  48 * (U - 128) - 120 * (V - 128)) >> 8 */  | 
55  |  |   const int16x4_t c48 = vdup_n_s16(48);  | 
56  |  |   const int16x4_t c120 = vdup_n_s16(120);  | 
57  |  |   const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);  | 
58  |  |   const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);  | 
59  |  |   const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);  | 
60  |  |   const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);  | 
61  |  |   const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);  | 
62  |  |   const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);  | 
63  |  |   const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));  | 
64  |  |   return vqmovun_s16(G);  | 
65  |  | }  | 
66  |  |  | 
67  |  | static INLINE uint8x8_t neon_YUV2B(int32x4_t Ch, int32x4_t Cl, int16x4_t Dh, int16x4_t Dl,  | 
68  |  |                                    int16x4_t Eh, int16x4_t El)  | 
69  |  | { | 
70  |  |   /* B = (256L * Y + 475 * (U - 128)) >> 8*/  | 
71  |  |   const int16x4_t c475 = vdup_n_s16(475);  | 
72  |  |   const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);  | 
73  |  |   const int32x4_t CDl = vmlal_s16(Ch, Dl, c475);  | 
74  |  |   const int32x4_t Bh = vrshrq_n_s32(CDh, 8);  | 
75  |  |   const int32x4_t Bl = vrshrq_n_s32(CDl, 8);  | 
76  |  |   const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));  | 
77  |  |   return vqmovun_s16(B);  | 
78  |  | }  | 
79  |  |  | 
80  |  | static INLINE BYTE* neon_YuvToRgbPixel(BYTE* pRGB, int16x8_t Y, int16x8_t D, int16x8_t E,  | 
81  |  |                                        const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,  | 
82  |  |                                        const uint8_t aPos)  | 
83  |  | { | 
84  |  |   uint8x8x4_t bgrx;  | 
85  |  |   const int32x4_t Ch = vmulq_n_s32(vmovl_s16(vget_high_s16(Y)), 256); /* Y * 256 */  | 
86  |  |   const int32x4_t Cl = vmulq_n_s32(vmovl_s16(vget_low_s16(Y)), 256);  /* Y * 256 */  | 
87  |  |   const int16x4_t Dh = vget_high_s16(D);  | 
88  |  |   const int16x4_t Dl = vget_low_s16(D);  | 
89  |  |   const int16x4_t Eh = vget_high_s16(E);  | 
90  |  |   const int16x4_t El = vget_low_s16(E);  | 
91  |  |   { | 
92  |  |     /* B = (256L * Y + 475 * (U - 128)) >> 8*/  | 
93  |  |     const int16x4_t c475 = vdup_n_s16(475);  | 
94  |  |     const int32x4_t CDh = vmlal_s16(Ch, Dh, c475);  | 
95  |  |     const int32x4_t CDl = vmlal_s16(Cl, Dl, c475);  | 
96  |  |     const int32x4_t Bh = vrshrq_n_s32(CDh, 8);  | 
97  |  |     const int32x4_t Bl = vrshrq_n_s32(CDl, 8);  | 
98  |  |     const int16x8_t B = vcombine_s16(vqmovn_s32(Bl), vqmovn_s32(Bh));  | 
99  |  |     bgrx.val[bPos] = vqmovun_s16(B);  | 
100  |  |   }  | 
101  |  |   { | 
102  |  |     /* G = (256L * Y -  48 * (U - 128) - 120 * (V - 128)) >> 8 */  | 
103  |  |     const int16x4_t c48 = vdup_n_s16(48);  | 
104  |  |     const int16x4_t c120 = vdup_n_s16(120);  | 
105  |  |     const int32x4_t CDh = vmlsl_s16(Ch, Dh, c48);  | 
106  |  |     const int32x4_t CDl = vmlsl_s16(Cl, Dl, c48);  | 
107  |  |     const int32x4_t CDEh = vmlsl_s16(CDh, Eh, c120);  | 
108  |  |     const int32x4_t CDEl = vmlsl_s16(CDl, El, c120);  | 
109  |  |     const int32x4_t Gh = vrshrq_n_s32(CDEh, 8);  | 
110  |  |     const int32x4_t Gl = vrshrq_n_s32(CDEl, 8);  | 
111  |  |     const int16x8_t G = vcombine_s16(vqmovn_s32(Gl), vqmovn_s32(Gh));  | 
112  |  |     bgrx.val[gPos] = vqmovun_s16(G);  | 
113  |  |   }  | 
114  |  |   { | 
115  |  |     /* R = (256 * Y + 403 * (V - 128)) >> 8 */  | 
116  |  |     const int16x4_t c403 = vdup_n_s16(403);  | 
117  |  |     const int32x4_t CEh = vmlal_s16(Ch, Eh, c403);  | 
118  |  |     const int32x4_t CEl = vmlal_s16(Cl, El, c403);  | 
119  |  |     const int32x4_t Rh = vrshrq_n_s32(CEh, 8);  | 
120  |  |     const int32x4_t Rl = vrshrq_n_s32(CEl, 8);  | 
121  |  |     const int16x8_t R = vcombine_s16(vqmovn_s32(Rl), vqmovn_s32(Rh));  | 
122  |  |     bgrx.val[rPos] = vqmovun_s16(R);  | 
123  |  |   }  | 
124  |  |   { | 
125  |  |     /* A */  | 
126  |  |     bgrx.val[aPos] = vdup_n_u8(0xFF);  | 
127  |  |   }  | 
128  |  |   vst4_u8(pRGB, bgrx);  | 
129  |  |   pRGB += 32;  | 
130  |  |   return pRGB;  | 
131  |  | }  | 
132  |  |  | 
133  |  | static INLINE pstatus_t neon_YUV420ToX(const BYTE* const WINPR_RESTRICT pSrc[3],  | 
134  |  |                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,  | 
135  |  |                                        UINT32 dstStep, const prim_size_t* WINPR_RESTRICT roi,  | 
136  |  |                                        const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,  | 
137  |  |                                        const uint8_t aPos)  | 
138  |  | { | 
139  |  |   const UINT32 nWidth = roi->width;  | 
140  |  |   const UINT32 nHeight = roi->height;  | 
141  |  |   const DWORD pad = nWidth % 16;  | 
142  |  |   const UINT32 yPad = srcStep[0] - roi->width;  | 
143  |  |   const UINT32 uPad = srcStep[1] - roi->width / 2;  | 
144  |  |   const UINT32 vPad = srcStep[2] - roi->width / 2;  | 
145  |  |   const UINT32 dPad = dstStep - roi->width * 4;  | 
146  |  |   const int16x8_t c128 = vdupq_n_s16(128);  | 
147  |  |  | 
148  |  |   for (UINT32 y = 0; y < nHeight; y += 2)  | 
149  |  |   { | 
150  |  |     const uint8_t* pY1 = pSrc[0] + y * srcStep[0];  | 
151  |  |     const uint8_t* pY2 = pY1 + srcStep[0];  | 
152  |  |     const uint8_t* pU = pSrc[1] + (y / 2) * srcStep[1];  | 
153  |  |     const uint8_t* pV = pSrc[2] + (y / 2) * srcStep[2];  | 
154  |  |     uint8_t* pRGB1 = pDst + y * dstStep;  | 
155  |  |     uint8_t* pRGB2 = pRGB1 + dstStep;  | 
156  |  |     const BOOL lastY = y >= nHeight - 1;  | 
157  |  |  | 
158  |  |     UINT32 x = 0;  | 
159  |  |     for (; x < nWidth - pad;)  | 
160  |  |     { | 
161  |  |       const uint8x8_t Uraw = vld1_u8(pU);  | 
162  |  |       const uint8x8x2_t Uu = vzip_u8(Uraw, Uraw);  | 
163  |  |       const int16x8_t U1 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[0]));  | 
164  |  |       const int16x8_t U2 = vreinterpretq_s16_u16(vmovl_u8(Uu.val[1]));  | 
165  |  |       const uint8x8_t Vraw = vld1_u8(pV);  | 
166  |  |       const uint8x8x2_t Vu = vzip_u8(Vraw, Vraw);  | 
167  |  |       const int16x8_t V1 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[0]));  | 
168  |  |       const int16x8_t V2 = vreinterpretq_s16_u16(vmovl_u8(Vu.val[1]));  | 
169  |  |       const int16x8_t D1 = vsubq_s16(U1, c128);  | 
170  |  |       const int16x8_t E1 = vsubq_s16(V1, c128);  | 
171  |  |       const int16x8_t D2 = vsubq_s16(U2, c128);  | 
172  |  |       const int16x8_t E2 = vsubq_s16(V2, c128);  | 
173  |  |       { | 
174  |  |         const uint8x8_t Y1u = vld1_u8(pY1);  | 
175  |  |         const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));  | 
176  |  |         pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D1, E1, rPos, gPos, bPos, aPos);  | 
177  |  |         pY1 += 8;  | 
178  |  |         x += 8;  | 
179  |  |       }  | 
180  |  |       { | 
181  |  |         const uint8x8_t Y1u = vld1_u8(pY1);  | 
182  |  |         const int16x8_t Y1 = vreinterpretq_s16_u16(vmovl_u8(Y1u));  | 
183  |  |         pRGB1 = neon_YuvToRgbPixel(pRGB1, Y1, D2, E2, rPos, gPos, bPos, aPos);  | 
184  |  |         pY1 += 8;  | 
185  |  |         x += 8;  | 
186  |  |       }  | 
187  |  |  | 
188  |  |       if (!lastY)  | 
189  |  |       { | 
190  |  |         { | 
191  |  |           const uint8x8_t Y2u = vld1_u8(pY2);  | 
192  |  |           const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));  | 
193  |  |           pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D1, E1, rPos, gPos, bPos, aPos);  | 
194  |  |           pY2 += 8;  | 
195  |  |         }  | 
196  |  |         { | 
197  |  |           const uint8x8_t Y2u = vld1_u8(pY2);  | 
198  |  |           const int16x8_t Y2 = vreinterpretq_s16_u16(vmovl_u8(Y2u));  | 
199  |  |           pRGB2 = neon_YuvToRgbPixel(pRGB2, Y2, D2, E2, rPos, gPos, bPos, aPos);  | 
200  |  |           pY2 += 8;  | 
201  |  |         }  | 
202  |  |       }  | 
203  |  |  | 
204  |  |       pU += 8;  | 
205  |  |       pV += 8;  | 
206  |  |     }  | 
207  |  |  | 
208  |  |     for (; x < nWidth; x++)  | 
209  |  |     { | 
210  |  |       const BYTE U = *pU;  | 
211  |  |       const BYTE V = *pV;  | 
212  |  |       { | 
213  |  |         const BYTE Y = *pY1++;  | 
214  |  |         const BYTE r = YUV2R(Y, U, V);  | 
215  |  |         const BYTE g = YUV2G(Y, U, V);  | 
216  |  |         const BYTE b = YUV2B(Y, U, V);  | 
217  |  |         pRGB1[aPos] = 0xFF;  | 
218  |  |         pRGB1[rPos] = r;  | 
219  |  |         pRGB1[gPos] = g;  | 
220  |  |         pRGB1[bPos] = b;  | 
221  |  |         pRGB1 += 4;  | 
222  |  |       }  | 
223  |  |  | 
224  |  |       if (!lastY)  | 
225  |  |       { | 
226  |  |         const BYTE Y = *pY2++;  | 
227  |  |         const BYTE r = YUV2R(Y, U, V);  | 
228  |  |         const BYTE g = YUV2G(Y, U, V);  | 
229  |  |         const BYTE b = YUV2B(Y, U, V);  | 
230  |  |         pRGB2[aPos] = 0xFF;  | 
231  |  |         pRGB2[rPos] = r;  | 
232  |  |         pRGB2[gPos] = g;  | 
233  |  |         pRGB2[bPos] = b;  | 
234  |  |         pRGB2 += 4;  | 
235  |  |       }  | 
236  |  |  | 
237  |  |       if (x % 2)  | 
238  |  |       { | 
239  |  |         pU++;  | 
240  |  |         pV++;  | 
241  |  |       }  | 
242  |  |     }  | 
243  |  |  | 
244  |  |     pRGB1 += dPad;  | 
245  |  |     pRGB2 += dPad;  | 
246  |  |     pY1 += yPad;  | 
247  |  |     pY2 += yPad;  | 
248  |  |     pU += uPad;  | 
249  |  |     pV += vPad;  | 
250  |  |   }  | 
251  |  |  | 
252  |  |   return PRIMITIVES_SUCCESS;  | 
253  |  | }  | 
254  |  |  | 
255  |  | static pstatus_t neon_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT const pSrc[3],  | 
256  |  |                                             const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,  | 
257  |  |                                             UINT32 dstStep, UINT32 DstFormat,  | 
258  |  |                                             const prim_size_t* WINPR_RESTRICT roi)  | 
259  |  | { | 
260  |  |   switch (DstFormat)  | 
261  |  |   { | 
262  |  |     case PIXEL_FORMAT_BGRA32:  | 
263  |  |     case PIXEL_FORMAT_BGRX32:  | 
264  |  |       return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);  | 
265  |  |  | 
266  |  |     case PIXEL_FORMAT_RGBA32:  | 
267  |  |     case PIXEL_FORMAT_RGBX32:  | 
268  |  |       return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);  | 
269  |  |  | 
270  |  |     case PIXEL_FORMAT_ARGB32:  | 
271  |  |     case PIXEL_FORMAT_XRGB32:  | 
272  |  |       return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);  | 
273  |  |  | 
274  |  |     case PIXEL_FORMAT_ABGR32:  | 
275  |  |     case PIXEL_FORMAT_XBGR32:  | 
276  |  |       return neon_YUV420ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);  | 
277  |  |  | 
278  |  |     default:  | 
279  |  |       return generic->YUV420ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);  | 
280  |  |   }  | 
281  |  | }  | 
282  |  |  | 
283  |  | static INLINE pstatus_t neon_YUV444ToX(const BYTE* const WINPR_RESTRICT pSrc[3],  | 
284  |  |                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,  | 
285  |  |                                        UINT32 dstStep, const prim_size_t* WINPR_RESTRICT roi,  | 
286  |  |                                        const uint8_t rPos, const uint8_t gPos, const uint8_t bPos,  | 
287  |  |                                        const uint8_t aPos)  | 
288  |  | { | 
289  |  |   const UINT32 nWidth = roi->width;  | 
290  |  |   const UINT32 nHeight = roi->height;  | 
291  |  |   const UINT32 yPad = srcStep[0] - roi->width;  | 
292  |  |   const UINT32 uPad = srcStep[1] - roi->width;  | 
293  |  |   const UINT32 vPad = srcStep[2] - roi->width;  | 
294  |  |   const UINT32 dPad = dstStep - roi->width * 4;  | 
295  |  |   const uint8_t* pY = pSrc[0];  | 
296  |  |   const uint8_t* pU = pSrc[1];  | 
297  |  |   const uint8_t* pV = pSrc[2];  | 
298  |  |   uint8_t* pRGB = pDst;  | 
299  |  |   const int16x8_t c128 = vdupq_n_s16(128);  | 
300  |  |   const DWORD pad = nWidth % 8;  | 
301  |  |  | 
302  |  |   for (UINT32 y = 0; y < nHeight; y++)  | 
303  |  |   { | 
304  |  |     for (UINT32 x = 0; x < nWidth - pad; x += 8)  | 
305  |  |     { | 
306  |  |       const uint8x8_t Yu = vld1_u8(pY);  | 
307  |  |       const int16x8_t Y = vreinterpretq_s16_u16(vmovl_u8(Yu));  | 
308  |  |       const uint8x8_t Uu = vld1_u8(pU);  | 
309  |  |       const int16x8_t U = vreinterpretq_s16_u16(vmovl_u8(Uu));  | 
310  |  |       const uint8x8_t Vu = vld1_u8(pV);  | 
311  |  |       const int16x8_t V = vreinterpretq_s16_u16(vmovl_u8(Vu));  | 
312  |  |       /* Do the calculations on Y in 32bit width, the result of 255 * 256 does not fit  | 
313  |  |        * a signed 16 bit value. */  | 
314  |  |       const int16x8_t D = vsubq_s16(U, c128);  | 
315  |  |       const int16x8_t E = vsubq_s16(V, c128);  | 
316  |  |       pRGB = neon_YuvToRgbPixel(pRGB, Y, D, E, rPos, gPos, bPos, aPos);  | 
317  |  |       pY += 8;  | 
318  |  |       pU += 8;  | 
319  |  |       pV += 8;  | 
320  |  |     }  | 
321  |  |  | 
322  |  |     for (UINT32 x = 0; x < pad; x++)  | 
323  |  |     { | 
324  |  |       const BYTE Y = *pY++;  | 
325  |  |       const BYTE U = *pU++;  | 
326  |  |       const BYTE V = *pV++;  | 
327  |  |       const BYTE r = YUV2R(Y, U, V);  | 
328  |  |       const BYTE g = YUV2G(Y, U, V);  | 
329  |  |       const BYTE b = YUV2B(Y, U, V);  | 
330  |  |       pRGB[aPos] = 0xFF;  | 
331  |  |       pRGB[rPos] = r;  | 
332  |  |       pRGB[gPos] = g;  | 
333  |  |       pRGB[bPos] = b;  | 
334  |  |       pRGB += 4;  | 
335  |  |     }  | 
336  |  |  | 
337  |  |     pRGB += dPad;  | 
338  |  |     pY += yPad;  | 
339  |  |     pU += uPad;  | 
340  |  |     pV += vPad;  | 
341  |  |   }  | 
342  |  |  | 
343  |  |   return PRIMITIVES_SUCCESS;  | 
344  |  | }  | 
345  |  |  | 
346  |  | static pstatus_t neon_YUV444ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT const pSrc[3],  | 
347  |  |                                             const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDst,  | 
348  |  |                                             UINT32 dstStep, UINT32 DstFormat,  | 
349  |  |                                             const prim_size_t* WINPR_RESTRICT roi)  | 
350  |  | { | 
351  |  |   switch (DstFormat)  | 
352  |  |   { | 
353  |  |     case PIXEL_FORMAT_BGRA32:  | 
354  |  |     case PIXEL_FORMAT_BGRX32:  | 
355  |  |       return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 2, 1, 0, 3);  | 
356  |  |  | 
357  |  |     case PIXEL_FORMAT_RGBA32:  | 
358  |  |     case PIXEL_FORMAT_RGBX32:  | 
359  |  |       return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 0, 1, 2, 3);  | 
360  |  |  | 
361  |  |     case PIXEL_FORMAT_ARGB32:  | 
362  |  |     case PIXEL_FORMAT_XRGB32:  | 
363  |  |       return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 1, 2, 3, 0);  | 
364  |  |  | 
365  |  |     case PIXEL_FORMAT_ABGR32:  | 
366  |  |     case PIXEL_FORMAT_XBGR32:  | 
367  |  |       return neon_YUV444ToX(pSrc, srcStep, pDst, dstStep, roi, 3, 2, 1, 0);  | 
368  |  |  | 
369  |  |     default:  | 
370  |  |       return generic->YUV444ToRGB_8u_P3AC4R(pSrc, srcStep, pDst, dstStep, DstFormat, roi);  | 
371  |  |   }  | 
372  |  | }  | 
373  |  |  | 
374  |  | static pstatus_t neon_LumaToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],  | 
375  |  |                                    const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],  | 
376  |  |                                    const UINT32 dstStep[3], const RECTANGLE_16* WINPR_RESTRICT roi)  | 
377  |  | { | 
378  |  |   const UINT32 nWidth = roi->right - roi->left;  | 
379  |  |   const UINT32 nHeight = roi->bottom - roi->top;  | 
380  |  |   const UINT32 halfWidth = (nWidth + 1) / 2;  | 
381  |  |   const UINT32 halfHeight = (nHeight + 1) / 2;  | 
382  |  |   const UINT32 evenY = 0;  | 
383  |  |   const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, | 
384  |  |                         pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,  | 
385  |  |                         pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };  | 
386  |  |   BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, | 
387  |  |                   pDstRaw[1] + roi->top * dstStep[1] + roi->left,  | 
388  |  |                   pDstRaw[2] + roi->top * dstStep[2] + roi->left };  | 
389  |  |  | 
390  |  |   /* Y data is already here... */  | 
391  |  |   /* B1 */  | 
392  |  |   for (UINT32 y = 0; y < nHeight; y++)  | 
393  |  |   { | 
394  |  |     const BYTE* Ym = pSrc[0] + srcStep[0] * y;  | 
395  |  |     BYTE* pY = pDst[0] + dstStep[0] * y;  | 
396  |  |     memcpy(pY, Ym, nWidth);  | 
397  |  |   }  | 
398  |  |  | 
399  |  |   /* The first half of U, V are already here part of this frame. */  | 
400  |  |   /* B2 and B3 */  | 
401  |  |   for (UINT32 y = 0; y < halfHeight; y++)  | 
402  |  |   { | 
403  |  |     const UINT32 val2y = (2 * y + evenY);  | 
404  |  |     const BYTE* Um = pSrc[1] + srcStep[1] * y;  | 
405  |  |     const BYTE* Vm = pSrc[2] + srcStep[2] * y;  | 
406  |  |     BYTE* pU = pDst[1] + dstStep[1] * val2y;  | 
407  |  |     BYTE* pV = pDst[2] + dstStep[2] * val2y;  | 
408  |  |     BYTE* pU1 = pU + dstStep[1];  | 
409  |  |     BYTE* pV1 = pV + dstStep[2];  | 
410  |  |  | 
411  |  |     UINT32 x = 0;  | 
412  |  |     for (; x + 16 < halfWidth; x += 16)  | 
413  |  |     { | 
414  |  |       { | 
415  |  |         const uint8x16_t u = vld1q_u8(Um);  | 
416  |  |         uint8x16x2_t u2x;  | 
417  |  |         u2x.val[0] = u;  | 
418  |  |         u2x.val[1] = u;  | 
419  |  |         vst2q_u8(pU, u2x);  | 
420  |  |         vst2q_u8(pU1, u2x);  | 
421  |  |         Um += 16;  | 
422  |  |         pU += 32;  | 
423  |  |         pU1 += 32;  | 
424  |  |       }  | 
425  |  |       { | 
426  |  |         const uint8x16_t v = vld1q_u8(Vm);  | 
427  |  |         uint8x16x2_t v2x;  | 
428  |  |         v2x.val[0] = v;  | 
429  |  |         v2x.val[1] = v;  | 
430  |  |         vst2q_u8(pV, v2x);  | 
431  |  |         vst2q_u8(pV1, v2x);  | 
432  |  |         Vm += 16;  | 
433  |  |         pV += 32;  | 
434  |  |         pV1 += 32;  | 
435  |  |       }  | 
436  |  |     }  | 
437  |  |  | 
438  |  |     for (; x < halfWidth; x++)  | 
439  |  |     { | 
440  |  |       const BYTE u = *Um++;  | 
441  |  |       const BYTE v = *Vm++;  | 
442  |  |       *pU++ = u;  | 
443  |  |       *pU++ = u;  | 
444  |  |       *pU1++ = u;  | 
445  |  |       *pU1++ = u;  | 
446  |  |       *pV++ = v;  | 
447  |  |       *pV++ = v;  | 
448  |  |       *pV1++ = v;  | 
449  |  |       *pV1++ = v;  | 
450  |  |     }  | 
451  |  |   }  | 
452  |  |  | 
453  |  |   return PRIMITIVES_SUCCESS;  | 
454  |  | }  | 
455  |  |  | 
456  |  | static pstatus_t neon_ChromaFilter(BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],  | 
457  |  |                                    const RECTANGLE_16* WINPR_RESTRICT roi)  | 
458  |  | { | 
459  |  |   const UINT32 oddY = 1;  | 
460  |  |   const UINT32 evenY = 0;  | 
461  |  |   const UINT32 nWidth = roi->right - roi->left;  | 
462  |  |   const UINT32 nHeight = roi->bottom - roi->top;  | 
463  |  |   const UINT32 halfHeight = (nHeight + 1) / 2;  | 
464  |  |   const UINT32 halfWidth = (nWidth + 1) / 2;  | 
465  |  |   const UINT32 halfPad = halfWidth % 16;  | 
466  |  |  | 
467  |  |   /* Filter */  | 
468  |  |   for (UINT32 y = roi->top; y < halfHeight + roi->top; y++)  | 
469  |  |   { | 
470  |  |     const UINT32 val2y = (y * 2 + evenY);  | 
471  |  |     const UINT32 val2y1 = val2y + oddY;  | 
472  |  |     BYTE* pU1 = pDst[1] + dstStep[1] * val2y1;  | 
473  |  |     BYTE* pV1 = pDst[2] + dstStep[2] * val2y1;  | 
474  |  |     BYTE* pU = pDst[1] + dstStep[1] * val2y;  | 
475  |  |     BYTE* pV = pDst[2] + dstStep[2] * val2y;  | 
476  |  |  | 
477  |  |     if (val2y1 > nHeight)  | 
478  |  |       continue;  | 
479  |  |  | 
480  |  |     UINT32 x = roi->left / 2;  | 
481  |  |     for (; x < halfWidth + roi->left / 2 - halfPad; x += 16)  | 
482  |  |     { | 
483  |  |       { | 
484  |  |         /* U = (U2x,2y << 2) - U2x1,2y - U2x,2y1 - U2x1,2y1 */  | 
485  |  |         uint8x8x2_t u = vld2_u8(&pU[2 * x]);  | 
486  |  |         const int16x8_t up =  | 
487  |  |             vreinterpretq_s16_u16(vshll_n_u8(u.val[0], 2)); /* Ux2,2y << 2 */  | 
488  |  |         const uint8x8x2_t u1 = vld2_u8(&pU1[2 * x]);  | 
489  |  |         const uint16x8_t usub = vaddl_u8(u1.val[1], u1.val[0]); /* U2x,2y1 + U2x1,2y1 */  | 
490  |  |         const int16x8_t us = vreinterpretq_s16_u16(  | 
491  |  |             vaddw_u8(usub, u.val[1])); /* U2x1,2y + U2x,2y1 + U2x1,2y1 */  | 
492  |  |         const int16x8_t un = vsubq_s16(up, us);  | 
493  |  |         const uint8x8_t u8 = vqmovun_s16(un); /* CLIP(un) */  | 
494  |  |         u.val[0] = u8;  | 
495  |  |         vst2_u8(&pU[2 * x], u);  | 
496  |  |       }  | 
497  |  |       { | 
498  |  |         /* V = (V2x,2y << 2) - V2x1,2y - V2x,2y1 - V2x1,2y1 */  | 
499  |  |         uint8x8x2_t v = vld2_u8(&pV[2 * x]);  | 
500  |  |         const int16x8_t vp =  | 
501  |  |             vreinterpretq_s16_u16(vshll_n_u8(v.val[0], 2)); /* Vx2,2y << 2 */  | 
502  |  |         const uint8x8x2_t v1 = vld2_u8(&pV1[2 * x]);  | 
503  |  |         const uint16x8_t vsub = vaddl_u8(v1.val[1], v1.val[0]); /* V2x,2y1 + V2x1,2y1 */  | 
504  |  |         const int16x8_t vs = vreinterpretq_s16_u16(  | 
505  |  |             vaddw_u8(vsub, v.val[1])); /* V2x1,2y + V2x,2y1 + V2x1,2y1 */  | 
506  |  |         const int16x8_t vn = vsubq_s16(vp, vs);  | 
507  |  |         const uint8x8_t v8 = vqmovun_s16(vn); /* CLIP(vn) */  | 
508  |  |         v.val[0] = v8;  | 
509  |  |         vst2_u8(&pV[2 * x], v);  | 
510  |  |       }  | 
511  |  |     }  | 
512  |  |  | 
513  |  |     for (; x < halfWidth + roi->left / 2; x++)  | 
514  |  |     { | 
515  |  |       const UINT32 val2x = (x * 2);  | 
516  |  |       const UINT32 val2x1 = val2x + 1;  | 
517  |  |       const BYTE inU = pU[val2x];  | 
518  |  |       const BYTE inV = pV[val2x];  | 
519  |  |       const INT32 up = inU * 4;  | 
520  |  |       const INT32 vp = inV * 4;  | 
521  |  |       INT32 u2020;  | 
522  |  |       INT32 v2020;  | 
523  |  |  | 
524  |  |       if (val2x1 > nWidth)  | 
525  |  |         continue;  | 
526  |  |  | 
527  |  |       u2020 = up - pU[val2x1] - pU1[val2x] - pU1[val2x1];  | 
528  |  |       v2020 = vp - pV[val2x1] - pV1[val2x] - pV1[val2x1];  | 
529  |  |       pU[val2x] = CONDITIONAL_CLIP(u2020, inU);  | 
530  |  |       pV[val2x] = CONDITIONAL_CLIP(v2020, inV);  | 
531  |  |     }  | 
532  |  |   }  | 
533  |  |  | 
534  |  |   return PRIMITIVES_SUCCESS;  | 
535  |  | }  | 
536  |  |  | 
537  |  | static pstatus_t neon_ChromaV1ToYUV444(const BYTE* const WINPR_RESTRICT pSrcRaw[3],  | 
538  |  |                                        const UINT32 srcStep[3], BYTE* WINPR_RESTRICT pDstRaw[3],  | 
539  |  |                                        const UINT32 dstStep[3],  | 
540  |  |                                        const RECTANGLE_16* WINPR_RESTRICT roi)  | 
541  |  | { | 
542  |  |   const UINT32 mod = 16;  | 
543  |  |   UINT32 uY = 0;  | 
544  |  |   UINT32 vY = 0;  | 
545  |  |   const UINT32 nWidth = roi->right - roi->left;  | 
546  |  |   const UINT32 nHeight = roi->bottom - roi->top;  | 
547  |  |   const UINT32 halfWidth = (nWidth) / 2;  | 
548  |  |   const UINT32 halfHeight = (nHeight) / 2;  | 
549  |  |   const UINT32 oddY = 1;  | 
550  |  |   const UINT32 evenY = 0;  | 
551  |  |   const UINT32 oddX = 1;  | 
552  |  |   /* The auxilary frame is aligned to multiples of 16x16.  | 
553  |  |    * We need the padded height for B4 and B5 conversion. */  | 
554  |  |   const UINT32 padHeigth = nHeight + 16 - nHeight % 16;  | 
555  |  |   const UINT32 halfPad = halfWidth % 16;  | 
556  |  |   const BYTE* pSrc[3] = { pSrcRaw[0] + roi->top * srcStep[0] + roi->left, | 
557  |  |                         pSrcRaw[1] + roi->top / 2 * srcStep[1] + roi->left / 2,  | 
558  |  |                         pSrcRaw[2] + roi->top / 2 * srcStep[2] + roi->left / 2 };  | 
559  |  |   BYTE* pDst[3] = { pDstRaw[0] + roi->top * dstStep[0] + roi->left, | 
560  |  |                   pDstRaw[1] + roi->top * dstStep[1] + roi->left,  | 
561  |  |                   pDstRaw[2] + roi->top * dstStep[2] + roi->left };  | 
562  |  |  | 
563  |  |   /* The second half of U and V is a bit more tricky... */  | 
564  |  |   /* B4 and B5 */  | 
565  |  |   for (UINT32 y = 0; y < padHeigth; y++)  | 
566  |  |   { | 
567  |  |     const BYTE* Ya = pSrc[0] + srcStep[0] * y;  | 
568  |  |     BYTE* pX;  | 
569  |  |  | 
570  |  |     if ((y) % mod < (mod + 1) / 2)  | 
571  |  |     { | 
572  |  |       const UINT32 pos = (2 * uY++ + oddY);  | 
573  |  |  | 
574  |  |       if (pos >= nHeight)  | 
575  |  |         continue;  | 
576  |  |  | 
577  |  |       pX = pDst[1] + dstStep[1] * pos;  | 
578  |  |     }  | 
579  |  |     else  | 
580  |  |     { | 
581  |  |       const UINT32 pos = (2 * vY++ + oddY);  | 
582  |  |  | 
583  |  |       if (pos >= nHeight)  | 
584  |  |         continue;  | 
585  |  |  | 
586  |  |       pX = pDst[2] + dstStep[2] * pos;  | 
587  |  |     }  | 
588  |  |  | 
589  |  |     memcpy(pX, Ya, nWidth);  | 
590  |  |   }  | 
591  |  |  | 
592  |  |   /* B6 and B7 */  | 
593  |  |   for (UINT32 y = 0; y < halfHeight; y++)  | 
594  |  |   { | 
595  |  |     const UINT32 val2y = (y * 2 + evenY);  | 
596  |  |     const BYTE* Ua = pSrc[1] + srcStep[1] * y;  | 
597  |  |     const BYTE* Va = pSrc[2] + srcStep[2] * y;  | 
598  |  |     BYTE* pU = pDst[1] + dstStep[1] * val2y;  | 
599  |  |     BYTE* pV = pDst[2] + dstStep[2] * val2y;  | 
600  |  |  | 
601  |  |     UINT32 x = 0;  | 
602  |  |     for (; x < halfWidth - halfPad; x += 16)  | 
603  |  |     { | 
604  |  |       { | 
605  |  |         uint8x16x2_t u = vld2q_u8(&pU[2 * x]);  | 
606  |  |         u.val[1] = vld1q_u8(&Ua[x]);  | 
607  |  |         vst2q_u8(&pU[2 * x], u);  | 
608  |  |       }  | 
609  |  |       { | 
610  |  |         uint8x16x2_t v = vld2q_u8(&pV[2 * x]);  | 
611  |  |         v.val[1] = vld1q_u8(&Va[x]);  | 
612  |  |         vst2q_u8(&pV[2 * x], v);  | 
613  |  |       }  | 
614  |  |     }  | 
615  |  |  | 
616  |  |     for (; x < halfWidth; x++)  | 
617  |  |     { | 
618  |  |       const UINT32 val2x1 = (x * 2 + oddX);  | 
619  |  |       pU[val2x1] = Ua[x];  | 
620  |  |       pV[val2x1] = Va[x];  | 
621  |  |     }  | 
622  |  |   }  | 
623  |  |  | 
624  |  |   /* Filter */  | 
625  |  |   return neon_ChromaFilter(pDst, dstStep, roi);  | 
626  |  | }  | 
627  |  |  | 
628  |  | static pstatus_t neon_ChromaV2ToYUV444(const BYTE* const WINPR_RESTRICT pSrc[3],  | 
629  |  |                                        const UINT32 srcStep[3], UINT32 nTotalWidth,  | 
630  |  |                                        UINT32 nTotalHeight, BYTE* WINPR_RESTRICT pDst[3],  | 
631  |  |                                        const UINT32 dstStep[3],  | 
632  |  |                                        const RECTANGLE_16* WINPR_RESTRICT roi)  | 
633  |  | { | 
634  |  |   const UINT32 nWidth = roi->right - roi->left;  | 
635  |  |   const UINT32 nHeight = roi->bottom - roi->top;  | 
636  |  |   const UINT32 halfWidth = (nWidth + 1) / 2;  | 
637  |  |   const UINT32 halfPad = halfWidth % 16;  | 
638  |  |   const UINT32 halfHeight = (nHeight + 1) / 2;  | 
639  |  |   const UINT32 quaterWidth = (nWidth + 3) / 4;  | 
640  |  |   const UINT32 quaterPad = quaterWidth % 16;  | 
641  |  |  | 
642  |  |   /* B4 and B5: odd UV values for width/2, height */  | 
643  |  |   for (UINT32 y = 0; y < nHeight; y++)  | 
644  |  |   { | 
645  |  |     const UINT32 yTop = y + roi->top;  | 
646  |  |     const BYTE* pYaU = pSrc[0] + srcStep[0] * yTop + roi->left / 2;  | 
647  |  |     const BYTE* pYaV = pYaU + nTotalWidth / 2;  | 
648  |  |     BYTE* pU = pDst[1] + dstStep[1] * yTop + roi->left;  | 
649  |  |     BYTE* pV = pDst[2] + dstStep[2] * yTop + roi->left;  | 
650  |  |  | 
651  |  |     UINT32 x = 0;  | 
652  |  |     for (; x < halfWidth - halfPad; x += 16)  | 
653  |  |     { | 
654  |  |       { | 
655  |  |         uint8x16x2_t u = vld2q_u8(&pU[2 * x]);  | 
656  |  |         u.val[1] = vld1q_u8(&pYaU[x]);  | 
657  |  |         vst2q_u8(&pU[2 * x], u);  | 
658  |  |       }  | 
659  |  |       { | 
660  |  |         uint8x16x2_t v = vld2q_u8(&pV[2 * x]);  | 
661  |  |         v.val[1] = vld1q_u8(&pYaV[x]);  | 
662  |  |         vst2q_u8(&pV[2 * x], v);  | 
663  |  |       }  | 
664  |  |     }  | 
665  |  |  | 
666  |  |     for (; x < halfWidth; x++)  | 
667  |  |     { | 
668  |  |       const UINT32 odd = 2 * x + 1;  | 
669  |  |       pU[odd] = pYaU[x];  | 
670  |  |       pV[odd] = pYaV[x];  | 
671  |  |     }  | 
672  |  |   }  | 
673  |  |  | 
674  |  |   /* B6 - B9 */  | 
675  |  |   for (UINT32 y = 0; y < halfHeight; y++)  | 
676  |  |   { | 
677  |  |     const BYTE* pUaU = pSrc[1] + srcStep[1] * (y + roi->top / 2) + roi->left / 4;  | 
678  |  |     const BYTE* pUaV = pUaU + nTotalWidth / 4;  | 
679  |  |     const BYTE* pVaU = pSrc[2] + srcStep[2] * (y + roi->top / 2) + roi->left / 4;  | 
680  |  |     const BYTE* pVaV = pVaU + nTotalWidth / 4;  | 
681  |  |     BYTE* pU = pDst[1] + dstStep[1] * (2 * y + 1 + roi->top) + roi->left;  | 
682  |  |     BYTE* pV = pDst[2] + dstStep[2] * (2 * y + 1 + roi->top) + roi->left;  | 
683  |  |  | 
684  |  |     UINT32 x = 0;  | 
685  |  |     for (; x < quaterWidth - quaterPad; x += 16)  | 
686  |  |     { | 
687  |  |       { | 
688  |  |         uint8x16x4_t u = vld4q_u8(&pU[4 * x]);  | 
689  |  |         u.val[0] = vld1q_u8(&pUaU[x]);  | 
690  |  |         u.val[2] = vld1q_u8(&pVaU[x]);  | 
691  |  |         vst4q_u8(&pU[4 * x], u);  | 
692  |  |       }  | 
693  |  |       { | 
694  |  |         uint8x16x4_t v = vld4q_u8(&pV[4 * x]);  | 
695  |  |         v.val[0] = vld1q_u8(&pUaV[x]);  | 
696  |  |         v.val[2] = vld1q_u8(&pVaV[x]);  | 
697  |  |         vst4q_u8(&pV[4 * x], v);  | 
698  |  |       }  | 
699  |  |     }  | 
700  |  |  | 
701  |  |     for (; x < quaterWidth; x++)  | 
702  |  |     { | 
703  |  |       pU[4 * x + 0] = pUaU[x];  | 
704  |  |       pV[4 * x + 0] = pUaV[x];  | 
705  |  |       pU[4 * x + 2] = pVaU[x];  | 
706  |  |       pV[4 * x + 2] = pVaV[x];  | 
707  |  |     }  | 
708  |  |   }  | 
709  |  |  | 
710  |  |   return neon_ChromaFilter(pDst, dstStep, roi);  | 
711  |  | }  | 
712  |  |  | 
713  |  | static pstatus_t neon_YUV420CombineToYUV444(avc444_frame_type type,  | 
714  |  |                                             const BYTE* const WINPR_RESTRICT pSrc[3],  | 
715  |  |                                             const UINT32 srcStep[3], UINT32 nWidth, UINT32 nHeight,  | 
716  |  |                                             BYTE* WINPR_RESTRICT pDst[3], const UINT32 dstStep[3],  | 
717  |  |                                             const RECTANGLE_16* WINPR_RESTRICT roi)  | 
718  |  | { | 
719  |  |   if (!pSrc || !pSrc[0] || !pSrc[1] || !pSrc[2])  | 
720  |  |     return -1;  | 
721  |  |  | 
722  |  |   if (!pDst || !pDst[0] || !pDst[1] || !pDst[2])  | 
723  |  |     return -1;  | 
724  |  |  | 
725  |  |   if (!roi)  | 
726  |  |     return -1;  | 
727  |  |  | 
728  |  |   switch (type)  | 
729  |  |   { | 
730  |  |     case AVC444_LUMA:  | 
731  |  |       return neon_LumaToYUV444(pSrc, srcStep, pDst, dstStep, roi);  | 
732  |  |  | 
733  |  |     case AVC444_CHROMAv1:  | 
734  |  |       return neon_ChromaV1ToYUV444(pSrc, srcStep, pDst, dstStep, roi);  | 
735  |  |  | 
736  |  |     case AVC444_CHROMAv2:  | 
737  |  |       return neon_ChromaV2ToYUV444(pSrc, srcStep, nWidth, nHeight, pDst, dstStep, roi);  | 
738  |  |  | 
739  |  |     default:  | 
740  |  |       return -1;  | 
741  |  |   }  | 
742  |  | }  | 
743  |  | #endif  | 
744  |  |  | 
745  |  | void primitives_init_YUV_neon(primitives_t* prims)  | 
746  | 0  | { | 
747  |  | #if defined(NEON_ENABLED)  | 
748  |  |   generic = primitives_get_generic();  | 
749  |  |   primitives_init_YUV(prims);  | 
750  |  |  | 
751  |  |   if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))  | 
752  |  |   { | 
753  |  |     WLog_VRB(PRIM_TAG, "NEON optimizations");  | 
754  |  |     prims->YUV420ToRGB_8u_P3AC4R = neon_YUV420ToRGB_8u_P3AC4R;  | 
755  |  |     prims->YUV444ToRGB_8u_P3AC4R = neon_YUV444ToRGB_8u_P3AC4R;  | 
756  |  |     prims->YUV420CombineToYUV444 = neon_YUV420CombineToYUV444;  | 
757  |  |   }  | 
758  |  | #else  | 
759  | 0  |   WLog_VRB(PRIM_TAG, "undefined WITH_NEON");  | 
760  | 0  |   WINPR_UNUSED(prims);  | 
761  | 0  | #endif  | 
762  | 0  | }  |