/src/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* FreeRDP: A Remote Desktop Protocol Client  | 
2  |  |  * Copy operations.  | 
3  |  |  * vi:ts=4 sw=4:  | 
4  |  |  *  | 
5  |  |  * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.  | 
6  |  |  * Licensed under the Apache License, Version 2.0 (the "License"); you may  | 
7  |  |  * not use this file except in compliance with the License. You may obtain  | 
8  |  |  * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.  | 
9  |  |  * Unless required by applicable law or agreed to in writing, software  | 
10  |  |  * distributed under the License is distributed on an "AS IS" BASIS,  | 
11  |  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express  | 
12  |  |  * or implied. See the License for the specific language governing  | 
13  |  |  * permissions and limitations under the License.  | 
14  |  |  */  | 
15  |  |  | 
16  |  | #include <winpr/sysinfo.h>  | 
17  |  |  | 
18  |  | #include <freerdp/config.h>  | 
19  |  |  | 
20  |  | #include <string.h>  | 
21  |  | #include <freerdp/types.h>  | 
22  |  | #include <freerdp/primitives.h>  | 
23  |  | #include <freerdp/log.h>  | 
24  |  |  | 
25  |  | #include "prim_internal.h"  | 
26  |  | #include "prim_copy.h"  | 
27  |  | #include "../codec/color.h"  | 
28  |  |  | 
29  |  | #include <freerdp/codec/color.h>  | 
30  |  |  | 
31  |  | #define TAG FREERDP_TAG("primitives.copy") | 
32  |  |  | 
33  |  | #if defined(SSE2_ENABLED)  | 
34  |  | #include <emmintrin.h>  | 
35  |  | #include <immintrin.h>  | 
36  |  |  | 
37  |  | static INLINE pstatus_t avx2_image_copy_no_overlap_convert(  | 
38  |  |     BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,  | 
39  |  |     UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,  | 
40  |  |     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,  | 
41  |  |     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);  | 
42  |  |  | 
43  |  | static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,  | 
44  |  |                                                      UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,  | 
45  |  |                                                      UINT32 nHeight,  | 
46  |  |                                                      const BYTE* WINPR_RESTRICT pSrcData,  | 
47  |  |                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,  | 
48  |  |                                                      SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,  | 
49  |  |                                                      SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)  | 
50  |  | { | 
51  |  |  | 
52  |  |   const SSIZE_T srcByte = 3;  | 
53  |  |   const SSIZE_T dstByte = 4;  | 
54  |  |  | 
55  |  |   const __m256i mask = _mm256_set_epi32(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);  | 
56  |  |   const SSIZE_T rem = nWidth % 8;  | 
57  |  |   const SSIZE_T width = nWidth - rem;  | 
58  |  |   for (SSIZE_T y = 0; y < nHeight; y++)  | 
59  |  |   { | 
60  |  |     const BYTE* WINPR_RESTRICT srcLine =  | 
61  |  |         &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];  | 
62  |  |     BYTE* WINPR_RESTRICT dstLine =  | 
63  |  |         &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];  | 
64  |  |  | 
65  |  |     SSIZE_T x = 0;  | 
66  |  |     for (; x < width; x += 8)  | 
67  |  |     { | 
68  |  |       const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];  | 
69  |  |       __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];  | 
70  |  |       const __m256i s0 = _mm256_loadu_si256(src);  | 
71  |  |       const __m256i s1 = _mm256_loadu_si256(dst);  | 
72  |  |       const __m256i s2 = _mm256_shuffle_epi8(s1, mask);  | 
73  |  |       __m256i d0 = _mm256_blendv_epi8(s2, s0, mask);  | 
74  |  |       _mm256_storeu_si256(dst, d0);  | 
75  |  |     }  | 
76  |  |     for (; x < nWidth; x++)  | 
77  |  |     { | 
78  |  |       const BYTE* src = &srcLine[(x + nXSrc) * srcByte];  | 
79  |  |       BYTE* dst = &dstLine[(x + nXDst) * dstByte];  | 
80  |  |       *dst++ = *src++;  | 
81  |  |       *dst++ = *src++;  | 
82  |  |       *dst++ = *src++;  | 
83  |  |     }  | 
84  |  |   }  | 
85  |  |  | 
86  |  |   return PRIMITIVES_SUCCESS;  | 
87  |  | }  | 
88  |  |  | 
89  |  | static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,  | 
90  |  |                                                       UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,  | 
91  |  |                                                       UINT32 nWidth, UINT32 nHeight,  | 
92  |  |                                                       const BYTE* WINPR_RESTRICT pSrcData,  | 
93  |  |                                                       UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,  | 
94  |  |                                                       SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,  | 
95  |  |                                                       SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)  | 
96  |  | { | 
97  |  |  | 
98  |  |   const SSIZE_T srcByte = 4;  | 
99  |  |   const SSIZE_T dstByte = 4;  | 
100  |  |  | 
101  |  |   const __m256i mask = _mm256_setr_epi8(  | 
102  |  |       (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,  | 
103  |  |       (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,  | 
104  |  |       (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,  | 
105  |  |       (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);  | 
106  |  |   const SSIZE_T rem = nWidth % 8;  | 
107  |  |   const SSIZE_T width = nWidth - rem;  | 
108  |  |   for (SSIZE_T y = 0; y < nHeight; y++)  | 
109  |  |   { | 
110  |  |     const BYTE* WINPR_RESTRICT srcLine =  | 
111  |  |         &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];  | 
112  |  |     BYTE* WINPR_RESTRICT dstLine =  | 
113  |  |         &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];  | 
114  |  |  | 
115  |  |     SSIZE_T x = 0;  | 
116  |  |     for (; x < width; x += 8)  | 
117  |  |     { | 
118  |  |       const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];  | 
119  |  |       __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];  | 
120  |  |       const __m256i s0 = _mm256_loadu_si256(src);  | 
121  |  |       const __m256i s1 = _mm256_loadu_si256(dst);  | 
122  |  |       __m256i d0 = _mm256_blendv_epi8(s1, s0, mask);  | 
123  |  |       _mm256_storeu_si256(dst, d0);  | 
124  |  |     }  | 
125  |  |  | 
126  |  |     for (; x < nWidth; x++)  | 
127  |  |     { | 
128  |  |       const BYTE* src = &srcLine[(x + nXSrc) * srcByte];  | 
129  |  |       BYTE* dst = &dstLine[(x + nXDst) * dstByte];  | 
130  |  |       *dst++ = *src++;  | 
131  |  |       *dst++ = *src++;  | 
132  |  |       *dst++ = *src++;  | 
133  |  |     }  | 
134  |  |   }  | 
135  |  |  | 
136  |  |   return PRIMITIVES_SUCCESS;  | 
137  |  | }  | 
138  |  |  | 
139  |  | static pstatus_t avx2_image_copy_no_overlap_dst_alpha(  | 
140  |  |     BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,  | 
141  |  |     UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,  | 
142  |  |     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,  | 
143  |  |     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)  | 
144  |  | { | 
145  |  |   WINPR_ASSERT(pDstData);  | 
146  |  |   WINPR_ASSERT(pSrcData);  | 
147  |  |  | 
148  |  |   switch (SrcFormat)  | 
149  |  |   { | 
150  |  |     case PIXEL_FORMAT_BGR24:  | 
151  |  |       switch (DstFormat)  | 
152  |  |       { | 
153  |  |         case PIXEL_FORMAT_BGRX32:  | 
154  |  |         case PIXEL_FORMAT_BGRA32:  | 
155  |  |           return avx2_image_copy_bgr24_bgrx32(  | 
156  |  |               pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,  | 
157  |  |               nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);  | 
158  |  |         default:  | 
159  |  |           break;  | 
160  |  |       }  | 
161  |  |       break;  | 
162  |  |     case PIXEL_FORMAT_BGRX32:  | 
163  |  |     case PIXEL_FORMAT_BGRA32:  | 
164  |  |       switch (DstFormat)  | 
165  |  |       { | 
166  |  |         case PIXEL_FORMAT_BGRX32:  | 
167  |  |         case PIXEL_FORMAT_BGRA32:  | 
168  |  |           return avx2_image_copy_bgrx32_bgrx32(  | 
169  |  |               pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep,  | 
170  |  |               nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);  | 
171  |  |         default:  | 
172  |  |           break;  | 
173  |  |       }  | 
174  |  |       break;  | 
175  |  |     default:  | 
176  |  |       break;  | 
177  |  |   }  | 
178  |  |  | 
179  |  |   return avx2_image_copy_no_overlap_convert(  | 
180  |  |       pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,  | 
181  |  |       nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);  | 
182  |  | }  | 
183  |  |  | 
184  |  | pstatus_t avx2_image_copy_no_overlap_convert(  | 
185  |  |     BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,  | 
186  |  |     UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,  | 
187  |  |     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,  | 
188  |  |     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)  | 
189  |  | { | 
190  |  |   const SSIZE_T srcByte = FreeRDPGetBytesPerPixel(SrcFormat);  | 
191  |  |   const SSIZE_T dstByte = FreeRDPGetBytesPerPixel(DstFormat);  | 
192  |  |  | 
193  |  |   const UINT32 width = nWidth - nWidth % 8;  | 
194  |  |   for (SSIZE_T y = 0; y < nHeight; y++)  | 
195  |  |   { | 
196  |  |     const BYTE* WINPR_RESTRICT srcLine =  | 
197  |  |         &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];  | 
198  |  |     BYTE* WINPR_RESTRICT dstLine =  | 
199  |  |         &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];  | 
200  |  |  | 
201  |  |     SSIZE_T x = 0;  | 
202  |  |     WINPR_PRAGMA_UNROLL_LOOP  | 
203  |  |     for (; x < width; x++)  | 
204  |  |     { | 
205  |  |       const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);  | 
206  |  |       const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);  | 
207  |  |       FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);  | 
208  |  |     }  | 
209  |  |     for (; x < nWidth; x++)  | 
210  |  |     { | 
211  |  |       const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat);  | 
212  |  |       const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette);  | 
213  |  |       FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor);  | 
214  |  |     }  | 
215  |  |   }  | 
216  |  |   return PRIMITIVES_SUCCESS;  | 
217  |  | }  | 
218  |  |  | 
219  |  | static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,  | 
220  |  |                                             UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,  | 
221  |  |                                             UINT32 nWidth, UINT32 nHeight,  | 
222  |  |                                             const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,  | 
223  |  |                                             UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,  | 
224  |  |                                             const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)  | 
225  |  | { | 
226  |  |   const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;  | 
227  |  |   SSIZE_T srcVOffset = 0;  | 
228  |  |   SSIZE_T srcVMultiplier = 1;  | 
229  |  |   SSIZE_T dstVOffset = 0;  | 
230  |  |   SSIZE_T dstVMultiplier = 1;  | 
231  |  |  | 
232  |  |   if ((nWidth == 0) || (nHeight == 0))  | 
233  |  |     return PRIMITIVES_SUCCESS;  | 
234  |  |  | 
235  |  |   if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))  | 
236  |  |     return -1;  | 
237  |  |  | 
238  |  |   if (!pDstData || !pSrcData)  | 
239  |  |     return -1;  | 
240  |  |  | 
241  |  |   if (nDstStep == 0)  | 
242  |  |     nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);  | 
243  |  |  | 
244  |  |   if (nSrcStep == 0)  | 
245  |  |     nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);  | 
246  |  |  | 
247  |  |   if (vSrcVFlip)  | 
248  |  |   { | 
249  |  |     srcVOffset = (nHeight - 1ll) * nSrcStep;  | 
250  |  |     srcVMultiplier = -1;  | 
251  |  |   }  | 
252  |  |  | 
253  |  |   if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))  | 
254  |  |     return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,  | 
255  |  |                                                 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,  | 
256  |  |                                                 nXSrc, nYSrc, palette, srcVMultiplier,  | 
257  |  |                                                 srcVOffset, dstVMultiplier, dstVOffset);  | 
258  |  |   else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))  | 
259  |  |     return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,  | 
260  |  |                                                 nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,  | 
261  |  |                                                 nXSrc, nYSrc, palette, srcVMultiplier,  | 
262  |  |                                                 srcVOffset, dstVMultiplier, dstVOffset, flags);  | 
263  |  |   else  | 
264  |  |     return avx2_image_copy_no_overlap_convert(pDstData, DstFormat, nDstStep, nXDst, nYDst,  | 
265  |  |                                               nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,  | 
266  |  |                                               nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset,  | 
267  |  |                                               dstVMultiplier, dstVOffset);  | 
268  |  | }  | 
269  |  | #endif  | 
270  |  |  | 
271  |  | /* ------------------------------------------------------------------------- */  | 
272  |  | void primitives_init_copy_avx2(primitives_t* prims)  | 
273  | 0  | { | 
274  |  | #if defined(SSE2_ENABLED)  | 
275  |  |   if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))  | 
276  |  |   { | 
277  |  |     WLog_VRB(PRIM_TAG, "AVX2 optimizations");  | 
278  |  |     prims->copy_no_overlap = avx2_image_copy_no_overlap;  | 
279  |  |   }  | 
280  |  | #else  | 
281  | 0  |   WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");  | 
282  | 0  |   WINPR_UNUSED(prims);  | 
283  | 0  | #endif  | 
284  | 0  | }  |