/src/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Copy operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | */ |
15 | | |
16 | | #include <winpr/sysinfo.h> |
17 | | |
18 | | #include <freerdp/config.h> |
19 | | |
20 | | #include <string.h> |
21 | | #include <freerdp/types.h> |
22 | | #include <freerdp/primitives.h> |
23 | | #include <freerdp/log.h> |
24 | | |
25 | | #include "prim_internal.h" |
26 | | #include "prim_copy.h" |
27 | | #include "../codec/color.h" |
28 | | |
29 | | #include <freerdp/codec/color.h> |
30 | | |
31 | | #define TAG FREERDP_TAG("primitives.copy") |
32 | | |
33 | | #if defined(SSE2_ENABLED) |
34 | | #include <emmintrin.h> |
35 | | #include <immintrin.h> |
36 | | |
37 | | static INLINE pstatus_t avx2_image_copy_no_overlap_convert( |
38 | | BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
39 | | UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat, |
40 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette, |
41 | | SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset); |
42 | | |
43 | | static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep, |
44 | | UINT32 nXDst, UINT32 nYDst, UINT32 nWidth, |
45 | | UINT32 nHeight, |
46 | | const BYTE* WINPR_RESTRICT pSrcData, |
47 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, |
48 | | SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, |
49 | | SSIZE_T dstVMultiplier, SSIZE_T dstVOffset) |
50 | | { |
51 | | |
52 | | const SSIZE_T srcByte = 3; |
53 | | const SSIZE_T dstByte = 4; |
54 | | |
55 | | const __m256i mask = _mm256_set_epi32(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF); |
56 | | const SSIZE_T rem = nWidth % 8; |
57 | | const SSIZE_T width = nWidth - rem; |
58 | | for (SSIZE_T y = 0; y < nHeight; y++) |
59 | | { |
60 | | const BYTE* WINPR_RESTRICT srcLine = |
61 | | &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]; |
62 | | BYTE* WINPR_RESTRICT dstLine = |
63 | | &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset]; |
64 | | |
65 | | SSIZE_T x = 0; |
66 | | for (; x < width; x += 8) |
67 | | { |
68 | | const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte]; |
69 | | __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte]; |
70 | | const __m256i s0 = _mm256_loadu_si256(src); |
71 | | const __m256i s1 = _mm256_loadu_si256(dst); |
72 | | const __m256i s2 = _mm256_shuffle_epi8(s1, mask); |
73 | | __m256i d0 = _mm256_blendv_epi8(s2, s0, mask); |
74 | | _mm256_storeu_si256(dst, d0); |
75 | | } |
76 | | for (; x < nWidth; x++) |
77 | | { |
78 | | const BYTE* src = &srcLine[(x + nXSrc) * srcByte]; |
79 | | BYTE* dst = &dstLine[(x + nXDst) * dstByte]; |
80 | | *dst++ = *src++; |
81 | | *dst++ = *src++; |
82 | | *dst++ = *src++; |
83 | | } |
84 | | } |
85 | | |
86 | | return PRIMITIVES_SUCCESS; |
87 | | } |
88 | | |
89 | | static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, |
90 | | UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
91 | | UINT32 nWidth, UINT32 nHeight, |
92 | | const BYTE* WINPR_RESTRICT pSrcData, |
93 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, |
94 | | SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, |
95 | | SSIZE_T dstVMultiplier, SSIZE_T dstVOffset) |
96 | | { |
97 | | |
98 | | const SSIZE_T srcByte = 4; |
99 | | const SSIZE_T dstByte = 4; |
100 | | |
101 | | const __m256i mask = _mm256_setr_epi8( |
102 | | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00, |
103 | | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00, |
104 | | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00, |
105 | | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00); |
106 | | const SSIZE_T rem = nWidth % 8; |
107 | | const SSIZE_T width = nWidth - rem; |
108 | | for (SSIZE_T y = 0; y < nHeight; y++) |
109 | | { |
110 | | const BYTE* WINPR_RESTRICT srcLine = |
111 | | &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]; |
112 | | BYTE* WINPR_RESTRICT dstLine = |
113 | | &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset]; |
114 | | |
115 | | SSIZE_T x = 0; |
116 | | for (; x < width; x += 8) |
117 | | { |
118 | | const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte]; |
119 | | __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte]; |
120 | | const __m256i s0 = _mm256_loadu_si256(src); |
121 | | const __m256i s1 = _mm256_loadu_si256(dst); |
122 | | __m256i d0 = _mm256_blendv_epi8(s1, s0, mask); |
123 | | _mm256_storeu_si256(dst, d0); |
124 | | } |
125 | | |
126 | | for (; x < nWidth; x++) |
127 | | { |
128 | | const BYTE* src = &srcLine[(x + nXSrc) * srcByte]; |
129 | | BYTE* dst = &dstLine[(x + nXDst) * dstByte]; |
130 | | *dst++ = *src++; |
131 | | *dst++ = *src++; |
132 | | *dst++ = *src++; |
133 | | } |
134 | | } |
135 | | |
136 | | return PRIMITIVES_SUCCESS; |
137 | | } |
138 | | |
139 | | static pstatus_t avx2_image_copy_no_overlap_dst_alpha( |
140 | | BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
141 | | UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat, |
142 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette, |
143 | | SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset) |
144 | | { |
145 | | WINPR_ASSERT(pDstData); |
146 | | WINPR_ASSERT(pSrcData); |
147 | | |
148 | | switch (SrcFormat) |
149 | | { |
150 | | case PIXEL_FORMAT_BGR24: |
151 | | switch (DstFormat) |
152 | | { |
153 | | case PIXEL_FORMAT_BGRX32: |
154 | | case PIXEL_FORMAT_BGRA32: |
155 | | return avx2_image_copy_bgr24_bgrx32( |
156 | | pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep, |
157 | | nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset); |
158 | | default: |
159 | | break; |
160 | | } |
161 | | break; |
162 | | case PIXEL_FORMAT_BGRX32: |
163 | | case PIXEL_FORMAT_BGRA32: |
164 | | switch (DstFormat) |
165 | | { |
166 | | case PIXEL_FORMAT_BGRX32: |
167 | | case PIXEL_FORMAT_BGRA32: |
168 | | return avx2_image_copy_bgrx32_bgrx32( |
169 | | pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep, |
170 | | nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset); |
171 | | default: |
172 | | break; |
173 | | } |
174 | | break; |
175 | | default: |
176 | | break; |
177 | | } |
178 | | |
179 | | return avx2_image_copy_no_overlap_convert( |
180 | | pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep, |
181 | | nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset); |
182 | | } |
183 | | |
184 | | pstatus_t avx2_image_copy_no_overlap_convert( |
185 | | BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
186 | | UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat, |
187 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette, |
188 | | SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset) |
189 | | { |
190 | | const SSIZE_T srcByte = FreeRDPGetBytesPerPixel(SrcFormat); |
191 | | const SSIZE_T dstByte = FreeRDPGetBytesPerPixel(DstFormat); |
192 | | |
193 | | const UINT32 width = nWidth - nWidth % 8; |
194 | | for (SSIZE_T y = 0; y < nHeight; y++) |
195 | | { |
196 | | const BYTE* WINPR_RESTRICT srcLine = |
197 | | &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]; |
198 | | BYTE* WINPR_RESTRICT dstLine = |
199 | | &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset]; |
200 | | |
201 | | SSIZE_T x = 0; |
202 | | WINPR_PRAGMA_UNROLL_LOOP |
203 | | for (; x < width; x++) |
204 | | { |
205 | | const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat); |
206 | | const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette); |
207 | | FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor); |
208 | | } |
209 | | for (; x < nWidth; x++) |
210 | | { |
211 | | const UINT32 color = FreeRDPReadColor_int(&srcLine[(x + nXSrc) * srcByte], SrcFormat); |
212 | | const UINT32 dstColor = FreeRDPConvertColor(color, SrcFormat, DstFormat, palette); |
213 | | FreeRDPWriteColor_int(&dstLine[(x + nXDst) * dstByte], DstFormat, dstColor); |
214 | | } |
215 | | } |
216 | | return PRIMITIVES_SUCCESS; |
217 | | } |
218 | | |
219 | | static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, |
220 | | UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
221 | | UINT32 nWidth, UINT32 nHeight, |
222 | | const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat, |
223 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, |
224 | | const gdiPalette* WINPR_RESTRICT palette, UINT32 flags) |
225 | | { |
226 | | const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE; |
227 | | SSIZE_T srcVOffset = 0; |
228 | | SSIZE_T srcVMultiplier = 1; |
229 | | SSIZE_T dstVOffset = 0; |
230 | | SSIZE_T dstVMultiplier = 1; |
231 | | |
232 | | if ((nWidth == 0) || (nHeight == 0)) |
233 | | return PRIMITIVES_SUCCESS; |
234 | | |
235 | | if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX)) |
236 | | return -1; |
237 | | |
238 | | if (!pDstData || !pSrcData) |
239 | | return -1; |
240 | | |
241 | | if (nDstStep == 0) |
242 | | nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat); |
243 | | |
244 | | if (nSrcStep == 0) |
245 | | nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat); |
246 | | |
247 | | if (vSrcVFlip) |
248 | | { |
249 | | srcVOffset = (nHeight - 1ll) * nSrcStep; |
250 | | srcVMultiplier = -1; |
251 | | } |
252 | | |
253 | | if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat)) |
254 | | return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst, |
255 | | nWidth, nHeight, pSrcData, SrcFormat, nSrcStep, |
256 | | nXSrc, nYSrc, palette, srcVMultiplier, |
257 | | srcVOffset, dstVMultiplier, dstVOffset); |
258 | | else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat)) |
259 | | return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst, |
260 | | nWidth, nHeight, pSrcData, SrcFormat, nSrcStep, |
261 | | nXSrc, nYSrc, palette, srcVMultiplier, |
262 | | srcVOffset, dstVMultiplier, dstVOffset, flags); |
263 | | else |
264 | | return avx2_image_copy_no_overlap_convert(pDstData, DstFormat, nDstStep, nXDst, nYDst, |
265 | | nWidth, nHeight, pSrcData, SrcFormat, nSrcStep, |
266 | | nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, |
267 | | dstVMultiplier, dstVOffset); |
268 | | } |
269 | | #endif |
270 | | |
271 | | /* ------------------------------------------------------------------------- */ |
272 | | void primitives_init_copy_avx2(primitives_t* prims) |
273 | 0 | { |
274 | | #if defined(SSE2_ENABLED) |
275 | | if (IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE)) |
276 | | { |
277 | | WLog_VRB(PRIM_TAG, "AVX2 optimizations"); |
278 | | prims->copy_no_overlap = avx2_image_copy_no_overlap; |
279 | | } |
280 | | #else |
281 | 0 | WLog_VRB(PRIM_TAG, "undefined WITH_SSE2"); |
282 | 0 | WINPR_UNUSED(prims); |
283 | 0 | #endif |
284 | 0 | } |