/src/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Copy operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | */ |
15 | | |
16 | | #include <winpr/sysinfo.h> |
17 | | |
18 | | #include <freerdp/config.h> |
19 | | |
20 | | #include <string.h> |
21 | | #include <freerdp/types.h> |
22 | | #include <freerdp/primitives.h> |
23 | | #include <freerdp/log.h> |
24 | | |
25 | | #include "prim_internal.h" |
26 | | #include "prim_copy.h" |
27 | | #include "../codec/color.h" |
28 | | |
29 | | #include <freerdp/codec/color.h> |
30 | | |
31 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
32 | | #include <emmintrin.h> |
33 | | #include <immintrin.h> |
34 | | |
/* Build a __m256i from eight unsigned 32-bit values.
 * Argument order mirrors _mm256_set_epi32: i0 lands in the highest
 * 32-bit element, i7 in the lowest. Implemented via the reversed
 * "setr" form, which is bit-for-bit equivalent. */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
	return _mm256_setr_epi32((int32_t)i7, (int32_t)i6, (int32_t)i5, (int32_t)i4, (int32_t)i3,
	                         (int32_t)i2, (int32_t)i1, (int32_t)i0);
}
41 | | |
/* Copy a nWidth x nHeight rectangle of packed 24-bit BGR pixels into a
 * 32-bit BGRX/BGRA destination, leaving the destination's 4th (X/A) byte
 * of every pixel untouched.
 *
 * pDstData/pSrcData point at the full surfaces; (nXDst,nYDst)/(nXSrc,nYSrc)
 * select the rectangle origin. The V multiplier/offset pairs let the caller
 * walk the source (or destination) rows bottom-up for vertical flips.
 * Always returns PRIMITIVES_SUCCESS. */
static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
	const int64_t srcByte = 3; /* bytes per source pixel (BGR24) */
	const int64_t dstByte = 4; /* bytes per destination pixel (BGRX32) */

	/* Per-byte blend selector: 0xFF on every 4th byte (the X/A position of
	 * each 32-bit pixel), so the final blend keeps the destination byte there. */
	const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
	                                     0xFF000000, 0xFF000000, 0xFF000000);
	/* Shuffle control expanding 3-byte pixels to 4-byte slots within each
	 * 128-bit lane; 0xff control bytes produce zero bytes. The slots marked
	 * 0xff around bytes 12..15 of the low lane are filled by shelpmask below,
	 * because pshufb cannot pull bytes across the 128-bit lane boundary. */
	const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
	                                      0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
	/* Companion shuffle applied to the low 128 bits broadcast into both
	 * lanes: recovers source bytes 0x1c..0x1f-equivalent positions that
	 * smask could not reach across the lane split. */
	const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
	                                          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
	const UINT32 rem = nWidth % 8;          /* pixels left for the scalar tail */
	const int64_t width = nWidth - rem;     /* pixels handled 8-at-a-time */

	for (int64_t y = 0; y < nHeight; y++)
	{
		const BYTE* WINPR_RESTRICT srcLine =
		    &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
		BYTE* WINPR_RESTRICT dstLine =
		    &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

		int64_t x = 0;

		/* Ensure alignment requirements can be met */
		for (; x < width; x += 8)
		{
			/* NOTE(review): this 32-byte load consumes only 24 payload bytes
			 * (8 pixels x 3); the last iteration reads up to 8 bytes past the
			 * final source pixel — assumes the stride/surface provides that
			 * slack, TODO confirm callers guarantee it. */
			const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
			__m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
			const __m256i s0 = _mm256_loadu_si256(src);
			__m256i s1 = _mm256_shuffle_epi8(s0, smask);

			/* _mm256_shuffle_epi8 can not cross 128bit lanes.
			 * manually copy these bytes with extract/insert */
			const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
			const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
			/* bmask marks exactly the bytes smask left as holes; take them
			 * from the lane-crossing helper result instead. */
			const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
			                                       0x00000000, 0x00000000, 0x00000000, 0x00000000);
			const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);

			/* Blend with the existing destination: where mask is 0xFF (the
			 * X/A byte of each pixel) keep the destination value. */
			const __m256i s2 = _mm256_loadu_si256(dst);
			__m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
			_mm256_storeu_si256(dst, d0);
		}

		/* Scalar tail: copy B, G, R and skip the destination's 4th byte. */
		for (; x < nWidth; x++)
		{
			const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
			BYTE* dst = &dstLine[(x + nXDst) * dstByte];
			*dst++ = *src++;
			*dst++ = *src++;
			*dst++ = *src++;
		}
	}

	return PRIMITIVES_SUCCESS;
}
105 | | |
106 | | static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData, |
107 | | UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
108 | | UINT32 nWidth, UINT32 nHeight, |
109 | | const BYTE* WINPR_RESTRICT pSrcData, |
110 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, |
111 | | int64_t srcVMultiplier, int64_t srcVOffset, |
112 | | int64_t dstVMultiplier, int64_t dstVOffset) |
113 | 0 | { |
114 | |
|
115 | 0 | const int64_t srcByte = 4; |
116 | 0 | const int64_t dstByte = 4; |
117 | |
|
118 | 0 | const __m256i mask = _mm256_setr_epi8( |
119 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00, |
120 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00, |
121 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00, |
122 | 0 | (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00); |
123 | 0 | const UINT32 rem = nWidth % 8; |
124 | 0 | const int64_t width = nWidth - rem; |
125 | 0 | for (int64_t y = 0; y < nHeight; y++) |
126 | 0 | { |
127 | 0 | const BYTE* WINPR_RESTRICT srcLine = |
128 | 0 | &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset]; |
129 | 0 | BYTE* WINPR_RESTRICT dstLine = |
130 | 0 | &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset]; |
131 | |
|
132 | 0 | int64_t x = 0; |
133 | 0 | for (; x < width; x += 8) |
134 | 0 | { |
135 | 0 | const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte]; |
136 | 0 | __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte]; |
137 | 0 | const __m256i s0 = _mm256_loadu_si256(src); |
138 | 0 | const __m256i s1 = _mm256_loadu_si256(dst); |
139 | 0 | __m256i d0 = _mm256_blendv_epi8(s1, s0, mask); |
140 | 0 | _mm256_storeu_si256(dst, d0); |
141 | 0 | } |
142 | |
|
143 | 0 | for (; x < nWidth; x++) |
144 | 0 | { |
145 | 0 | const BYTE* src = &srcLine[(x + nXSrc) * srcByte]; |
146 | 0 | BYTE* dst = &dstLine[(x + nXDst) * dstByte]; |
147 | 0 | *dst++ = *src++; |
148 | 0 | *dst++ = *src++; |
149 | 0 | *dst++ = *src++; |
150 | 0 | } |
151 | 0 | } |
152 | |
|
153 | 0 | return PRIMITIVES_SUCCESS; |
154 | 0 | } |
155 | | |
156 | | static pstatus_t avx2_image_copy_no_overlap_dst_alpha( |
157 | | BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
158 | | UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat, |
159 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette, |
160 | | UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier, |
161 | | int64_t dstVOffset) |
162 | 0 | { |
163 | 0 | WINPR_ASSERT(pDstData); |
164 | 0 | WINPR_ASSERT(pSrcData); |
165 | | |
166 | 0 | switch (SrcFormat) |
167 | 0 | { |
168 | 0 | case PIXEL_FORMAT_BGR24: |
169 | 0 | switch (DstFormat) |
170 | 0 | { |
171 | 0 | case PIXEL_FORMAT_BGRX32: |
172 | 0 | case PIXEL_FORMAT_BGRA32: |
173 | 0 | return avx2_image_copy_bgr24_bgrx32( |
174 | 0 | pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep, |
175 | 0 | nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset); |
176 | 0 | default: |
177 | 0 | break; |
178 | 0 | } |
179 | 0 | break; |
180 | 0 | case PIXEL_FORMAT_BGRX32: |
181 | 0 | case PIXEL_FORMAT_BGRA32: |
182 | 0 | switch (DstFormat) |
183 | 0 | { |
184 | 0 | case PIXEL_FORMAT_BGRX32: |
185 | 0 | case PIXEL_FORMAT_BGRA32: |
186 | 0 | return avx2_image_copy_bgrx32_bgrx32( |
187 | 0 | pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep, |
188 | 0 | nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset); |
189 | 0 | default: |
190 | 0 | break; |
191 | 0 | } |
192 | 0 | break; |
193 | 0 | case PIXEL_FORMAT_RGBX32: |
194 | 0 | case PIXEL_FORMAT_RGBA32: |
195 | 0 | switch (DstFormat) |
196 | 0 | { |
197 | 0 | case PIXEL_FORMAT_RGBX32: |
198 | 0 | case PIXEL_FORMAT_RGBA32: |
199 | 0 | return avx2_image_copy_bgrx32_bgrx32( |
200 | 0 | pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, nSrcStep, |
201 | 0 | nXSrc, nYSrc, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset); |
202 | 0 | default: |
203 | 0 | break; |
204 | 0 | } |
205 | 0 | break; |
206 | 0 | default: |
207 | 0 | break; |
208 | 0 | } |
209 | | |
210 | 0 | primitives_t* gen = primitives_get_generic(); |
211 | 0 | return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, |
212 | 0 | pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags); |
213 | 0 | } |
214 | | |
215 | | static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, |
216 | | UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst, |
217 | | UINT32 nWidth, UINT32 nHeight, |
218 | | const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat, |
219 | | UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, |
220 | | const gdiPalette* WINPR_RESTRICT palette, UINT32 flags) |
221 | 0 | { |
222 | 0 | const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE; |
223 | 0 | int64_t srcVOffset = 0; |
224 | 0 | int64_t srcVMultiplier = 1; |
225 | 0 | int64_t dstVOffset = 0; |
226 | 0 | int64_t dstVMultiplier = 1; |
227 | |
|
228 | 0 | if ((nWidth == 0) || (nHeight == 0)) |
229 | 0 | return PRIMITIVES_SUCCESS; |
230 | | |
231 | 0 | if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX)) |
232 | 0 | return -1; |
233 | | |
234 | 0 | if (!pDstData || !pSrcData) |
235 | 0 | return -1; |
236 | | |
237 | 0 | if (nDstStep == 0) |
238 | 0 | nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat); |
239 | |
|
240 | 0 | if (nSrcStep == 0) |
241 | 0 | nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat); |
242 | |
|
243 | 0 | if (vSrcVFlip) |
244 | 0 | { |
245 | 0 | srcVOffset = (nHeight - 1ll) * nSrcStep; |
246 | 0 | srcVMultiplier = -1; |
247 | 0 | } |
248 | |
|
249 | 0 | if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat)) |
250 | 0 | return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst, |
251 | 0 | nWidth, nHeight, pSrcData, SrcFormat, nSrcStep, |
252 | 0 | nXSrc, nYSrc, palette, flags, srcVMultiplier, |
253 | 0 | srcVOffset, dstVMultiplier, dstVOffset); |
254 | 0 | else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat)) |
255 | 0 | return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst, |
256 | 0 | nWidth, nHeight, pSrcData, SrcFormat, nSrcStep, |
257 | 0 | nXSrc, nYSrc, palette, srcVMultiplier, |
258 | 0 | srcVOffset, dstVMultiplier, dstVOffset, flags); |
259 | 0 | else |
260 | 0 | { |
261 | 0 | primitives_t* gen = primitives_get_generic(); |
262 | 0 | return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, |
263 | 0 | pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags); |
264 | 0 | } |
265 | 0 | } |
266 | | #endif |
267 | | |
268 | | /* ------------------------------------------------------------------------- */ |
269 | | void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims) |
270 | 0 | { |
271 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
272 | 0 | WLog_VRB(PRIM_TAG, "AVX2 optimizations"); |
273 | 0 | prims->copy_no_overlap = avx2_image_copy_no_overlap; |
274 | | #else |
275 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available"); |
276 | | WINPR_UNUSED(prims); |
277 | | #endif |
278 | 0 | } |