Coverage Report

Created: 2025-07-01 06:46

/src/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Copy operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 */
15
16
#include <winpr/sysinfo.h>
17
18
#include <freerdp/config.h>
19
20
#include <string.h>
21
#include <freerdp/types.h>
22
#include <freerdp/primitives.h>
23
#include <freerdp/log.h>
24
25
#include "prim_internal.h"
26
#include "prim_copy.h"
27
#include "../codec/color.h"
28
29
#include <freerdp/codec/color.h>
30
31
#if defined(SSE_AVX_INTRINSICS_ENABLED)
32
#include <emmintrin.h>
33
#include <immintrin.h>
34
35
/* Build a 256-bit integer vector from eight unsigned 32-bit values.
 *
 * The argument order is the one used by _mm256_set_epi32: i0 lands in the
 * most significant 32-bit element and i7 in the least significant one. The
 * values are only reinterpreted as signed, their bit patterns are unchanged.
 */
static inline __m256i mm256_set_epu32(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3,
                                      uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7)
{
  /* _mm256_setr_epi32 expects the least significant element first, so the
   * arguments are handed over in reverse order. */
  const __m256i packed =
      _mm256_setr_epi32((int32_t)i7, (int32_t)i6, (int32_t)i5, (int32_t)i4, (int32_t)i3,
                        (int32_t)i2, (int32_t)i1, (int32_t)i0);
  return packed;
}
41
42
/* Copy a rectangle of 3-byte pixels into a 4-byte-per-pixel destination.
 *
 * For every pixel the three source bytes are written to the first three
 * destination bytes; the fourth destination byte is left as it was.
 *
 * pDstData, pSrcData   base pointers of the destination / source image
 * nDstStep, nSrcStep   scanline length in bytes
 * nXDst, nYDst         first destination pixel column / row
 * nXSrc, nYSrc         first source pixel column / row
 * nWidth, nHeight      size of the rectangle in pixels
 * srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset
 *                      row r of an image starts at byte offset
 *                      multiplier * (r + nY) * step + offset from its base
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     int64_t srcVMultiplier, int64_t srcVOffset,
                                                     int64_t dstVMultiplier, int64_t dstVOffset)
{
  const int64_t srcByte = 3;
  const int64_t dstByte = 4;

  /* Pixels converted by one vector iteration. */
  const int64_t pixelsPerStep = 8;

  /* One vector iteration loads a full 256-bit register from the source even
   * though 8 source pixels only occupy 24 bytes. The vector path is therefore
   * only taken while the remaining pixels of the row cover the whole load,
   * i.e. ceil(32 / 3) = 11 pixels. Otherwise the load would read past the
   * pixels being copied, which is out of bounds on the row that ends the
   * source buffer. */
  const int64_t loadBytes = (int64_t)sizeof(__m256i);
  const int64_t pixelsPerLoad = (loadBytes + srcByte - 1) / srcByte;
  const int64_t width = nWidth;

  /* Selects the fourth byte of every 32-bit pixel (taken from the destination). */
  const __m256i mask = mm256_set_epu32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
                                       0xFF000000, 0xFF000000, 0xFF000000);
  /* Per-lane shuffle expanding 3-byte pixels to 4 bytes. Source bytes 12..15
   * live in the lower lane but belong to pixels 4 and 5 of the upper lane, so
   * those positions are zeroed here and filled in separately below. */
  const __m256i smask = mm256_set_epu32(0xff171615, 0xff141312, 0xff1110ff, 0xffffffff,
                                        0xff0b0a09, 0xff080706, 0xff050403, 0xff020100);
  /* Shuffle that places source bytes 12..15 at the start of the upper lane. */
  const __m256i shelpmask = mm256_set_epu32(0xffffffff, 0xffffffff, 0xffffff1f, 0xff1e1d1c,
                                            0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
  /* Marks the four upper-lane bytes that come from the helper shuffle. */
  const __m256i bmask = _mm256_set_epi32(0x00000000, 0x00000000, 0x000000FF, 0x00FFFFFF,
                                         0x00000000, 0x00000000, 0x00000000, 0x00000000);

  for (int64_t y = 0; y < nHeight; y++)
  {
    const BYTE* WINPR_RESTRICT srcLine =
        &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
    BYTE* WINPR_RESTRICT dstLine =
        &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

    int64_t x = 0;

    /* Vector path: 8 pixels per iteration, all loads and stores unaligned. */
    for (; (width - x) >= pixelsPerLoad; x += pixelsPerStep)
    {
      const __m256i* src = (const __m256i*)&srcLine[(x + nXSrc) * srcByte];
      __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];
      const __m256i s0 = _mm256_loadu_si256(src);
      const __m256i s1 = _mm256_shuffle_epi8(s0, smask);

      /* _mm256_shuffle_epi8 can not cross 128bit lanes.
       * Broadcast the lower lane and pick the missing bytes from that copy. */
      const __m256i sx = _mm256_broadcastsi128_si256(_mm256_extractf128_si256(s0, 0));
      const __m256i sxx = _mm256_shuffle_epi8(sx, shelpmask);
      const __m256i merged = _mm256_blendv_epi8(s1, sxx, bmask);

      /* Keep the fourth byte of every destination pixel. */
      const __m256i s2 = _mm256_loadu_si256(dst);
      const __m256i d0 = _mm256_blendv_epi8(merged, s2, mask);
      _mm256_storeu_si256(dst, d0);
    }

    /* Scalar tail: copy the three colour bytes, skip the fourth one. */
    for (; x < width; x++)
    {
      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
      BYTE* dst = &dstLine[(x + nXDst) * dstByte];
      *dst++ = *src++;
      *dst++ = *src++;
      *dst++ = *src++;
    }
  }

  return PRIMITIVES_SUCCESS;
}
105
106
/* Copy a rectangle between two images that both use 4 bytes per pixel.
 *
 * The first three bytes of every pixel are taken from the source, the fourth
 * byte of the destination pixel is preserved.
 *
 * pDstData, pSrcData   base pointers of the destination / source image
 * nDstStep, nSrcStep   scanline length in bytes
 * nXDst, nYDst         first destination pixel column / row
 * nXSrc, nYSrc         first source pixel column / row
 * nWidth, nHeight      size of the rectangle in pixels
 * srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset
 *                      row r of an image starts at byte offset
 *                      multiplier * (r + nY) * step + offset from its base
 *
 * Always returns PRIMITIVES_SUCCESS.
 */
static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                      UINT32 nWidth, UINT32 nHeight,
                                                      const BYTE* WINPR_RESTRICT pSrcData,
                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                      int64_t srcVMultiplier, int64_t srcVOffset,
                                                      int64_t dstVMultiplier, int64_t dstVOffset)
{
  const int64_t bytesPerPixel = 4;
  const int64_t pixelsPerVector = 8;

  /* Byte pattern FF FF FF 00 for every pixel: blendv picks the source where
   * the high bit of a mask byte is set, i.e. for the first three bytes. */
  const __m256i colorBytes = _mm256_set1_epi32(0x00FFFFFF);

  const int64_t total = nWidth;
  /* Largest multiple of 8 pixels that fits into the row. */
  const int64_t vectorEnd = total - (total % pixelsPerVector);

  for (int64_t row = 0; row < nHeight; ++row)
  {
    const BYTE* srcRow = &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
    BYTE* dstRow = &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

    /* Position both cursors on the first pixel of the rectangle. */
    const BYTE* srcPixels = srcRow + (int64_t)nXSrc * bytesPerPixel;
    BYTE* dstPixels = dstRow + (int64_t)nXDst * bytesPerPixel;

    int64_t col = 0;

    /* Vector path: 8 pixels (32 bytes) per iteration, unaligned accesses. */
    while (col < vectorEnd)
    {
      const BYTE* in = srcPixels + col * bytesPerPixel;
      BYTE* out = dstPixels + col * bytesPerPixel;

      const __m256i incoming = _mm256_loadu_si256((const __m256i*)in);
      const __m256i existing = _mm256_loadu_si256((const __m256i*)out);
      const __m256i combined = _mm256_blendv_epi8(existing, incoming, colorBytes);
      _mm256_storeu_si256((__m256i*)out, combined);

      col += pixelsPerVector;
    }

    /* Scalar tail: copy three bytes per pixel, leave the fourth untouched. */
    for (; col < total; ++col)
    {
      const BYTE* in = srcPixels + col * bytesPerPixel;
      BYTE* out = dstPixels + col * bytesPerPixel;
      out[0] = in[0];
      out[1] = in[1];
      out[2] = in[2];
    }
  }

  return PRIMITIVES_SUCCESS;
}
155
156
/* Dispatch a copy that must keep the destination's fourth byte to one of the
 * AVX2 kernels in this file, or to the generic implementation.
 *
 * Handled with AVX2:
 *   - BGR24 source            -> BGRX32 / BGRA32 destination
 *   - BGRX32 / BGRA32 source  -> BGRX32 / BGRA32 destination
 *   - RGBX32 / RGBA32 source  -> RGBX32 / RGBA32 destination
 * Every other combination is forwarded to the generic copy_no_overlap, which
 * receives the original palette and flags but none of the V* parameters.
 *
 * Returns the status of whichever implementation performed the copy.
 */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    UINT32 flags, int64_t srcVMultiplier, int64_t srcVOffset, int64_t dstVMultiplier,
    int64_t dstVOffset)
{
  WINPR_ASSERT(pDstData);
  WINPR_ASSERT(pSrcData);

  const BOOL srcIsBgr32 =
      (SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32);
  const BOOL srcIsRgb32 =
      (SrcFormat == PIXEL_FORMAT_RGBX32) || (SrcFormat == PIXEL_FORMAT_RGBA32);
  const BOOL dstIsBgr32 =
      (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);
  const BOOL dstIsRgb32 =
      (DstFormat == PIXEL_FORMAT_RGBX32) || (DstFormat == PIXEL_FORMAT_RGBA32);

  /* 24 bit source expanded into a 32 bit destination. */
  if ((SrcFormat == PIXEL_FORMAT_BGR24) && dstIsBgr32)
  {
    return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
                                        pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
                                        srcVOffset, dstVMultiplier, dstVOffset);
  }

  /* 32 bit to 32 bit with matching channel order: the same kernel serves
   * both the BGR and the RGB family. */
  if ((srcIsBgr32 && dstIsBgr32) || (srcIsRgb32 && dstIsRgb32))
  {
    return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
                                         pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
                                         srcVOffset, dstVMultiplier, dstVOffset);
  }

  /* No AVX2 kernel for this format pair. */
  primitives_t* generic = primitives_get_generic();
  return generic->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
                                  pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
}
214
215
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
216
                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
217
                                            UINT32 nWidth, UINT32 nHeight,
218
                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
219
                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
220
                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
221
0
{
222
0
  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
223
0
  int64_t srcVOffset = 0;
224
0
  int64_t srcVMultiplier = 1;
225
0
  int64_t dstVOffset = 0;
226
0
  int64_t dstVMultiplier = 1;
227
228
0
  if ((nWidth == 0) || (nHeight == 0))
229
0
    return PRIMITIVES_SUCCESS;
230
231
0
  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
232
0
    return -1;
233
234
0
  if (!pDstData || !pSrcData)
235
0
    return -1;
236
237
0
  if (nDstStep == 0)
238
0
    nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
239
240
0
  if (nSrcStep == 0)
241
0
    nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
242
243
0
  if (vSrcVFlip)
244
0
  {
245
0
    srcVOffset = (nHeight - 1ll) * nSrcStep;
246
0
    srcVMultiplier = -1;
247
0
  }
248
249
0
  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
250
0
    return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
251
0
                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
252
0
                                                nXSrc, nYSrc, palette, flags, srcVMultiplier,
253
0
                                                srcVOffset, dstVMultiplier, dstVOffset);
254
0
  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
255
0
    return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
256
0
                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
257
0
                                                nXSrc, nYSrc, palette, srcVMultiplier,
258
0
                                                srcVOffset, dstVMultiplier, dstVOffset, flags);
259
0
  else
260
0
  {
261
0
    primitives_t* gen = primitives_get_generic();
262
0
    return gen->copy_no_overlap(pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight,
263
0
                                pSrcData, SrcFormat, nSrcStep, nXSrc, nYSrc, palette, flags);
264
0
  }
265
0
}
266
#endif
267
268
/* ------------------------------------------------------------------------- */
269
/* Install the AVX2 copy_no_overlap implementation into prims.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is not defined the function only logs a
 * verbose message and leaves prims untouched.
 */
void primitives_init_copy_avx2_int(primitives_t* WINPR_RESTRICT prims)
{
#if !defined(SSE_AVX_INTRINSICS_ENABLED)
  /* Built without AVX2 support: nothing to register. */
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or WITH_AVX2 or AVX2 intrinsics not available");
  WINPR_UNUSED(prims);
#else
  WLog_VRB(PRIM_TAG, "AVX2 optimizations");
  prims->copy_no_overlap = avx2_image_copy_no_overlap;
#endif
}