Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_copy_avx2.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Copy operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 */
15
16
#include <winpr/sysinfo.h>
17
18
#include <freerdp/config.h>
19
20
#include <string.h>
21
#include <freerdp/types.h>
22
#include <freerdp/primitives.h>
23
#include <freerdp/log.h>
24
25
#include "prim_internal.h"
26
#include "prim_copy.h"
27
#include "../codec/color.h"
28
29
#include <freerdp/codec/color.h>
30
31
#define TAG FREERDP_TAG("primitives.copy")
32
33
#if defined(SSE2_ENABLED)
34
#include <emmintrin.h>
35
#include <immintrin.h>
36
37
static INLINE pstatus_t avx2_image_copy_no_overlap_convert(
38
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
39
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
40
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
41
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset);
42
43
/**
 * Copy a rectangle of 3-byte (BGR24) pixels into a 4-byte-per-pixel
 * (BGRX32/BGRA32) destination, writing only the first three bytes of every
 * destination pixel. The fourth destination byte is left as it was.
 *
 * @param pDstData       destination buffer
 * @param nDstStep       destination row stride in bytes
 * @param nXDst          first destination column (in pixels)
 * @param nYDst          first destination row
 * @param nWidth         number of pixels per row to copy
 * @param nHeight        number of rows to copy
 * @param pSrcData       source buffer
 * @param nSrcStep       source row stride in bytes
 * @param nXSrc          first source column (in pixels)
 * @param nYSrc          first source row
 * @param srcVMultiplier factor applied to the source row index
 * @param srcVOffset     byte offset added to every source row address
 * @param dstVMultiplier factor applied to the destination row index
 * @param dstVOffset     byte offset added to every destination row address
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t avx2_image_copy_bgr24_bgrx32(BYTE* WINPR_RESTRICT pDstData, UINT32 nDstStep,
                                                     UINT32 nXDst, UINT32 nYDst, UINT32 nWidth,
                                                     UINT32 nHeight,
                                                     const BYTE* WINPR_RESTRICT pSrcData,
                                                     UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                     SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                     SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
  const SSIZE_T srcByte = 3;
  const SSIZE_T dstByte = 4;

  /* Blend selector: 0xFF takes the (expanded) source byte, 0x00 keeps the
   * byte already present in the destination (4th byte of each pixel). */
  const __m256i mask = _mm256_setr_epi8(
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);

  /* Shuffle control expanding 3-byte pixels to 4-byte slots.
   * _mm256_shuffle_epi8 works within each 128-bit lane independently:
   *  - low lane holds source bytes 0..15, pixels 0..3 start at byte 0
   *  - high lane holds source bytes 8..23, pixels 4..7 start at lane byte 4
   * An index with the high bit set (0x80) produces a zero byte; that slot is
   * discarded by the blend below. */
  const __m256i expand = _mm256_setr_epi8(
      0, 1, 2, (char)0x80, 3, 4, 5, (char)0x80, 6, 7, 8, (char)0x80, 9, 10, 11, (char)0x80,
      4, 5, 6, (char)0x80, 7, 8, 9, (char)0x80, 10, 11, 12, (char)0x80, 13, 14, 15, (char)0x80);

  const SSIZE_T rem = nWidth % 8;
  const SSIZE_T width = nWidth - rem;
  for (SSIZE_T y = 0; y < nHeight; y++)
  {
    const BYTE* WINPR_RESTRICT srcLine =
        &pSrcData[srcVMultiplier * (y + nYSrc) * nSrcStep + srcVOffset];
    BYTE* WINPR_RESTRICT dstLine =
        &pDstData[dstVMultiplier * (y + nYDst) * nDstStep + dstVOffset];

    SSIZE_T x = 0;
    /* 8 pixels per iteration: 24 source bytes in, 32 destination bytes out. */
    for (; x < width; x += 8)
    {
      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
      __m256i* dst = (__m256i*)&dstLine[(x + nXDst) * dstByte];

      /* Two overlapping 16-byte loads cover exactly source bytes 0..23, so
       * nothing beyond the 8 pixels of this iteration is read. */
      const __m128i lo = _mm_loadu_si128((const __m128i*)src);
      const __m128i hi = _mm_loadu_si128((const __m128i*)(src + 8));
      const __m256i s0 = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);

      const __m256i s1 = _mm256_loadu_si256(dst);
      const __m256i s2 = _mm256_shuffle_epi8(s0, expand);
      const __m256i d0 = _mm256_blendv_epi8(s1, s2, mask);
      _mm256_storeu_si256(dst, d0);
    }

    /* Scalar tail for the remaining (nWidth % 8) pixels. */
    for (; x < nWidth; x++)
    {
      const BYTE* src = &srcLine[(x + nXSrc) * srcByte];
      BYTE* dst = &dstLine[(x + nXDst) * dstByte];
      *dst++ = *src++;
      *dst++ = *src++;
      *dst++ = *src++;
    }
  }

  return PRIMITIVES_SUCCESS;
}
88
89
/**
 * Copy a rectangle between two 4-byte-per-pixel buffers, transferring only
 * the first three bytes of each pixel. The fourth byte of every destination
 * pixel keeps its previous value.
 *
 * Rows are addressed as
 *   base + vMultiplier * (row + nY) * step + vOffset
 * and columns as (col + nX) * 4 bytes within the row.
 *
 * @return always PRIMITIVES_SUCCESS
 */
static INLINE pstatus_t avx2_image_copy_bgrx32_bgrx32(BYTE* WINPR_RESTRICT pDstData,
                                                      UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
                                                      UINT32 nWidth, UINT32 nHeight,
                                                      const BYTE* WINPR_RESTRICT pSrcData,
                                                      UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
                                                      SSIZE_T srcVMultiplier, SSIZE_T srcVOffset,
                                                      SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
  const SSIZE_T srcBpp = 4;
  const SSIZE_T dstBpp = 4;

  /* Per-byte blend selector: 0xFF -> take source, 0x00 -> keep destination. */
  const __m256i keepFourthByte = _mm256_setr_epi8(
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00,
      (char)0xFF, (char)0xFF, (char)0xFF, 0x00, (char)0xFF, (char)0xFF, (char)0xFF, 0x00);

  /* Largest multiple of 8 pixels that fits in a row. */
  const SSIZE_T vectorWidth = nWidth - (nWidth % 8);

  for (SSIZE_T row = 0; row < nHeight; row++)
  {
    const BYTE* WINPR_RESTRICT srcRow =
        pSrcData + (srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset);
    BYTE* WINPR_RESTRICT dstRow =
        pDstData + (dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset);

    SSIZE_T col = 0;

    /* Vector part: 8 pixels (32 bytes) per step. */
    while (col < vectorWidth)
    {
      const __m256i* in = (const __m256i*)&srcRow[(col + nXSrc) * srcBpp];
      __m256i* out = (__m256i*)&dstRow[(col + nXDst) * dstBpp];

      const __m256i srcPixels = _mm256_loadu_si256(in);
      const __m256i dstPixels = _mm256_loadu_si256(out);
      _mm256_storeu_si256(out, _mm256_blendv_epi8(dstPixels, srcPixels, keepFourthByte));

      col += 8;
    }

    /* Scalar part: leftover pixels, three bytes each. */
    while (col < nWidth)
    {
      const BYTE* in = &srcRow[(col + nXSrc) * srcBpp];
      BYTE* out = &dstRow[(col + nXDst) * dstBpp];
      out[0] = in[0];
      out[1] = in[1];
      out[2] = in[2];

      col++;
    }
  }

  return PRIMITIVES_SUCCESS;
}
138
139
/**
 * Dispatch a copy that was requested with FREERDP_KEEP_DST_ALPHA.
 *
 * When the destination is BGRX32 or BGRA32 and the source is BGR24, BGRX32 or
 * BGRA32, one of the dedicated AVX2 routines is used. Every other format
 * combination is forwarded to avx2_image_copy_no_overlap_convert().
 *
 * NOTE(review): the fallback is the same converter that is used when
 * destination alpha does not need to be kept; whether it preserves the
 * destination alpha channel cannot be determined from this file - confirm.
 *
 * @return the status of the selected copy routine
 */
static pstatus_t avx2_image_copy_no_overlap_dst_alpha(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
  WINPR_ASSERT(pDstData);
  WINPR_ASSERT(pSrcData);

  const BOOL dstIs32Bit =
      (DstFormat == PIXEL_FORMAT_BGRX32) || (DstFormat == PIXEL_FORMAT_BGRA32);

  if (dstIs32Bit)
  {
    /* 24 bit source -> 32 bit destination */
    if (SrcFormat == PIXEL_FORMAT_BGR24)
    {
      return avx2_image_copy_bgr24_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
                                          pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
                                          srcVOffset, dstVMultiplier, dstVOffset);
    }

    /* 32 bit source -> 32 bit destination */
    if ((SrcFormat == PIXEL_FORMAT_BGRX32) || (SrcFormat == PIXEL_FORMAT_BGRA32))
    {
      return avx2_image_copy_bgrx32_bgrx32(pDstData, nDstStep, nXDst, nYDst, nWidth, nHeight,
                                           pSrcData, nSrcStep, nXSrc, nYSrc, srcVMultiplier,
                                           srcVOffset, dstVMultiplier, dstVOffset);
    }
  }

  /* No specialised routine for this format pair. */
  return avx2_image_copy_no_overlap_convert(
      pDstData, DstFormat, nDstStep, nXDst, nYDst, nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
      nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset, dstVMultiplier, dstVOffset);
}
183
184
/**
 * Copy a rectangle pixel by pixel, converting every pixel from SrcFormat to
 * DstFormat with FreeRDPReadColor_int / FreeRDPConvertColor /
 * FreeRDPWriteColor_int.
 *
 * Each row is processed in two passes with identical per-pixel work: first
 * the leading multiple-of-8 pixels (the loop carrying the unroll pragma),
 * then the remaining pixels.
 *
 * @return always PRIMITIVES_SUCCESS
 */
pstatus_t avx2_image_copy_no_overlap_convert(
    BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat, UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
    UINT32 nWidth, UINT32 nHeight, const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
    UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc, const gdiPalette* WINPR_RESTRICT palette,
    SSIZE_T srcVMultiplier, SSIZE_T srcVOffset, SSIZE_T dstVMultiplier, SSIZE_T dstVOffset)
{
  const SSIZE_T srcBpp = FreeRDPGetBytesPerPixel(SrcFormat);
  const SSIZE_T dstBpp = FreeRDPGetBytesPerPixel(DstFormat);

  /* Number of leading pixels per row handled by the unrolled loop. */
  const UINT32 unrolledWidth = nWidth - nWidth % 8;

  for (SSIZE_T row = 0; row < nHeight; row++)
  {
    const BYTE* WINPR_RESTRICT srcRow =
        &pSrcData[srcVMultiplier * (row + nYSrc) * nSrcStep + srcVOffset];
    BYTE* WINPR_RESTRICT dstRow =
        &pDstData[dstVMultiplier * (row + nYDst) * nDstStep + dstVOffset];

    SSIZE_T col = 0;
    WINPR_PRAGMA_UNROLL_LOOP
    for (; col < unrolledWidth; col++)
    {
      const BYTE* srcPixel = &srcRow[(col + nXSrc) * srcBpp];
      BYTE* dstPixel = &dstRow[(col + nXDst) * dstBpp];
      FreeRDPWriteColor_int(dstPixel, DstFormat,
                            FreeRDPConvertColor(FreeRDPReadColor_int(srcPixel, SrcFormat),
                                                SrcFormat, DstFormat, palette));
    }

    /* Leftover pixels (nWidth % 8). */
    for (; col < nWidth; col++)
    {
      const BYTE* srcPixel = &srcRow[(col + nXSrc) * srcBpp];
      BYTE* dstPixel = &dstRow[(col + nXDst) * dstBpp];
      FreeRDPWriteColor_int(dstPixel, DstFormat,
                            FreeRDPConvertColor(FreeRDPReadColor_int(srcPixel, SrcFormat),
                                                SrcFormat, DstFormat, palette));
    }
  }

  return PRIMITIVES_SUCCESS;
}
218
219
static pstatus_t avx2_image_copy_no_overlap(BYTE* WINPR_RESTRICT pDstData, DWORD DstFormat,
220
                                            UINT32 nDstStep, UINT32 nXDst, UINT32 nYDst,
221
                                            UINT32 nWidth, UINT32 nHeight,
222
                                            const BYTE* WINPR_RESTRICT pSrcData, DWORD SrcFormat,
223
                                            UINT32 nSrcStep, UINT32 nXSrc, UINT32 nYSrc,
224
                                            const gdiPalette* WINPR_RESTRICT palette, UINT32 flags)
225
{
226
  const BOOL vSrcVFlip = (flags & FREERDP_FLIP_VERTICAL) ? TRUE : FALSE;
227
  SSIZE_T srcVOffset = 0;
228
  SSIZE_T srcVMultiplier = 1;
229
  SSIZE_T dstVOffset = 0;
230
  SSIZE_T dstVMultiplier = 1;
231
232
  if ((nWidth == 0) || (nHeight == 0))
233
    return PRIMITIVES_SUCCESS;
234
235
  if ((nHeight > INT32_MAX) || (nWidth > INT32_MAX))
236
    return -1;
237
238
  if (!pDstData || !pSrcData)
239
    return -1;
240
241
  if (nDstStep == 0)
242
    nDstStep = nWidth * FreeRDPGetBytesPerPixel(DstFormat);
243
244
  if (nSrcStep == 0)
245
    nSrcStep = nWidth * FreeRDPGetBytesPerPixel(SrcFormat);
246
247
  if (vSrcVFlip)
248
  {
249
    srcVOffset = (nHeight - 1ll) * nSrcStep;
250
    srcVMultiplier = -1;
251
  }
252
253
  if (((flags & FREERDP_KEEP_DST_ALPHA) != 0) && FreeRDPColorHasAlpha(DstFormat))
254
    return avx2_image_copy_no_overlap_dst_alpha(pDstData, DstFormat, nDstStep, nXDst, nYDst,
255
                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
256
                                                nXSrc, nYSrc, palette, srcVMultiplier,
257
                                                srcVOffset, dstVMultiplier, dstVOffset);
258
  else if (FreeRDPAreColorFormatsEqualNoAlpha(SrcFormat, DstFormat))
259
    return generic_image_copy_no_overlap_memcpy(pDstData, DstFormat, nDstStep, nXDst, nYDst,
260
                                                nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
261
                                                nXSrc, nYSrc, palette, srcVMultiplier,
262
                                                srcVOffset, dstVMultiplier, dstVOffset, flags);
263
  else
264
    return avx2_image_copy_no_overlap_convert(pDstData, DstFormat, nDstStep, nXDst, nYDst,
265
                                              nWidth, nHeight, pSrcData, SrcFormat, nSrcStep,
266
                                              nXSrc, nYSrc, palette, srcVMultiplier, srcVOffset,
267
                                              dstVMultiplier, dstVOffset);
268
}
269
#endif
270
271
/* ------------------------------------------------------------------------- */
272
/**
 * Install the AVX2 copy routine into a primitives table.
 *
 * With SSE2_ENABLED defined, prims->copy_no_overlap is replaced by
 * avx2_image_copy_no_overlap if the processor reports
 * PF_AVX2_INSTRUCTIONS_AVAILABLE; otherwise the table is left unchanged.
 * Without SSE2_ENABLED the function only emits a verbose log message.
 *
 * @param prims primitives table to update
 */
void primitives_init_copy_avx2(primitives_t* prims)
{
#if defined(SSE2_ENABLED)
  /* Leave the table alone when the CPU lacks AVX2. */
  if (!IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE))
    return;

  WLog_VRB(PRIM_TAG, "AVX2 optimizations");
  prims->copy_no_overlap = avx2_image_copy_no_overlap;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#endif
}