Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_set_opt.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized routines to set a chunk of memory to a constant.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 *
15
 */
16
17
#include <freerdp/config.h>
18
19
#include <string.h>
20
#include <freerdp/types.h>
21
#include <freerdp/primitives.h>
22
#include <winpr/sysinfo.h>
23
24
#ifdef WITH_SSE2
25
#include <emmintrin.h>
26
#endif /* WITH_SSE2 */
27
#ifdef WITH_IPP
28
#include <ipps.h>
29
#endif /* WITH_IPP */
30
31
#include "prim_internal.h"
32
33
static primitives_t* generic = NULL;
34
35
/* ========================================================================= */
36
#ifdef WITH_SSE2
37
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
38
/**
 * Fill a byte buffer with a constant value using aligned SSE2 stores.
 *
 * Requests shorter than 16 bytes are forwarded to the generic set_8u
 * routine. Otherwise the destination is written bytewise until it is
 * 16-byte aligned, then in 256-byte and 16-byte chunks using aligned
 * 128-bit stores, and finally bytewise for whatever remains.
 *
 * @param val  byte value to replicate
 * @param pDst destination buffer
 * @param len  number of bytes to write
 * @return the generic routine's status for short requests,
 *         PRIMITIVES_SUCCESS otherwise
 */
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
{
  BYTE* out = pDst;
  __m128i fill;
  size_t blocks;
  size_t i;

  if (len < 16)
    return generic->set_8u(val, pDst, len);

  /* Head: single bytes until the write pointer is 16-byte aligned. */
  while (((ULONG_PTR)out & 0x0f) != 0)
  {
    *out++ = val;
    len--;

    if (len == 0)
      return PRIMITIVES_SUCCESS;
  }

  fill = _mm_set1_epi8(val);

  /* Body: 256-byte chunks, sixteen aligned 16-byte stores per pass. */
  blocks = len / 256;
  len %= 256;

  for (; blocks > 0; blocks--)
  {
    __m128i* v = (__m128i*)out;
    _mm_store_si128(v + 0, fill);
    _mm_store_si128(v + 1, fill);
    _mm_store_si128(v + 2, fill);
    _mm_store_si128(v + 3, fill);
    _mm_store_si128(v + 4, fill);
    _mm_store_si128(v + 5, fill);
    _mm_store_si128(v + 6, fill);
    _mm_store_si128(v + 7, fill);
    _mm_store_si128(v + 8, fill);
    _mm_store_si128(v + 9, fill);
    _mm_store_si128(v + 10, fill);
    _mm_store_si128(v + 11, fill);
    _mm_store_si128(v + 12, fill);
    _mm_store_si128(v + 13, fill);
    _mm_store_si128(v + 14, fill);
    _mm_store_si128(v + 15, fill);
    out += 256;
  }

  /* Body: remaining whole 16-byte chunks, one aligned store each. */
  blocks = len / 16;
  len %= 16;

  for (i = 0; i < blocks; i++)
  {
    _mm_store_si128((__m128i*)out, fill);
    out += 16;
  }

  /* Tail: fewer than 16 bytes left, written one at a time. */
  for (i = 0; i < len; i++)
    out[i] = val;

  return PRIMITIVES_SUCCESS;
}
118
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
119
#endif /* WITH_SSE2 */
120
121
/* ------------------------------------------------------------------------- */
122
#ifdef WITH_SSE2
123
#if !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS)
124
/**
 * Fill a buffer of 32-bit unsigned values with a constant using aligned
 * SSE2 stores.
 *
 * Requests of fewer than 32 elements are handled with a plain loop. A
 * destination that is not 4-byte aligned can never reach 16-byte alignment
 * by stepping whole elements, so it is forwarded to the generic set_32u
 * routine. Otherwise elements are written singly until the destination is
 * 16-byte aligned, then in chunks of 64 elements (256 bytes) and 4 elements
 * (16 bytes) using aligned 128-bit stores, and finally singly for the rest.
 *
 * @param val  32-bit value to replicate
 * @param pDst destination buffer
 * @param len  number of 32-bit elements to write
 * @return the generic routine's status for unaligned destinations,
 *         PRIMITIVES_SUCCESS otherwise
 */
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
{
  const primitives_t* fallback = primitives_get_generic();
  UINT32* out = pDst;
  __m128i fill;
  size_t blocks;
  size_t i;

  /* Short requests are not worth the alignment setup. */
  if (len < 32)
  {
    for (i = 0; i < len; i++)
      out[i] = val;

    return PRIMITIVES_SUCCESS;
  }

  /* Whole-element steps cannot fix a destination that is not 4-byte aligned. */
  if (((ULONG_PTR)out & 0x03) != 0)
    return fallback->set_32u(val, pDst, len);

  /* Head: single elements until the write pointer is 16-byte aligned. */
  while (((ULONG_PTR)out & 0x0f) != 0)
  {
    *out++ = val;
    len--;

    if (len == 0)
      return PRIMITIVES_SUCCESS;
  }

  fill = _mm_set1_epi32(val);

  /* Body: 64-element (256-byte) chunks, sixteen aligned stores per pass. */
  blocks = len / 64;
  len %= 64;

  for (; blocks > 0; blocks--)
  {
    __m128i* v = (__m128i*)out;
    _mm_store_si128(v + 0, fill);
    _mm_store_si128(v + 1, fill);
    _mm_store_si128(v + 2, fill);
    _mm_store_si128(v + 3, fill);
    _mm_store_si128(v + 4, fill);
    _mm_store_si128(v + 5, fill);
    _mm_store_si128(v + 6, fill);
    _mm_store_si128(v + 7, fill);
    _mm_store_si128(v + 8, fill);
    _mm_store_si128(v + 9, fill);
    _mm_store_si128(v + 10, fill);
    _mm_store_si128(v + 11, fill);
    _mm_store_si128(v + 12, fill);
    _mm_store_si128(v + 13, fill);
    _mm_store_si128(v + 14, fill);
    _mm_store_si128(v + 15, fill);
    out += 64;
  }

  /* Body: remaining whole 4-element (16-byte) chunks, one store each. */
  blocks = len / 4;
  len %= 4;

  for (i = 0; i < blocks; i++)
  {
    _mm_store_si128((__m128i*)out, fill);
    out += 4;
  }

  /* Tail: fewer than 4 elements left, written one at a time. */
  for (i = 0; i < len; i++)
    out[i] = val;

  return PRIMITIVES_SUCCESS;
}
214
215
/* ------------------------------------------------------------------------- */
216
/**
 * Fill a buffer of 32-bit signed values with a constant.
 *
 * Thin adapter over sse2_set_32u: the value and the destination are
 * reinterpreted as unsigned 32-bit and the unsigned routine does the work.
 *
 * @param val  32-bit signed value to replicate
 * @param pDst destination buffer
 * @param len  number of 32-bit elements to write
 * @return whatever sse2_set_32u returns
 */
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
  /* Signed-to-unsigned conversion is modulo 2^32, so the bit pattern is kept. */
  return sse2_set_32u((UINT32)val, (UINT32*)pDst, len);
}
221
#endif /* !defined(WITH_IPP) || defined(ALL_PRIMITIVES_VERSIONS) */
222
#endif /* WITH_SSE2 */
223
224
#ifdef WITH_IPP
225
/* ------------------------------------------------------------------------- */
226
/**
 * Adapter that forwards an unsigned 32-bit fill to the signed IPP routine
 * ippsSet_32s by reinterpreting the value and the destination as INT32.
 *
 * NOTE(review): len is declared INT32 here while sse2_set_32u, which is
 * installed in the same set_32u slot, takes UINT32; the mismatch is hidden
 * by the cast at the installation site - confirm this is intended.
 *
 * @param val  32-bit unsigned value to replicate
 * @param pDst destination buffer
 * @param len  element count, passed through unchanged to ippsSet_32s
 * @return whatever ippsSet_32s returns
 */
static pstatus_t ipp_wrapper_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, INT32 len)
{
  INT32* signedDst = (INT32*)pDst;
  /* Same bits, viewed as a signed value. */
  const INT32 signedVal = *((INT32*)&val);

  return ippsSet_32s(signedVal, signedDst, len);
}
232
#endif
233
234
/* ------------------------------------------------------------------------- */
235
/**
 * Install the "set" primitives into prims, preferring tuned versions.
 *
 * Stores the generic primitives table in the file-scope `generic` pointer
 * (used by the SSE2 routines as a fallback) and calls
 * primitives_init_set(prims) - defined elsewhere, presumably installing the
 * baseline implementations. It then overrides entries depending on the
 * build configuration:
 *  - WITH_IPP: set_8u, set_32s, set_32u and zero point at IPP routines.
 *  - otherwise WITH_SSE2: set_8u, set_32s and set_32u point at the SSE2
 *    routines, but only if the processor reports SSE2 support at run time.
 *
 * @param prims primitives table to populate
 */
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
{
  generic = primitives_get_generic();
  primitives_init_set(prims);

  /* Replace entries with accelerated implementations where available. */
#ifdef WITH_IPP
  prims->zero = (__zero_t)ippsZero_8u;
  prims->set_8u = (__set_8u_t)ippsSet_8u;
  prims->set_32u = (__set_32u_t)ipp_wrapper_set_32u;
  prims->set_32s = (__set_32s_t)ippsSet_32s;
#elif defined(WITH_SSE2)

  /* Compiled-in SSE2 code is only used when the CPU actually supports it. */
  if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
  {
    prims->set_32u = sse2_set_32u;
    prims->set_32s = sse2_set_32s;
    prims->set_8u = sse2_set_8u;
  }

#endif
}