Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_set_sse2.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Optimized routines to set a chunk of memory to a constant.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 *
15
 */
16
17
#include <freerdp/config.h>
18
19
#include <string.h>
20
#include <freerdp/types.h>
21
#include <freerdp/primitives.h>
22
#include <winpr/sysinfo.h>
23
24
#include "prim_internal.h"
25
#include "prim_set.h"
26
27
/* ========================================================================= */
28
#if defined(SSE2_ENABLED)
29
#include <emmintrin.h>
30
31
static primitives_t* generic = NULL;
32
33
/**
 * Fill a byte buffer with a constant, using aligned SSE2 stores for the bulk.
 *
 * Requests shorter than 16 bytes are delegated to the generic set_8u
 * implementation through the file-level `generic` table. Longer requests are
 * handled in four stages: single bytes up to a 16-byte boundary, 256-byte
 * blocks, 16-byte blocks, then the remaining single bytes.
 *
 * @param val   byte value to replicate
 * @param pDst  destination buffer
 * @param len   number of bytes to write
 *
 * @return PRIMITIVES_SUCCESS on the SSE2 path; otherwise the status returned
 *         by the generic set_8u.
 */
static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len)
{
  BYTE* out = pDst;
  __m128i* vout = NULL;
  __m128i fill;
  size_t blocks = 0;

  /* Not worth the alignment work below 16 bytes; this also guarantees the
   * head loop (at most 15 iterations) cannot consume the whole request
   * before reaching a 16-byte boundary. */
  if (len < 16)
    return generic->set_8u(val, pDst, len);

  /* Head: write single bytes until the cursor is 16-byte aligned. */
  while (((ULONG_PTR)out & 0x0f) != 0)
  {
    *out = val;
    out++;
    len--;

    if (len == 0)
      return PRIMITIVES_SUCCESS;
  }

  /* From here on the cursor is aligned, so aligned vector stores are used. */
  fill = _mm_set1_epi8(val);
  vout = (__m128i*)out;

  /* 256-byte blocks: sixteen 16-byte stores per iteration. */
  blocks = len >> 8;
  len -= blocks << 8;

  for (; blocks > 0; blocks--)
  {
    _mm_store_si128(vout + 0, fill);
    _mm_store_si128(vout + 1, fill);
    _mm_store_si128(vout + 2, fill);
    _mm_store_si128(vout + 3, fill);
    _mm_store_si128(vout + 4, fill);
    _mm_store_si128(vout + 5, fill);
    _mm_store_si128(vout + 6, fill);
    _mm_store_si128(vout + 7, fill);
    _mm_store_si128(vout + 8, fill);
    _mm_store_si128(vout + 9, fill);
    _mm_store_si128(vout + 10, fill);
    _mm_store_si128(vout + 11, fill);
    _mm_store_si128(vout + 12, fill);
    _mm_store_si128(vout + 13, fill);
    _mm_store_si128(vout + 14, fill);
    _mm_store_si128(vout + 15, fill);
    vout += 16;
  }

  /* 16-byte blocks: one store each. */
  blocks = len >> 4;
  len -= blocks << 4;

  for (; blocks > 0; blocks--)
  {
    _mm_store_si128(vout, fill);
    vout++;
  }

  /* Tail: fewer than 16 bytes remain; finish them one at a time. */
  out = (BYTE*)vout;

  for (; len > 0; len--)
    *out++ = val;

  return PRIMITIVES_SUCCESS;
}
114
115
/* ------------------------------------------------------------------------- */
116
/**
 * Fill a buffer of 32-bit words with a constant, using aligned SSE2 stores
 * for the bulk.
 *
 * Note on units: `len` counts 32-bit words, not bytes. One 128-bit store
 * covers 4 words, and one unrolled iteration (16 stores) covers 64 words,
 * i.e. 256 bytes.
 *
 * Requests shorter than 32 words are written with a plain scalar loop. If the
 * destination is not 4-byte aligned, no amount of whole-word stepping can
 * reach a 16-byte boundary, so the request is handed to the generic set_32u.
 *
 * @param val   32-bit value to replicate
 * @param pDst  destination buffer
 * @param len   number of 32-bit words to write
 *
 * @return PRIMITIVES_SUCCESS on the local paths; otherwise the status
 *         returned by the generic set_32u.
 */
static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len)
{
  UINT32* dptr = pDst;
  __m128i* vdst = NULL;
  __m128i xmm0;
  size_t count = 0;

  /* If really short, just do it here. */
  if (len < 32)
  {
    while (len--)
      *dptr++ = val;

    return PRIMITIVES_SUCCESS;
  }

  /* Assure we can reach 16-byte alignment. Only this fallback needs the
   * generic table, so it is looked up here rather than on every call. */
  if (((ULONG_PTR)dptr & 0x03) != 0)
    return primitives_get_generic()->set_32u(val, pDst, len);

  /* Seek 16-byte alignment (at most 3 words; len >= 32, so the early return
   * is only a safety net). */
  while ((ULONG_PTR)dptr & 0x0f)
  {
    *dptr++ = val;

    if (--len == 0)
      return PRIMITIVES_SUCCESS;
  }

  xmm0 = _mm_set1_epi32(val);
  vdst = (__m128i*)dptr;

  /* 256-byte chunks: 64 words each, written as sixteen 16-byte stores. */
  count = len >> 6;
  len -= count << 6;

  while (count--)
  {
    _mm_store_si128(vdst + 0, xmm0);
    _mm_store_si128(vdst + 1, xmm0);
    _mm_store_si128(vdst + 2, xmm0);
    _mm_store_si128(vdst + 3, xmm0);
    _mm_store_si128(vdst + 4, xmm0);
    _mm_store_si128(vdst + 5, xmm0);
    _mm_store_si128(vdst + 6, xmm0);
    _mm_store_si128(vdst + 7, xmm0);
    _mm_store_si128(vdst + 8, xmm0);
    _mm_store_si128(vdst + 9, xmm0);
    _mm_store_si128(vdst + 10, xmm0);
    _mm_store_si128(vdst + 11, xmm0);
    _mm_store_si128(vdst + 12, xmm0);
    _mm_store_si128(vdst + 13, xmm0);
    _mm_store_si128(vdst + 14, xmm0);
    _mm_store_si128(vdst + 15, xmm0);
    vdst += 16;
  }

  /* 16-byte chunks: 4 words each, one store per chunk. */
  count = len >> 2;
  len -= count << 2;

  while (count--)
  {
    _mm_store_si128(vdst, xmm0);
    vdst++;
  }

  /* Leftover words (fewer than 4), written one at a time. */
  dptr = (UINT32*)vdst;

  while (len--)
    *dptr++ = val;

  return PRIMITIVES_SUCCESS;
}
206
207
/* ------------------------------------------------------------------------- */
208
/**
 * Signed counterpart of sse2_set_32u.
 *
 * The fill value is converted to its unsigned 32-bit equivalent and the
 * request is forwarded unchanged to sse2_set_32u, so all size and alignment
 * handling lives there.
 *
 * @param val   signed 32-bit value to replicate
 * @param pDst  destination buffer
 * @param len   number of 32-bit words to write
 *
 * @return the status returned by sse2_set_32u.
 */
static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len)
{
  /* Value conversion keeps the same 32-bit pattern on two's-complement
   * targets, so no pointer reinterpretation is needed. */
  const UINT32 pattern = (UINT32)val;
  UINT32* words = (UINT32*)pDst;

  return sse2_set_32u(pattern, words, len);
}
213
#endif
214
215
/* ------------------------------------------------------------------------- */
216
/**
 * Install the SSE2 "set" primitives into a primitives table.
 *
 * When SSE2_ENABLED is defined, the table is first passed to
 * primitives_init_set() (presumably installing the baseline set_*
 * implementations - confirm against prim_set.h), and the set_8u, set_32u and
 * set_32s entries are then replaced with the SSE2 versions if the processor
 * reports SSE2 support at runtime. Without SSE2_ENABLED the function only
 * logs and leaves the table untouched.
 *
 * @param prims  primitives table to update
 */
void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
  /* sse2_set_8u dispatches short requests through this pointer, so it must
   * be set before any SSE2 entry point is installed. */
  generic = primitives_get_generic();
  primitives_init_set(prims);

  /* No runtime SSE2 support: keep whatever primitives_init_set() installed. */
  if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE))
    return;

  /* Pick tuned versions. */
  WLog_VRB(PRIM_TAG, "SSE2 optimizations");
  prims->set_8u = sse2_set_8u;
  prims->set_32u = sse2_set_32u;
  prims->set_32s = sse2_set_32s;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#endif
}