Coverage Report

Created: 2026-02-26 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/FreeRDP/libfreerdp/primitives/sse/prim_templates.h
Line
Count
Source
1
/* prim_templates.h
2
 * vi:ts=4 sw=4
3
 *
4
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
5
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6
 * not use this file except in compliance with the License. You may obtain
7
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
8
 * Unless required by applicable law or agreed to in writing, software
9
 * distributed under the License is distributed on an "AS IS" BASIS,
10
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11
 * or implied. See the License for the specific language governing
12
 * permissions and limitations under the License.  Algorithms used by
13
 * this code may be covered by patents by HP, Microsoft, or other parties.
14
 */
15
16
#pragma once
17
18
#include "prim_avxsse.h"
19
20
/* These are prototypes for SSE (potentially NEON) routines that do a
21
 * simple SSE operation over an array of data.  Since so much of this
22
 * code is shared except for the operation itself, these prototypes are
23
 * used rather than duplicating code.  The naming convention depends on
24
 * the parameters:  S=Source param; C=Constant; D=Destination.
25
 * All the macros have parameters for a fallback procedure if the data
26
 * is too small and an operation "the slow way" for the trailing elements
 * that do not fill a complete 16-byte register.
27
 */
28
29
/* SSE3 note:  If someone needs to support an SSE2 version of these without
30
 * SSE3 support, an alternative version could be added that merely checks
31
 * that 16-byte alignment on both destination and source(s) can be
32
 * achieved, rather than use LDDQU for unaligned reads.
33
 */
34
35
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
36
 * It can't easily do that if the value is stored in a variable.
37
 * So don't save it as an intermediate value.
38
 */
39
40
/* ----------------------------------------------------------------------------
41
 * SCD = Source, Constant, Destination
42
 */
43
/* SSE3_SCD_ROUTINE expands to a static function
 *
 *   pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 ulen)
 *
 * that applies _op_(register, (_op_type_)val) to the elements of pSrc and
 * stores the results in pDst.
 *
 * Macro parameters:
 *   _name_      name of the generated function.
 *   _type_      element type.  sizeof(_type_) of 1, 2, 4 or 8 sets the local
 *               'shifts' to 1, 2, 3 or 4 (i.e. log2(size) + 1); for any other
 *               size 'shifts' stays 0.
 *   _fallback_  not referenced anywhere in this macro body.
 *   _op_        invoked as _op_(__m128i, (_op_type_)val).
 *   _op_type_   type that val is cast to before it is handed to _op_.
 *   _slowWay_   statement run once per leftover element.  It is expanded
 *               inside the generated function, so the locals sptr, dptr, val
 *               and len are in scope for it.  The remainder loop itself does
 *               not advance sptr/dptr, so presumably _slowWay_ does -
 *               NOTE(review): confirm at the instantiation sites.
 *
 * Generated function:
 *   - returns PRIMITIVES_SUCCESS immediately, without writing pDst, when
 *     val == 0;
 *   - returns -1 when val >= 16;
 *   - otherwise processes the data in three phases and returns
 *     PRIMITIVES_SUCCESS:
 *       1. len >> (8 - shifts) iterations, each loading, transforming and
 *          storing eight 128-bit registers (128 bytes);
 *       2. len >> (5 - shifts) iterations on what is left, one register
 *          (16 bytes) each;
 *       3. one _slowWay_ per remaining element.
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
44
  WINPR_ATTR_NODISCARD                                                         \
45
  static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val,       \
46
                          _type_* WINPR_RESTRICT pDst, UINT32 ulen)            \
47
0
  {                                                                            \
48
0
    size_t len = ulen;                                                       \
49
0
    INT32 shifts = 0;                                                        \
50
0
    const _type_* sptr = pSrc;                                               \
51
0
    _type_* dptr = pDst;                                                     \
52
0
    if (val == 0)                                                            \
53
0
      return PRIMITIVES_SUCCESS;                                           \
54
0
    if (val >= 16)                                                           \
55
0
      return -1;                                                           \
56
0
    if (sizeof(_type_) == 1)                                                 \
57
0
      shifts = 1;                                                          \
58
0
    else if (sizeof(_type_) == 2)                                            \
59
0
      shifts = 2;                                                          \
60
0
    else if (sizeof(_type_) == 4)                                            \
61
0
      shifts = 3;                                                          \
62
0
    else if (sizeof(_type_) == 8)                                            \
63
0
      shifts = 4;                                                          \
64
0
    /* Use 8 128-bit SSE registers. */                                       \
65
0
    size_t count = len >> (8 - shifts);                                      \
66
0
    len -= count << (8 - shifts);                                            \
67
0
                                                                                 \
68
0
    while (count--)                                                          \
69
0
    {                                                                        \
70
0
      __m128i xmm0 = LOAD_SI128(sptr);                                     \
71
0
      sptr += (16 / sizeof(_type_));                                       \
72
0
      __m128i xmm1 = LOAD_SI128(sptr);                                     \
73
0
      sptr += (16 / sizeof(_type_));                                       \
74
0
      __m128i xmm2 = LOAD_SI128(sptr);                                     \
75
0
      sptr += (16 / sizeof(_type_));                                       \
76
0
      __m128i xmm3 = LOAD_SI128(sptr);                                     \
77
0
      sptr += (16 / sizeof(_type_));                                       \
78
0
      __m128i xmm4 = LOAD_SI128(sptr);                                     \
79
0
      sptr += (16 / sizeof(_type_));                                       \
80
0
      __m128i xmm5 = LOAD_SI128(sptr);                                     \
81
0
      sptr += (16 / sizeof(_type_));                                       \
82
0
      __m128i xmm6 = LOAD_SI128(sptr);                                     \
83
0
      sptr += (16 / sizeof(_type_));                                       \
84
0
      __m128i xmm7 = LOAD_SI128(sptr);                                     \
85
0
      sptr += (16 / sizeof(_type_));                                       \
86
0
      xmm0 = _op_(xmm0, (_op_type_)val);                                   \
87
0
      xmm1 = _op_(xmm1, (_op_type_)val);                                   \
88
0
      xmm2 = _op_(xmm2, (_op_type_)val);                                   \
89
0
      xmm3 = _op_(xmm3, (_op_type_)val);                                   \
90
0
      xmm4 = _op_(xmm4, (_op_type_)val);                                   \
91
0
      xmm5 = _op_(xmm5, (_op_type_)val);                                   \
92
0
      xmm6 = _op_(xmm6, (_op_type_)val);                                   \
93
0
      xmm7 = _op_(xmm7, (_op_type_)val);                                   \
94
0
      STORE_SI128(dptr, xmm0);                                             \
95
0
      dptr += (16 / sizeof(_type_));                                       \
96
0
      STORE_SI128(dptr, xmm1);                                             \
97
0
      dptr += (16 / sizeof(_type_));                                       \
98
0
      STORE_SI128(dptr, xmm2);                                             \
99
0
      dptr += (16 / sizeof(_type_));                                       \
100
0
      STORE_SI128(dptr, xmm3);                                             \
101
0
      dptr += (16 / sizeof(_type_));                                       \
102
0
      STORE_SI128(dptr, xmm4);                                             \
103
0
      dptr += (16 / sizeof(_type_));                                       \
104
0
      STORE_SI128(dptr, xmm5);                                             \
105
0
      dptr += (16 / sizeof(_type_));                                       \
106
0
      STORE_SI128(dptr, xmm6);                                             \
107
0
      dptr += (16 / sizeof(_type_));                                       \
108
0
      STORE_SI128(dptr, xmm7);                                             \
109
0
      dptr += (16 / sizeof(_type_));                                       \
110
0
    }                                                                        \
111
0
                                                                                 \
112
0
    /* Use a single 128-bit SSE register. */                                 \
113
0
    count = len >> (5 - shifts);                                             \
114
0
    len -= count << (5 - shifts);                                            \
115
0
    while (count--)                                                          \
116
0
    {                                                                        \
117
0
      __m128i xmm0 = LOAD_SI128(sptr);                                     \
118
0
      sptr += (16 / sizeof(_type_));                                       \
119
0
      xmm0 = _op_(xmm0, (_op_type_)val);                                   \
120
0
      STORE_SI128(dptr, xmm0);                                             \
121
0
      dptr += (16 / sizeof(_type_));                                       \
122
0
    }                                                                        \
123
0
    /* Finish off the remainder. */                                          \
124
0
    while (len--)                                                            \
125
0
    {                                                                        \
126
0
      _slowWay_;                                                           \
127
0
    }                                                                        \
128
0
    return PRIMITIVES_SUCCESS;                                               \
129
0
  }
Unexecuted instantiation: prim_shift_sse3.c:sse2_lShiftC_16s
Unexecuted instantiation: prim_shift_sse3.c:sse2_rShiftC_16s
Unexecuted instantiation: prim_shift_sse3.c:sse2_lShiftC_16u
Unexecuted instantiation: prim_shift_sse3.c:sse2_rShiftC_16u
130
131
/* ----------------------------------------------------------------------------
132
 * SCD = Source, Constant, Destination
133
 * PRE = preload xmm0 with the constant.
134
 */
135
/* SSE3_SCD_PRE_ROUTINE expands to a static function
 *
 *   pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen)
 *
 * that combines the elements of pSrc with the constant val through _op_ and
 * stores the results in pDst.  Unlike SSE3_SCD_ROUTINE, the constant is
 * placed in a register (xmm0) once, before the loops, and _op_ receives that
 * register as its second argument.
 *
 * Macro parameters:
 *   _name_      name of the generated function.
 *   _type_      element type.  sizeof(_type_) of 1, 2, 4 or 8 sets the local
 *               'shifts' to 1, 2, 3 or 4; for any other size it stays 0.
 *   _fallback_  not referenced anywhere in this macro body.
 *   _op_        invoked as _op_(data_register, xmm0).
 *   _slowWay_   statement run once per leftover element.  It is expanded
 *               inside the generated function, so the locals sptr, dptr, val,
 *               len and the loop index x are in scope for it.  The remainder
 *               loop does not advance sptr/dptr itself - NOTE(review):
 *               presumably _slowWay_ does; confirm at the instantiation sites.
 *
 * Generated function:
 *   - converts ilen to size_t via WINPR_ASSERTING_INT_CAST;
 *   - fills xmm0 with mm_set1_epu32(val).  NOTE(review): this helper is used
 *     regardless of sizeof(_type_); its name suggests a 32-bit broadcast,
 *     which would only match 4-byte element types - confirm before
 *     instantiating with another element size;
 *   - runs len >> (7 - shifts) iterations over four 128-bit registers
 *     (64 bytes) each, then len >> (5 - shifts) iterations over one register
 *     (16 bytes) each, then one _slowWay_ per remaining element;
 *   - always returns PRIMITIVES_SUCCESS.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)  \
136
  WINPR_ATTR_NODISCARD                                                   \
137
  static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
138
                          _type_* WINPR_RESTRICT pDst, INT32 ilen)       \
139
0
  {                                                                      \
140
0
    size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen);               \
141
0
    int shifts = 0;                                                    \
142
0
    const _type_* sptr = pSrc;                                         \
143
0
    _type_* dptr = pDst;                                               \
144
0
    __m128i xmm0;                                                      \
145
0
    if (sizeof(_type_) == 1)                                           \
146
0
      shifts = 1;                                                    \
147
0
    else if (sizeof(_type_) == 2)                                      \
148
0
      shifts = 2;                                                    \
149
0
    else if (sizeof(_type_) == 4)                                      \
150
0
      shifts = 3;                                                    \
151
0
    else if (sizeof(_type_) == 8)                                      \
152
0
      shifts = 4;                                                    \
153
0
    /* Use 4 128-bit SSE registers. */                                 \
154
0
    size_t count = len >> (7 - shifts);                                \
155
0
    len -= count << (7 - shifts);                                      \
156
0
    xmm0 = mm_set1_epu32(val);                                         \
157
0
    for (size_t x = 0; x < count; x++)                                 \
158
0
    {                                                                  \
159
0
      __m128i xmm1 = LOAD_SI128(sptr);                               \
160
0
      sptr += (16 / sizeof(_type_));                                 \
161
0
      __m128i xmm2 = LOAD_SI128(sptr);                               \
162
0
      sptr += (16 / sizeof(_type_));                                 \
163
0
      __m128i xmm3 = LOAD_SI128(sptr);                               \
164
0
      sptr += (16 / sizeof(_type_));                                 \
165
0
      __m128i xmm4 = LOAD_SI128(sptr);                               \
166
0
      sptr += (16 / sizeof(_type_));                                 \
167
0
      xmm1 = _op_(xmm1, xmm0);                                       \
168
0
      xmm2 = _op_(xmm2, xmm0);                                       \
169
0
      xmm3 = _op_(xmm3, xmm0);                                       \
170
0
      xmm4 = _op_(xmm4, xmm0);                                       \
171
0
      STORE_SI128(dptr, xmm1);                                       \
172
0
      dptr += (16 / sizeof(_type_));                                 \
173
0
      STORE_SI128(dptr, xmm2);                                       \
174
0
      dptr += (16 / sizeof(_type_));                                 \
175
0
      STORE_SI128(dptr, xmm3);                                       \
176
0
      dptr += (16 / sizeof(_type_));                                 \
177
0
      STORE_SI128(dptr, xmm4);                                       \
178
0
      dptr += (16 / sizeof(_type_));                                 \
179
0
    }                                                                  \
180
0
    /* Use a single 128-bit SSE register. */                           \
181
0
    count = len >> (5 - shifts);                                       \
182
0
    len -= count << (5 - shifts);                                      \
183
0
    for (size_t x = 0; x < count; x++)                                 \
184
0
    {                                                                  \
185
0
      __m128i xmm1 = LOAD_SI128(sptr);                               \
186
0
      sptr += (16 / sizeof(_type_));                                 \
187
0
      xmm1 = _op_(xmm1, xmm0);                                       \
188
0
      STORE_SI128(dptr, xmm1);                                       \
189
0
      dptr += (16 / sizeof(_type_));                                 \
190
0
    }                                                                  \
191
0
    /* Finish off the remainder. */                                    \
192
0
    for (size_t x = 0; x < len; x++)                                   \
193
0
    {                                                                  \
194
0
      _slowWay_;                                                     \
195
0
    }                                                                  \
196
0
    return PRIMITIVES_SUCCESS;                                         \
197
0
  }
Unexecuted instantiation: prim_andor_sse3.c:sse3_andC_32u
Unexecuted instantiation: prim_andor_sse3.c:sse3_orC_32u
198
199
/* ----------------------------------------------------------------------------
200
 * SSD = Source1, Source2, Destination
201
 */
202
/* SSE3_SSD_ROUTINE expands to a static function
 *
 *   pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2,
 *                    _type_* pDst, UINT32 ulen)
 *
 * that combines corresponding elements of pSrc1 and pSrc2 through _op_ and
 * stores the results in pDst.
 *
 * Macro parameters:
 *   _name_      name of the generated function.
 *   _type_      element type.  sizeof(_type_) of 1, 2, 4 or 8 sets the local
 *               'shifts' to 1, 2, 3 or 4; for any other size it stays 0.
 *   _fallback_  not referenced anywhere in this macro body.
 *   _op_        invoked as _op_(register_from_src1, register_from_src2).
 *   _slowWay_   expression evaluated once per leftover element; its value
 *               initialises a pstatus_t, and any value other than
 *               PRIMITIVES_SUCCESS is returned to the caller immediately.
 *               It is expanded inside the generated function, so the locals
 *               sptr1, sptr2, dptr and len are in scope for it.  The
 *               remainder loop does not advance the pointers itself -
 *               NOTE(review): presumably _slowWay_ does; confirm at the
 *               instantiation sites.
 *
 * Generated function:
 *   - runs len >> (7 - shifts) iterations, each loading four 128-bit
 *     registers from each source (64 bytes per source), applying _op_
 *     pairwise and storing four registers;
 *   - then len >> (5 - shifts) iterations over one register (16 bytes) per
 *     source;
 *   - then one _slowWay_ per remaining element;
 *   - returns PRIMITIVES_SUCCESS unless a _slowWay_ evaluation failed.
 *
 * NOTE(review): the "Aligned loads" remark inside the body is not backed by
 * any alignment check in this macro; the pointers are handed to LOAD_SI128 /
 * STORE_SI128 as received.  Confirm the alignment requirements of those
 * helpers in prim_avxsse.h.
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)                        \
203
  WINPR_ATTR_NODISCARD                                                                     \
204
  static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1,                              \
205
                          const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
206
                          UINT32 ulen)                                                     \
207
0
  {                                                                                        \
208
0
    size_t len = ulen;                                                                   \
209
0
    int shifts = 0;                                                                      \
210
0
    const _type_* sptr1 = pSrc1;                                                         \
211
0
    const _type_* sptr2 = pSrc2;                                                         \
212
0
    _type_* dptr = pDst;                                                                 \
213
0
    size_t count;                                                                        \
214
0
    if (sizeof(_type_) == 1)                                                             \
215
0
      shifts = 1;                                                                      \
216
0
    else if (sizeof(_type_) == 2)                                                        \
217
0
      shifts = 2;                                                                      \
218
0
    else if (sizeof(_type_) == 4)                                                        \
219
0
      shifts = 3;                                                                      \
220
0
    else if (sizeof(_type_) == 8)                                                        \
221
0
      shifts = 4;                                                                      \
222
0
    /* Use 4 128-bit SSE registers. */                                                   \
223
0
    count = len >> (7 - shifts);                                                         \
224
0
    len -= count << (7 - shifts);                                                        \
225
0
    /* Aligned loads */                                                                  \
226
0
    while (count--)                                                                      \
227
0
    {                                                                                    \
228
0
      __m128i xmm0 = LOAD_SI128(sptr1);                                                \
229
0
      sptr1 += (16 / sizeof(_type_));                                                  \
230
0
      __m128i xmm1 = LOAD_SI128(sptr1);                                                \
231
0
      sptr1 += (16 / sizeof(_type_));                                                  \
232
0
      __m128i xmm2 = LOAD_SI128(sptr1);                                                \
233
0
      sptr1 += (16 / sizeof(_type_));                                                  \
234
0
      __m128i xmm3 = LOAD_SI128(sptr1);                                                \
235
0
      sptr1 += (16 / sizeof(_type_));                                                  \
236
0
      __m128i xmm4 = LOAD_SI128(sptr2);                                                \
237
0
      sptr2 += (16 / sizeof(_type_));                                                  \
238
0
      __m128i xmm5 = LOAD_SI128(sptr2);                                                \
239
0
      sptr2 += (16 / sizeof(_type_));                                                  \
240
0
      __m128i xmm6 = LOAD_SI128(sptr2);                                                \
241
0
      sptr2 += (16 / sizeof(_type_));                                                  \
242
0
      __m128i xmm7 = LOAD_SI128(sptr2);                                                \
243
0
      sptr2 += (16 / sizeof(_type_));                                                  \
244
0
      xmm0 = _op_(xmm0, xmm4);                                                         \
245
0
      xmm1 = _op_(xmm1, xmm5);                                                         \
246
0
      xmm2 = _op_(xmm2, xmm6);                                                         \
247
0
      xmm3 = _op_(xmm3, xmm7);                                                         \
248
0
      STORE_SI128(dptr, xmm0);                                                         \
249
0
      dptr += (16 / sizeof(_type_));                                                   \
250
0
      STORE_SI128(dptr, xmm1);                                                         \
251
0
      dptr += (16 / sizeof(_type_));                                                   \
252
0
      STORE_SI128(dptr, xmm2);                                                         \
253
0
      dptr += (16 / sizeof(_type_));                                                   \
254
0
      STORE_SI128(dptr, xmm3);                                                         \
255
0
      dptr += (16 / sizeof(_type_));                                                   \
256
0
    }                                                                                    \
257
0
    /* Use a single 128-bit SSE register. */                                             \
258
0
    count = len >> (5 - shifts);                                                         \
259
0
    len -= count << (5 - shifts);                                                        \
260
0
    while (count--)                                                                      \
261
0
    {                                                                                    \
262
0
      __m128i xmm0 = LOAD_SI128(sptr1);                                                \
263
0
      sptr1 += (16 / sizeof(_type_));                                                  \
264
0
      __m128i xmm1 = LOAD_SI128(sptr2);                                                \
265
0
      sptr2 += (16 / sizeof(_type_));                                                  \
266
0
      xmm0 = _op_(xmm0, xmm1);                                                         \
267
0
      STORE_SI128(dptr, xmm0);                                                         \
268
0
      dptr += (16 / sizeof(_type_));                                                   \
269
0
    }                                                                                    \
270
0
    /* Finish off the remainder. */                                                      \
271
0
    while (len--)                                                                        \
272
0
    {                                                                                    \
273
0
      const pstatus_t rc = _slowWay_;                                                  \
274
0
      if (rc != PRIMITIVES_SUCCESS)                                                    \
275
0
        return rc;                                                                   \
276
0
    }                                                                                    \
277
0
    return PRIMITIVES_SUCCESS;                                                           \
278
0
  }