Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_templates.h
Line
Count
Source (jump to first uncovered line)
1
/* prim_templates.h
2
 * vi:ts=4 sw=4
3
 *
4
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
5
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
6
 * not use this file except in compliance with the License. You may obtain
7
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
8
 * Unless required by applicable law or agreed to in writing, software
9
 * distributed under the License is distributed on an "AS IS" BASIS,
10
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11
 * or implied. See the License for the specific language governing
12
 * permissions and limitations under the License.  Algorithms used by
13
 * this code may be covered by patents by HP, Microsoft, or other parties.
14
 */
15
16
#ifdef __GNUC__
17
#pragma once
18
#endif
19
20
#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
21
#define FREERDP_LIB_PRIM_TEMPLATES_H
22
23
/* These are prototypes for SSE (potentially NEON) routines that do a
24
 * simple SSE operation over an array of data.  Since so much of this
25
 * code is shared except for the operation itself, these prototypes are
26
 * used rather than duplicating code.  The naming convention depends on
27
 * the parameters:  S=Source param; C=Constant; D=Destination.
28
 * All the macros have parameters for a fallback procedure if the data
29
 * is too small and an operation "the slow way" for use at 16-byte edges.
30
 */
31
32
/* SSE3 note:  If someone needs to support an SSE2 version of these without
33
 * SSE3 support, an alternative version could be added that merely checks
34
 * that 16-byte alignment on both destination and source(s) can be
35
 * achieved, rather than use LDDQU for unaligned reads.
36
 */
37
38
/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
39
 * It easily can't do that if the value is stored in a variable.
40
 * So don't save it as an intermediate value.
41
 */
42
43
/* ----------------------------------------------------------------------------
44
 * SCD = Source, Constant, Destination
45
 */
46
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)                 \
47
  static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
48
0
  {                                                                                 \
49
0
    INT32 shifts = 0;                                                             \
50
0
    UINT32 offBeatMask;                                                           \
51
0
    const _type_* sptr = pSrc;                                                    \
52
0
    _type_* dptr = pDst;                                                          \
53
0
    int count;                                                                    \
54
0
    if (val == 0)                                                                 \
55
0
      return PRIMITIVES_SUCCESS;                                                \
56
0
    if (val >= 16)                                                                \
57
0
      return -1;                                                                \
58
0
    if (len < 16) /* pointless if too small */                                    \
59
0
    {                                                                             \
60
0
      return _fallback_(pSrc, val, pDst, len);                                  \
61
0
    }                                                                             \
62
0
    if (sizeof(_type_) == 1)                                                      \
63
0
      shifts = 1;                                                               \
64
0
    else if (sizeof(_type_) == 2)                                                 \
65
0
      shifts = 2;                                                               \
66
0
    else if (sizeof(_type_) == 4)                                                 \
67
0
      shifts = 3;                                                               \
68
0
    else if (sizeof(_type_) == 8)                                                 \
69
0
      shifts = 4;                                                               \
70
0
    offBeatMask = (1 << (shifts - 1)) - 1;                                        \
71
0
    if ((ULONG_PTR)pDst & offBeatMask)                                            \
72
0
    {                                                                             \
73
0
      /* Incrementing the pointer skips over 16-byte boundary. */               \
74
0
      return _fallback_(pSrc, val, pDst, len);                                  \
75
0
    }                                                                             \
76
0
    /* Get to the 16-byte boundary now. */                                        \
77
0
    while ((ULONG_PTR)dptr & 0x0f)                                                \
78
0
    {                                                                             \
79
0
      _slowWay_;                                                                \
80
0
      if (--len == 0)                                                           \
81
0
        return PRIMITIVES_SUCCESS;                                            \
82
0
    }                                                                             \
83
0
    /* Use 8 128-bit SSE registers. */                                            \
84
0
    count = len >> (8 - shifts);                                                  \
85
0
    len -= count << (8 - shifts);                                                 \
86
0
    if ((const ULONG_PTR)sptr & 0x0f)                                             \
87
0
    {                                                                             \
88
0
      while (count--)                                                           \
89
0
      {                                                                         \
90
0
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;               \
91
0
        xmm0 = _mm_lddqu_si128((const __m128i*)sptr);                         \
92
0
        sptr += (16 / sizeof(_type_));                                        \
93
0
        xmm1 = _mm_lddqu_si128((const __m128i*)sptr);                         \
94
0
        sptr += (16 / sizeof(_type_));                                        \
95
0
        xmm2 = _mm_lddqu_si128((const __m128i*)sptr);                         \
96
0
        sptr += (16 / sizeof(_type_));                                        \
97
0
        xmm3 = _mm_lddqu_si128((const __m128i*)sptr);                         \
98
0
        sptr += (16 / sizeof(_type_));                                        \
99
0
        xmm4 = _mm_lddqu_si128((const __m128i*)sptr);                         \
100
0
        sptr += (16 / sizeof(_type_));                                        \
101
0
        xmm5 = _mm_lddqu_si128((const __m128i*)sptr);                         \
102
0
        sptr += (16 / sizeof(_type_));                                        \
103
0
        xmm6 = _mm_lddqu_si128((const __m128i*)sptr);                         \
104
0
        sptr += (16 / sizeof(_type_));                                        \
105
0
        xmm7 = _mm_lddqu_si128((const __m128i*)sptr);                         \
106
0
        sptr += (16 / sizeof(_type_));                                        \
107
0
        xmm0 = _op_(xmm0, val);                                               \
108
0
        xmm1 = _op_(xmm1, val);                                               \
109
0
        xmm2 = _op_(xmm2, val);                                               \
110
0
        xmm3 = _op_(xmm3, val);                                               \
111
0
        xmm4 = _op_(xmm4, val);                                               \
112
0
        xmm5 = _op_(xmm5, val);                                               \
113
0
        xmm6 = _op_(xmm6, val);                                               \
114
0
        xmm7 = _op_(xmm7, val);                                               \
115
0
        _mm_store_si128((__m128i*)dptr, xmm0);                                \
116
0
        dptr += (16 / sizeof(_type_));                                        \
117
0
        _mm_store_si128((__m128i*)dptr, xmm1);                                \
118
0
        dptr += (16 / sizeof(_type_));                                        \
119
0
        _mm_store_si128((__m128i*)dptr, xmm2);                                \
120
0
        dptr += (16 / sizeof(_type_));                                        \
121
0
        _mm_store_si128((__m128i*)dptr, xmm3);                                \
122
0
        dptr += (16 / sizeof(_type_));                                        \
123
0
        _mm_store_si128((__m128i*)dptr, xmm4);                                \
124
0
        dptr += (16 / sizeof(_type_));                                        \
125
0
        _mm_store_si128((__m128i*)dptr, xmm5);                                \
126
0
        dptr += (16 / sizeof(_type_));                                        \
127
0
        _mm_store_si128((__m128i*)dptr, xmm6);                                \
128
0
        dptr += (16 / sizeof(_type_));                                        \
129
0
        _mm_store_si128((__m128i*)dptr, xmm7);                                \
130
0
        dptr += (16 / sizeof(_type_));                                        \
131
0
      }                                                                         \
132
0
    }                                                                             \
133
0
    else                                                                          \
134
0
    {                                                                             \
135
0
      while (count--)                                                           \
136
0
      {                                                                         \
137
0
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;               \
138
0
        xmm0 = _mm_load_si128((const __m128i*)sptr);                          \
139
0
        sptr += (16 / sizeof(_type_));                                        \
140
0
        xmm1 = _mm_load_si128((const __m128i*)sptr);                          \
141
0
        sptr += (16 / sizeof(_type_));                                        \
142
0
        xmm2 = _mm_load_si128((const __m128i*)sptr);                          \
143
0
        sptr += (16 / sizeof(_type_));                                        \
144
0
        xmm3 = _mm_load_si128((const __m128i*)sptr);                          \
145
0
        sptr += (16 / sizeof(_type_));                                        \
146
0
        xmm4 = _mm_load_si128((const __m128i*)sptr);                          \
147
0
        sptr += (16 / sizeof(_type_));                                        \
148
0
        xmm5 = _mm_load_si128((const __m128i*)sptr);                          \
149
0
        sptr += (16 / sizeof(_type_));                                        \
150
0
        xmm6 = _mm_load_si128((const __m128i*)sptr);                          \
151
0
        sptr += (16 / sizeof(_type_));                                        \
152
0
        xmm7 = _mm_load_si128((const __m128i*)sptr);                          \
153
0
        sptr += (16 / sizeof(_type_));                                        \
154
0
        xmm0 = _op_(xmm0, val);                                               \
155
0
        xmm1 = _op_(xmm1, val);                                               \
156
0
        xmm2 = _op_(xmm2, val);                                               \
157
0
        xmm3 = _op_(xmm3, val);                                               \
158
0
        xmm4 = _op_(xmm4, val);                                               \
159
0
        xmm5 = _op_(xmm5, val);                                               \
160
0
        xmm6 = _op_(xmm6, val);                                               \
161
0
        xmm7 = _op_(xmm7, val);                                               \
162
0
        _mm_store_si128((__m128i*)dptr, xmm0);                                \
163
0
        dptr += (16 / sizeof(_type_));                                        \
164
0
        _mm_store_si128((__m128i*)dptr, xmm1);                                \
165
0
        dptr += (16 / sizeof(_type_));                                        \
166
0
        _mm_store_si128((__m128i*)dptr, xmm2);                                \
167
0
        dptr += (16 / sizeof(_type_));                                        \
168
0
        _mm_store_si128((__m128i*)dptr, xmm3);                                \
169
0
        dptr += (16 / sizeof(_type_));                                        \
170
0
        _mm_store_si128((__m128i*)dptr, xmm4);                                \
171
0
        dptr += (16 / sizeof(_type_));                                        \
172
0
        _mm_store_si128((__m128i*)dptr, xmm5);                                \
173
0
        dptr += (16 / sizeof(_type_));                                        \
174
0
        _mm_store_si128((__m128i*)dptr, xmm6);                                \
175
0
        dptr += (16 / sizeof(_type_));                                        \
176
0
        _mm_store_si128((__m128i*)dptr, xmm7);                                \
177
0
        dptr += (16 / sizeof(_type_));                                        \
178
0
      }                                                                         \
179
0
    }                                                                             \
180
0
    /* Use a single 128-bit SSE register. */                                      \
181
0
    count = len >> (5 - shifts);                                                  \
182
0
    len -= count << (5 - shifts);                                                 \
183
0
    while (count--)                                                               \
184
0
    {                                                                             \
185
0
      __m128i xmm0 = LOAD_SI128(sptr);                                          \
186
0
      sptr += (16 / sizeof(_type_));                                            \
187
0
      xmm0 = _op_(xmm0, val);                                                   \
188
0
      _mm_store_si128((__m128i*)dptr, xmm0);                                    \
189
0
      dptr += (16 / sizeof(_type_));                                            \
190
0
    }                                                                             \
191
0
    /* Finish off the remainder. */                                               \
192
0
    while (len--)                                                                 \
193
0
    {                                                                             \
194
0
      _slowWay_;                                                                \
195
0
    }                                                                             \
196
0
    return PRIMITIVES_SUCCESS;                                                    \
197
0
  }
Unexecuted instantiation: prim_shift_opt.c:sse2_lShiftC_16s
Unexecuted instantiation: prim_shift_opt.c:sse2_rShiftC_16s
Unexecuted instantiation: prim_shift_opt.c:sse2_lShiftC_16u
Unexecuted instantiation: prim_shift_opt.c:sse2_rShiftC_16u
198
199
/* ----------------------------------------------------------------------------
200
 * SCD = Source, Constant, Destination
201
 * PRE = preload xmm0 with the constant.
202
 */
203
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)            \
204
  static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
205
0
  {                                                                                \
206
0
    int shifts = 0;                                                              \
207
0
    UINT32 offBeatMask;                                                          \
208
0
    const _type_* sptr = pSrc;                                                   \
209
0
    _type_* dptr = pDst;                                                         \
210
0
    size_t count;                                                                \
211
0
    __m128i xmm0;                                                                \
212
0
    if (len < 16) /* pointless if too small */                                   \
213
0
    {                                                                            \
214
0
      return _fallback_(pSrc, val, pDst, len);                                 \
215
0
    }                                                                            \
216
0
    if (sizeof(_type_) == 1)                                                     \
217
0
      shifts = 1;                                                              \
218
0
    else if (sizeof(_type_) == 2)                                                \
219
0
      shifts = 2;                                                              \
220
0
    else if (sizeof(_type_) == 4)                                                \
221
0
      shifts = 3;                                                              \
222
0
    else if (sizeof(_type_) == 8)                                                \
223
0
      shifts = 4;                                                              \
224
0
    offBeatMask = (1 << (shifts - 1)) - 1;                                       \
225
0
    if ((ULONG_PTR)pDst & offBeatMask)                                           \
226
0
    {                                                                            \
227
0
      /* Incrementing the pointer skips over 16-byte boundary. */              \
228
0
      return _fallback_(pSrc, val, pDst, len);                                 \
229
0
    }                                                                            \
230
0
    /* Get to the 16-byte boundary now. */                                       \
231
0
    while ((ULONG_PTR)dptr & 0x0f)                                               \
232
0
    {                                                                            \
233
0
      _slowWay_;                                                               \
234
0
      if (--len == 0)                                                          \
235
0
        return PRIMITIVES_SUCCESS;                                           \
236
0
    }                                                                            \
237
0
    /* Use 4 128-bit SSE registers. */                                           \
238
0
    count = len >> (7 - shifts);                                                 \
239
0
    len -= count << (7 - shifts);                                                \
240
0
    xmm0 = _mm_set1_epi32(val);                                                  \
241
0
    if ((const ULONG_PTR)sptr & 0x0f)                                            \
242
0
    {                                                                            \
243
0
      while (count--)                                                          \
244
0
      {                                                                        \
245
0
        __m128i xmm1, xmm2, xmm3, xmm4;                                      \
246
0
        xmm1 = _mm_lddqu_si128((const __m128i*)sptr);                        \
247
0
        sptr += (16 / sizeof(_type_));                                       \
248
0
        xmm2 = _mm_lddqu_si128((const __m128i*)sptr);                        \
249
0
        sptr += (16 / sizeof(_type_));                                       \
250
0
        xmm3 = _mm_lddqu_si128((const __m128i*)sptr);                        \
251
0
        sptr += (16 / sizeof(_type_));                                       \
252
0
        xmm4 = _mm_lddqu_si128((const __m128i*)sptr);                        \
253
0
        sptr += (16 / sizeof(_type_));                                       \
254
0
        xmm1 = _op_(xmm1, xmm0);                                             \
255
0
        xmm2 = _op_(xmm2, xmm0);                                             \
256
0
        xmm3 = _op_(xmm3, xmm0);                                             \
257
0
        xmm4 = _op_(xmm4, xmm0);                                             \
258
0
        _mm_store_si128((__m128i*)dptr, xmm1);                               \
259
0
        dptr += (16 / sizeof(_type_));                                       \
260
0
        _mm_store_si128((__m128i*)dptr, xmm2);                               \
261
0
        dptr += (16 / sizeof(_type_));                                       \
262
0
        _mm_store_si128((__m128i*)dptr, xmm3);                               \
263
0
        dptr += (16 / sizeof(_type_));                                       \
264
0
        _mm_store_si128((__m128i*)dptr, xmm4);                               \
265
0
        dptr += (16 / sizeof(_type_));                                       \
266
0
      }                                                                        \
267
0
    }                                                                            \
268
0
    else                                                                         \
269
0
    {                                                                            \
270
0
      while (count--)                                                          \
271
0
      {                                                                        \
272
0
        __m128i xmm1, xmm2, xmm3, xmm4;                                      \
273
0
        xmm1 = _mm_load_si128((const __m128i*)sptr);                         \
274
0
        sptr += (16 / sizeof(_type_));                                       \
275
0
        xmm2 = _mm_load_si128((const __m128i*)sptr);                         \
276
0
        sptr += (16 / sizeof(_type_));                                       \
277
0
        xmm3 = _mm_load_si128((const __m128i*)sptr);                         \
278
0
        sptr += (16 / sizeof(_type_));                                       \
279
0
        xmm4 = _mm_load_si128((const __m128i*)sptr);                         \
280
0
        sptr += (16 / sizeof(_type_));                                       \
281
0
        xmm1 = _op_(xmm1, xmm0);                                             \
282
0
        xmm2 = _op_(xmm2, xmm0);                                             \
283
0
        xmm3 = _op_(xmm3, xmm0);                                             \
284
0
        xmm4 = _op_(xmm4, xmm0);                                             \
285
0
        _mm_store_si128((__m128i*)dptr, xmm1);                               \
286
0
        dptr += (16 / sizeof(_type_));                                       \
287
0
        _mm_store_si128((__m128i*)dptr, xmm2);                               \
288
0
        dptr += (16 / sizeof(_type_));                                       \
289
0
        _mm_store_si128((__m128i*)dptr, xmm3);                               \
290
0
        dptr += (16 / sizeof(_type_));                                       \
291
0
        _mm_store_si128((__m128i*)dptr, xmm4);                               \
292
0
        dptr += (16 / sizeof(_type_));                                       \
293
0
      }                                                                        \
294
0
    }                                                                            \
295
0
    /* Use a single 128-bit SSE register. */                                     \
296
0
    count = len >> (5 - shifts);                                                 \
297
0
    len -= count << (5 - shifts);                                                \
298
0
    while (count--)                                                              \
299
0
    {                                                                            \
300
0
      __m128i xmm1 = LOAD_SI128(sptr);                                         \
301
0
      sptr += (16 / sizeof(_type_));                                           \
302
0
      xmm1 = _op_(xmm1, xmm0);                                                 \
303
0
      _mm_store_si128((__m128i*)dptr, xmm1);                                   \
304
0
      dptr += (16 / sizeof(_type_));                                           \
305
0
    }                                                                            \
306
0
    /* Finish off the remainder. */                                              \
307
0
    while (len--)                                                                \
308
0
    {                                                                            \
309
0
      _slowWay_;                                                               \
310
0
    }                                                                            \
311
0
    return PRIMITIVES_SUCCESS;                                                   \
312
0
  }
Unexecuted instantiation: prim_andor_opt.c:sse3_andC_32u
Unexecuted instantiation: prim_andor_opt.c:sse3_orC_32u
313
314
/* ----------------------------------------------------------------------------
315
 * SSD = Source1, Source2, Destination
316
 */
317
/* Expands to a static function _name_ that applies the two-operand SSE
 * intrinsic _op_ element-wise over two source arrays into a destination:
 *     pDst[i] = _op_(pSrc1[i], pSrc2[i])   for i in [0, len)
 *
 *   _type_     element type; must be 1, 2, 4, or 8 bytes wide
 *   _fallback_ scalar routine with the same (pSrc1, pSrc2, pDst, len)
 *              signature, used when the run is too short or pDst cannot
 *              reach 16-byte alignment by whole-element steps
 *   _op_       SSE intrinsic taking two __m128i values, returning one
 *   _slowWay_  expression that processes one element via sptr1/sptr2/dptr
 *              and yields a pstatus_t; it is assumed to advance all three
 *              pointers (otherwise the prologue/remainder loops below would
 *              not terminate) -- NOTE(review): confirm at instantiation sites
 *
 * Returns PRIMITIVES_SUCCESS, or the status from _fallback_/_slowWay_.
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_)                           \
	static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
	{                                                                                           \
		int shifts = 0; /* log2(sizeof(_type_)) + 1; see table below */                         \
		UINT32 offBeatMask;                                                                     \
		const _type_* sptr1 = pSrc1;                                                            \
		const _type_* sptr2 = pSrc2;                                                            \
		_type_* dptr = pDst;                                                                    \
		size_t count;                                                                           \
		if (len < 16) /* pointless if too small */                                              \
		{                                                                                       \
			return _fallback_(pSrc1, pSrc2, pDst, len);                                         \
		}                                                                                       \
		if (sizeof(_type_) == 1)                                                                \
			shifts = 1;                                                                         \
		else if (sizeof(_type_) == 2)                                                           \
			shifts = 2;                                                                         \
		else if (sizeof(_type_) == 4)                                                           \
			shifts = 3;                                                                         \
		else if (sizeof(_type_) == 8)                                                           \
			shifts = 4;                                                                         \
		/* Element-size alignment mask (0 for byte types). If pDst is not      */               \
		/* element-aligned, stepping one element at a time can never reach a   */               \
		/* 16-byte boundary, so punt to the scalar fallback.                   */               \
		offBeatMask = (1 << (shifts - 1)) - 1;                                                  \
		if ((ULONG_PTR)pDst & offBeatMask)                                                      \
		{                                                                                       \
			/* Incrementing the pointer skips over 16-byte boundary. */                         \
			return _fallback_(pSrc1, pSrc2, pDst, len);                                         \
		}                                                                                       \
		/* Get to the 16-byte boundary now. */                                                  \
		while ((ULONG_PTR)dptr & 0x0f)                                                          \
		{                                                                                       \
			pstatus_t status;                                                                   \
			status = _slowWay_;                                                                 \
			if (status != PRIMITIVES_SUCCESS)                                                   \
				return status;                                                                  \
			if (--len == 0)                                                                     \
				return PRIMITIVES_SUCCESS;                                                      \
		}                                                                                       \
		/* Use 4 128-bit SSE registers. */                                                      \
		/* (7 - shifts) == log2 of elements per 64-byte (4-register) chunk. */                  \
		count = len >> (7 - shifts);                                                            \
		len -= count << (7 - shifts);                                                           \
		if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f))                 \
		{                                                                                       \
			/* Unaligned loads */                                                               \
			while (count--)                                                                     \
			{                                                                                   \
				__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;                         \
				xmm0 = _mm_lddqu_si128((const __m128i*)sptr1);                                  \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm1 = _mm_lddqu_si128((const __m128i*)sptr1);                                  \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm2 = _mm_lddqu_si128((const __m128i*)sptr1);                                  \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm3 = _mm_lddqu_si128((const __m128i*)sptr1);                                  \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm4 = _mm_lddqu_si128((const __m128i*)sptr2);                                  \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm5 = _mm_lddqu_si128((const __m128i*)sptr2);                                  \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm6 = _mm_lddqu_si128((const __m128i*)sptr2);                                  \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm7 = _mm_lddqu_si128((const __m128i*)sptr2);                                  \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm0 = _op_(xmm0, xmm4);                                                        \
				xmm1 = _op_(xmm1, xmm5);                                                        \
				xmm2 = _op_(xmm2, xmm6);                                                        \
				xmm3 = _op_(xmm3, xmm7);                                                        \
				/* dptr was aligned above, so stores can be aligned. */                         \
				_mm_store_si128((__m128i*)dptr, xmm0);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
				_mm_store_si128((__m128i*)dptr, xmm1);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
				_mm_store_si128((__m128i*)dptr, xmm2);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
				_mm_store_si128((__m128i*)dptr, xmm3);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
			}                                                                                   \
		}                                                                                       \
		else                                                                                    \
		{                                                                                       \
			/* Aligned loads */                                                                 \
			while (count--)                                                                     \
			{                                                                                   \
				__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;                         \
				xmm0 = _mm_load_si128((const __m128i*)sptr1);                                   \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm1 = _mm_load_si128((const __m128i*)sptr1);                                   \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm2 = _mm_load_si128((const __m128i*)sptr1);                                   \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm3 = _mm_load_si128((const __m128i*)sptr1);                                   \
				sptr1 += (16 / sizeof(_type_));                                                 \
				xmm4 = _mm_load_si128((const __m128i*)sptr2);                                   \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm5 = _mm_load_si128((const __m128i*)sptr2);                                   \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm6 = _mm_load_si128((const __m128i*)sptr2);                                   \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm7 = _mm_load_si128((const __m128i*)sptr2);                                   \
				sptr2 += (16 / sizeof(_type_));                                                 \
				xmm0 = _op_(xmm0, xmm4);                                                        \
				xmm1 = _op_(xmm1, xmm5);                                                        \
				xmm2 = _op_(xmm2, xmm6);                                                        \
				xmm3 = _op_(xmm3, xmm7);                                                        \
				_mm_store_si128((__m128i*)dptr, xmm0);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
				_mm_store_si128((__m128i*)dptr, xmm1);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
				_mm_store_si128((__m128i*)dptr, xmm2);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
				_mm_store_si128((__m128i*)dptr, xmm3);                                          \
				dptr += (16 / sizeof(_type_));                                                  \
			}                                                                                   \
		}                                                                                       \
		/* Use a single 128-bit SSE register. */                                                \
		/* (5 - shifts) == log2 of elements per 16-byte register. */                            \
		count = len >> (5 - shifts);                                                            \
		len -= count << (5 - shifts);                                                           \
		while (count--)                                                                         \
		{                                                                                       \
			__m128i xmm0, xmm1;                                                                 \
			/* LOAD_SI128 is the file's aligned/unaligned load selector. */                     \
			xmm0 = LOAD_SI128(sptr1);                                                           \
			sptr1 += (16 / sizeof(_type_));                                                     \
			xmm1 = LOAD_SI128(sptr2);                                                           \
			sptr2 += (16 / sizeof(_type_));                                                     \
			xmm0 = _op_(xmm0, xmm1);                                                            \
			_mm_store_si128((__m128i*)dptr, xmm0);                                              \
			dptr += (16 / sizeof(_type_));                                                      \
		}                                                                                       \
		/* Finish off the remainder. */                                                         \
		while (len--)                                                                           \
		{                                                                                       \
			_slowWay_;                                                                          \
		}                                                                                       \
		return PRIMITIVES_SUCCESS;                                                              \
	}
450
451
#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */