/src/FreeRDP/libfreerdp/primitives/prim_templates.h
/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#ifdef __GNUC__
#pragma once
#endif

#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
#define FREERDP_LIB_PRIM_TEMPLATES_H

/* These are templates for SSE (potentially NEON) routines that apply a
 * simple SIMD operation over an array of data. Since so much of this
 * code is shared except for the operation itself, these templates are
 * used rather than duplicating code. The naming convention encodes the
 * parameter list: S=Source param; C=Constant; D=Destination.
 * Each macro takes a fallback procedure for data that is too small to
 * vectorize and a "slow way" operation for use at the 16-byte edges.
 */
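
/* For example, an "SCD" routine generated below has the shape
 *
 *     pstatus_t name(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len)
 *
 * i.e. one Source array, one Constant operand, and one Destination array,
 * while an "SSD" routine takes two Source arrays and one Destination array.
 */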

/* SSE3 note: if someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than using LDDQU for unaligned reads.
 */

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It can't easily do that if the value is stored in a variable,
 * so don't save it as an intermediate value.
 */
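
/* For example, with _type_ = INT16 the expression (16 / sizeof(_type_))
 * folds to the constant 8, so each 128-bit load or store below advances
 * its pointer by eight 16-bit elements.
 */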

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
    { \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        int count; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* shifts = log2(sizeof(_type_)) + 1, so offBeatMask = sizeof(_type_) - 1. */ \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 8 128-bit SSE registers. */ \
        count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, val); \
                xmm1 = _op_(xmm1, val); \
                xmm2 = _op_(xmm2, val); \
                xmm3 = _op_(xmm3, val); \
                xmm4 = _op_(xmm4, val); \
                xmm5 = _op_(xmm5, val); \
                xmm6 = _op_(xmm6, val); \
                xmm7 = _op_(xmm7, val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm5 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm6 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm7 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, val); \
                xmm1 = _op_(xmm1, val); \
                xmm2 = _op_(xmm2, val); \
                xmm3 = _op_(xmm3, val); \
                xmm4 = _op_(xmm4, val); \
                xmm5 = _op_(xmm5, val); \
                xmm6 = _op_(xmm6, val); \
                xmm7 = _op_(xmm7, val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, val); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
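
/* Illustrative use (a sketch, not part of this header): prim_shift_opt.c
 * instantiates this template for the 16-bit shift primitives, roughly as
 * below; the fallback name here is hypothetical (the real one comes from
 * the generic primitives table):
 *
 *     SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic_lShiftC_16s,
 *                      _mm_slli_epi16, *dptr++ = (INT16)(*sptr++ << val))
 *
 * This generates sse2_lShiftC_16s(), which shifts each INT16 left by val:
 * _mm_slli_epi16 handles the aligned 128-bit chunks, and the scalar
 * expression handles the unaligned edges and the remainder.
 */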

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        size_t count; \
        __m128i xmm0; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = _mm_set1_epi32(val); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm1, xmm2, xmm3, xmm4; \
                xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm1, xmm2, xmm3, xmm4; \
                xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            _mm_store_si128((__m128i*)dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
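
/* Illustrative use (a sketch): prim_andor_opt.c instantiates this template
 * for the 32-bit bitwise primitives, roughly as below; the fallback name is
 * hypothetical:
 *
 *     SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic_andC_32u,
 *                          _mm_and_si128, *dptr++ = *sptr++ & val)
 *
 * The constant is broadcast once into xmm0 by _mm_set1_epi32 and reused for
 * every 128-bit chunk, rather than passed to _op_ on each call as in
 * SSE3_SCD_ROUTINE above.
 */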

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            pstatus_t status; \
            status = _slowWay_; \
            if (status != PRIMITIVES_SUCCESS) \
                return status; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
        { \
            /* Unaligned loads */ \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            /* Aligned loads */ \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm1 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm2 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm3 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm4 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm5 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm6 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm7 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0, xmm1; \
            xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
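
/* Illustrative use (a sketch; the names below are hypothetical): the
 * two-source form suits primitives such as a saturating 16-bit add:
 *
 *     SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic_add_16s,
 *                      _mm_adds_epi16,
 *                      generic_add_16s(sptr1++, sptr2++, dptr++, 1))
 *
 * Note that _slowWay_ must be an expression yielding a pstatus_t in this
 * template: the alignment loop above checks its status, unlike in the SCD
 * templates, where _slowWay_ is used as a plain statement.
 */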

#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */