/src/FreeRDP/libfreerdp/primitives/sse/prim_templates.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* prim_templates.h |
2 | | * vi:ts=4 sw=4 |
3 | | * |
4 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
6 | | * not use this file except in compliance with the License. You may obtain |
7 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
8 | | * Unless required by applicable law or agreed to in writing, software |
9 | | * distributed under the License is distributed on an "AS IS" BASIS, |
10 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
11 | | * or implied. See the License for the specific language governing |
12 | | * permissions and limitations under the License. Algorithms used by |
13 | | * this code may be covered by patents by HP, Microsoft, or other parties. |
14 | | */ |
15 | | |
16 | | #pragma once |
17 | | |
18 | | #include "prim_avxsse.h" |
19 | | |
20 | | /* These are prototypes for SSE (potentially NEON) routines that do a |
21 | | * simple SSE operation over an array of data. Since so much of this |
22 | | * code is shared except for the operation itself, these prototypes are |
23 | | * used rather than duplicating code. The naming convention depends on |
24 | | * the parameters: S=Source param; C=Constant; D=Destination. |
25 | | * All the macros have parameters for a fallback procedure if the data |
26 | | * is too small and an operation "the slow way" for use at 16-byte edges. |
27 | | */ |
28 | | |
29 | | /* SSE3 note: If someone needs to support an SSE2 version of these without |
30 | | * SSE3 support, an alternative version could be added that merely checks |
31 | | * that 16-byte alignment on both destination and source(s) can be |
32 | | * achieved, rather than use LDDQU for unaligned reads. |
33 | | */ |
34 | | |
35 | | /* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant. |
36 | | * It can't easily do that if the value is stored in a variable. |
37 | | * So don't save it as an intermediate value. |
38 | | */ |
39 | | |
40 | | /* ---------------------------------------------------------------------------- |
41 | | * SCD = Source, Constant, Destination: defines a static function _name_ that applies _op_(x, (_op_type_)val) to pSrc in 128-bit steps, stores the results to pDst, and runs _slowWay_ once per leftover element. Returns -1 when val >= 16, otherwise PRIMITIVES_SUCCESS. _fallback_ is not referenced by the expansion. |
42 | | */ |
43 | | #define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \ |
44 | | static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \ |
45 | | _type_* WINPR_RESTRICT pDst, UINT32 ulen) \ |
46 | 0 | { \ |
47 | 0 | size_t len = ulen; \ |
48 | 0 | INT32 shifts = 0; /* becomes 1 + log2(sizeof(_type_)) for element sizes 1, 2, 4, 8 */ \ |
49 | 0 | const _type_* sptr = pSrc; \ |
50 | 0 | _type_* dptr = pDst; \ |
51 | 0 | if (val == 0) /* early out: pDst is left untouched */ \ |
52 | 0 | return PRIMITIVES_SUCCESS; \ |
53 | 0 | if (val >= 16) /* NOTE(review): presumably rejects counts too large for 16-bit elements - confirm */ \ |
54 | 0 | return -1; \ |
55 | 0 | if (sizeof(_type_) == 1) \ |
56 | 0 | shifts = 1; \ |
57 | 0 | else if (sizeof(_type_) == 2) \ |
58 | 0 | shifts = 2; \ |
59 | 0 | else if (sizeof(_type_) == 4) \ |
60 | 0 | shifts = 3; \ |
61 | 0 | else if (sizeof(_type_) == 8) \ |
62 | 0 | shifts = 4; \ |
63 | 0 | /* Use 8 128-bit SSE registers: each pass covers 128 bytes, i.e. (1 << (8 - shifts)) elements. */ \ |
64 | 0 | size_t count = len >> (8 - shifts); \ |
65 | 0 | len -= count << (8 - shifts); \ |
66 | 0 | \ |
67 | 0 | while (count--) \ |
68 | 0 | { \ |
69 | 0 | __m128i xmm0 = LOAD_SI128(sptr); \ |
70 | 0 | sptr += (16 / sizeof(_type_)); \ |
71 | 0 | __m128i xmm1 = LOAD_SI128(sptr); \ |
72 | 0 | sptr += (16 / sizeof(_type_)); \ |
73 | 0 | __m128i xmm2 = LOAD_SI128(sptr); \ |
74 | 0 | sptr += (16 / sizeof(_type_)); \ |
75 | 0 | __m128i xmm3 = LOAD_SI128(sptr); \ |
76 | 0 | sptr += (16 / sizeof(_type_)); \ |
77 | 0 | __m128i xmm4 = LOAD_SI128(sptr); \ |
78 | 0 | sptr += (16 / sizeof(_type_)); \ |
79 | 0 | __m128i xmm5 = LOAD_SI128(sptr); \ |
80 | 0 | sptr += (16 / sizeof(_type_)); \ |
81 | 0 | __m128i xmm6 = LOAD_SI128(sptr); \ |
82 | 0 | sptr += (16 / sizeof(_type_)); \ |
83 | 0 | __m128i xmm7 = LOAD_SI128(sptr); \ |
84 | 0 | sptr += (16 / sizeof(_type_)); \ |
85 | 0 | xmm0 = _op_(xmm0, (_op_type_)val); \ |
86 | 0 | xmm1 = _op_(xmm1, (_op_type_)val); \ |
87 | 0 | xmm2 = _op_(xmm2, (_op_type_)val); \ |
88 | 0 | xmm3 = _op_(xmm3, (_op_type_)val); \ |
89 | 0 | xmm4 = _op_(xmm4, (_op_type_)val); \ |
90 | 0 | xmm5 = _op_(xmm5, (_op_type_)val); \ |
91 | 0 | xmm6 = _op_(xmm6, (_op_type_)val); \ |
92 | 0 | xmm7 = _op_(xmm7, (_op_type_)val); \ |
93 | 0 | STORE_SI128(dptr, xmm0); \ |
94 | 0 | dptr += (16 / sizeof(_type_)); \ |
95 | 0 | STORE_SI128(dptr, xmm1); \ |
96 | 0 | dptr += (16 / sizeof(_type_)); \ |
97 | 0 | STORE_SI128(dptr, xmm2); \ |
98 | 0 | dptr += (16 / sizeof(_type_)); \ |
99 | 0 | STORE_SI128(dptr, xmm3); \ |
100 | 0 | dptr += (16 / sizeof(_type_)); \ |
101 | 0 | STORE_SI128(dptr, xmm4); \ |
102 | 0 | dptr += (16 / sizeof(_type_)); \ |
103 | 0 | STORE_SI128(dptr, xmm5); \ |
104 | 0 | dptr += (16 / sizeof(_type_)); \ |
105 | 0 | STORE_SI128(dptr, xmm6); \ |
106 | 0 | dptr += (16 / sizeof(_type_)); \ |
107 | 0 | STORE_SI128(dptr, xmm7); \ |
108 | 0 | dptr += (16 / sizeof(_type_)); \ |
109 | 0 | } \ |
110 | 0 | \ |
111 | 0 | /* Use a single 128-bit SSE register: 16 bytes, i.e. (1 << (5 - shifts)) elements, per pass. */ \ |
112 | 0 | count = len >> (5 - shifts); \ |
113 | 0 | len -= count << (5 - shifts); \ |
114 | 0 | while (count--) \ |
115 | 0 | { \ |
116 | 0 | __m128i xmm0 = LOAD_SI128(sptr); \ |
117 | 0 | sptr += (16 / sizeof(_type_)); \ |
118 | 0 | xmm0 = _op_(xmm0, (_op_type_)val); \ |
119 | 0 | STORE_SI128(dptr, xmm0); \ |
120 | 0 | dptr += (16 / sizeof(_type_)); \ |
121 | 0 | } \ |
122 | 0 | /* Finish off the remainder: the caller-supplied _slowWay_ statement runs once per leftover element. */ \ |
123 | 0 | while (len--) \ |
124 | 0 | { \ |
125 | 0 | _slowWay_; \ |
126 | 0 | } \ |
127 | 0 | return PRIMITIVES_SUCCESS; \ |
128 | 0 | } Unexecuted instantiation: prim_shift_sse3.c:sse2_lShiftC_16s Unexecuted instantiation: prim_shift_sse3.c:sse2_rShiftC_16s Unexecuted instantiation: prim_shift_sse3.c:sse2_lShiftC_16u Unexecuted instantiation: prim_shift_sse3.c:sse2_rShiftC_16u |
129 | | |
130 | | /* ---------------------------------------------------------------------------- |
131 | | * SCD = Source, Constant, Destination: defines a static function _name_ that applies _op_(x, xmm0) to pSrc in 128-bit steps, stores the results to pDst, and runs _slowWay_ once per leftover element. Always returns PRIMITIVES_SUCCESS. _fallback_ is not referenced by the expansion. |
132 | | * PRE = preload xmm0 with the constant (done once, before the loops, via mm_set1_epu32). |
133 | | */ |
134 | | #define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ |
135 | | static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \ |
136 | | _type_* WINPR_RESTRICT pDst, INT32 ilen) \ |
137 | 0 | { \ |
138 | 0 | size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); /* NOTE(review): presumably asserts ilen fits size_t (non-negative) - verify macro */ \ |
139 | 0 | int shifts = 0; /* becomes 1 + log2(sizeof(_type_)) for element sizes 1, 2, 4, 8 */ \ |
140 | 0 | const _type_* sptr = pSrc; \ |
141 | 0 | _type_* dptr = pDst; \ |
142 | 0 | __m128i xmm0; \ |
143 | 0 | if (sizeof(_type_) == 1) \ |
144 | 0 | shifts = 1; \ |
145 | 0 | else if (sizeof(_type_) == 2) \ |
146 | 0 | shifts = 2; \ |
147 | 0 | else if (sizeof(_type_) == 4) \ |
148 | 0 | shifts = 3; \ |
149 | 0 | else if (sizeof(_type_) == 8) \ |
150 | 0 | shifts = 4; \ |
151 | 0 | /* Use 4 128-bit SSE registers: each pass covers 64 bytes, i.e. (1 << (7 - shifts)) elements. */ \ |
152 | 0 | size_t count = len >> (7 - shifts); \ |
153 | 0 | len -= count << (7 - shifts); \ |
154 | 0 | xmm0 = mm_set1_epu32(val); /* NOTE(review): presumably broadcasts val into every 32-bit lane, so only 32-bit _type_ looks supported - confirm */ \ |
155 | 0 | for (size_t x = 0; x < count; x++) \ |
156 | 0 | { \ |
157 | 0 | __m128i xmm1 = LOAD_SI128(sptr); \ |
158 | 0 | sptr += (16 / sizeof(_type_)); \ |
159 | 0 | __m128i xmm2 = LOAD_SI128(sptr); \ |
160 | 0 | sptr += (16 / sizeof(_type_)); \ |
161 | 0 | __m128i xmm3 = LOAD_SI128(sptr); \ |
162 | 0 | sptr += (16 / sizeof(_type_)); \ |
163 | 0 | __m128i xmm4 = LOAD_SI128(sptr); \ |
164 | 0 | sptr += (16 / sizeof(_type_)); \ |
165 | 0 | xmm1 = _op_(xmm1, xmm0); \ |
166 | 0 | xmm2 = _op_(xmm2, xmm0); \ |
167 | 0 | xmm3 = _op_(xmm3, xmm0); \ |
168 | 0 | xmm4 = _op_(xmm4, xmm0); \ |
169 | 0 | STORE_SI128(dptr, xmm1); \ |
170 | 0 | dptr += (16 / sizeof(_type_)); \ |
171 | 0 | STORE_SI128(dptr, xmm2); \ |
172 | 0 | dptr += (16 / sizeof(_type_)); \ |
173 | 0 | STORE_SI128(dptr, xmm3); \ |
174 | 0 | dptr += (16 / sizeof(_type_)); \ |
175 | 0 | STORE_SI128(dptr, xmm4); \ |
176 | 0 | dptr += (16 / sizeof(_type_)); \ |
177 | 0 | } \ |
178 | 0 | /* Use a single 128-bit SSE register: 16 bytes, i.e. (1 << (5 - shifts)) elements, per pass. */ \ |
179 | 0 | count = len >> (5 - shifts); \ |
180 | 0 | len -= count << (5 - shifts); \ |
181 | 0 | for (size_t x = 0; x < count; x++) \ |
182 | 0 | { \ |
183 | 0 | __m128i xmm1 = LOAD_SI128(sptr); \ |
184 | 0 | sptr += (16 / sizeof(_type_)); \ |
185 | 0 | xmm1 = _op_(xmm1, xmm0); \ |
186 | 0 | STORE_SI128(dptr, xmm1); \ |
187 | 0 | dptr += (16 / sizeof(_type_)); \ |
188 | 0 | } \ |
189 | 0 | /* Finish off the remainder: the caller-supplied _slowWay_ statement runs once per leftover element. */ \ |
190 | 0 | for (size_t x = 0; x < len; x++) \ |
191 | 0 | { \ |
192 | 0 | _slowWay_; \ |
193 | 0 | } \ |
194 | 0 | return PRIMITIVES_SUCCESS; \ |
195 | 0 | } Unexecuted instantiation: prim_andor_sse3.c:sse3_andC_32u Unexecuted instantiation: prim_andor_sse3.c:sse3_orC_32u |
196 | | |
197 | | /* ---------------------------------------------------------------------------- |
198 | | * SSD = Source1, Source2, Destination: defines a static function _name_ that applies _op_(a, b) to matching 128-bit chunks of pSrc1 and pSrc2, stores the results to pDst, and runs _slowWay_ once per leftover element. Always returns PRIMITIVES_SUCCESS. _fallback_ is not referenced by the expansion. |
199 | | */ |
200 | | #define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \ |
201 | | static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \ |
202 | | const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \ |
203 | | UINT32 ulen) \ |
204 | 0 | { \ |
205 | 0 | size_t len = ulen; \ |
206 | 0 | int shifts = 0; /* becomes 1 + log2(sizeof(_type_)) for element sizes 1, 2, 4, 8 */ \ |
207 | 0 | const _type_* sptr1 = pSrc1; \ |
208 | 0 | const _type_* sptr2 = pSrc2; \ |
209 | 0 | _type_* dptr = pDst; \ |
210 | 0 | size_t count; \ |
211 | 0 | if (sizeof(_type_) == 1) \ |
212 | 0 | shifts = 1; \ |
213 | 0 | else if (sizeof(_type_) == 2) \ |
214 | 0 | shifts = 2; \ |
215 | 0 | else if (sizeof(_type_) == 4) \ |
216 | 0 | shifts = 3; \ |
217 | 0 | else if (sizeof(_type_) == 8) \ |
218 | 0 | shifts = 4; \ |
219 | 0 | /* Use 4 128-bit SSE registers per source: each pass covers 64 bytes of each input, i.e. (1 << (7 - shifts)) elements. */ \ |
220 | 0 | count = len >> (7 - shifts); \ |
221 | 0 | len -= count << (7 - shifts); \ |
222 | 0 | /* Aligned loads - NOTE(review): no alignment check is visible here; whether LOAD_SI128 needs 16-byte alignment depends on prim_avxsse.h, so this label may be stale */ \ |
223 | 0 | while (count--) \ |
224 | 0 | { \ |
225 | 0 | __m128i xmm0 = LOAD_SI128(sptr1); \ |
226 | 0 | sptr1 += (16 / sizeof(_type_)); \ |
227 | 0 | __m128i xmm1 = LOAD_SI128(sptr1); \ |
228 | 0 | sptr1 += (16 / sizeof(_type_)); \ |
229 | 0 | __m128i xmm2 = LOAD_SI128(sptr1); \ |
230 | 0 | sptr1 += (16 / sizeof(_type_)); \ |
231 | 0 | __m128i xmm3 = LOAD_SI128(sptr1); \ |
232 | 0 | sptr1 += (16 / sizeof(_type_)); \ |
233 | 0 | __m128i xmm4 = LOAD_SI128(sptr2); \ |
234 | 0 | sptr2 += (16 / sizeof(_type_)); \ |
235 | 0 | __m128i xmm5 = LOAD_SI128(sptr2); \ |
236 | 0 | sptr2 += (16 / sizeof(_type_)); \ |
237 | 0 | __m128i xmm6 = LOAD_SI128(sptr2); \ |
238 | 0 | sptr2 += (16 / sizeof(_type_)); \ |
239 | 0 | __m128i xmm7 = LOAD_SI128(sptr2); \ |
240 | 0 | sptr2 += (16 / sizeof(_type_)); \ |
241 | 0 | xmm0 = _op_(xmm0, xmm4); \ |
242 | 0 | xmm1 = _op_(xmm1, xmm5); \ |
243 | 0 | xmm2 = _op_(xmm2, xmm6); \ |
244 | 0 | xmm3 = _op_(xmm3, xmm7); \ |
245 | 0 | STORE_SI128(dptr, xmm0); \ |
246 | 0 | dptr += (16 / sizeof(_type_)); \ |
247 | 0 | STORE_SI128(dptr, xmm1); \ |
248 | 0 | dptr += (16 / sizeof(_type_)); \ |
249 | 0 | STORE_SI128(dptr, xmm2); \ |
250 | 0 | dptr += (16 / sizeof(_type_)); \ |
251 | 0 | STORE_SI128(dptr, xmm3); \ |
252 | 0 | dptr += (16 / sizeof(_type_)); \ |
253 | 0 | } \ |
254 | 0 | /* Use a single 128-bit SSE register per source: 16 bytes, i.e. (1 << (5 - shifts)) elements, per pass. */ \ |
255 | 0 | count = len >> (5 - shifts); \ |
256 | 0 | len -= count << (5 - shifts); \ |
257 | 0 | while (count--) \ |
258 | 0 | { \ |
259 | 0 | __m128i xmm0 = LOAD_SI128(sptr1); \ |
260 | 0 | sptr1 += (16 / sizeof(_type_)); \ |
261 | 0 | __m128i xmm1 = LOAD_SI128(sptr2); \ |
262 | 0 | sptr2 += (16 / sizeof(_type_)); \ |
263 | 0 | xmm0 = _op_(xmm0, xmm1); \ |
264 | 0 | STORE_SI128(dptr, xmm0); \ |
265 | 0 | dptr += (16 / sizeof(_type_)); \ |
266 | 0 | } \ |
267 | 0 | /* Finish off the remainder: the caller-supplied _slowWay_ statement runs once per leftover element. */ \ |
268 | 0 | while (len--) \ |
269 | 0 | { \ |
270 | 0 | _slowWay_; \ |
271 | 0 | } \ |
272 | 0 | return PRIMITIVES_SUCCESS; \ |
273 | 0 | } |