/src/FreeRDP/libfreerdp/primitives/sse/prim_set_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized routines to set a chunk of memory to a constant. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | * |
15 | | */ |
16 | | |
17 | | #include <freerdp/config.h> |
18 | | |
19 | | #include <string.h> |
20 | | #include <freerdp/types.h> |
21 | | #include <freerdp/primitives.h> |
22 | | #include <winpr/sysinfo.h> |
23 | | |
24 | | #include "prim_internal.h" |
25 | | #include "prim_set.h" |
26 | | |
27 | | /* ========================================================================= */ |
28 | | #if defined(SSE2_ENABLED) |
29 | | #include <emmintrin.h> |
30 | | |
31 | | static primitives_t* generic = NULL; |
32 | | |
33 | | static pstatus_t sse2_set_8u(BYTE val, BYTE* WINPR_RESTRICT pDst, UINT32 len) |
34 | | { |
35 | | BYTE byte = 0; |
36 | | BYTE* dptr = NULL; |
37 | | __m128i xmm0; |
38 | | size_t count = 0; |
39 | | |
40 | | if (len < 16) |
41 | | return generic->set_8u(val, pDst, len); |
42 | | |
43 | | byte = val; |
44 | | dptr = pDst; |
45 | | |
46 | | /* Seek 16-byte alignment. */ |
47 | | while ((ULONG_PTR)dptr & 0x0f) |
48 | | { |
49 | | *dptr++ = byte; |
50 | | |
51 | | if (--len == 0) |
52 | | return PRIMITIVES_SUCCESS; |
53 | | } |
54 | | |
55 | | xmm0 = _mm_set1_epi8(byte); |
56 | | /* Cover 256-byte chunks via SSE register stores. */ |
57 | | count = len >> 8; |
58 | | len -= count << 8; |
59 | | |
60 | | /* Do 256-byte chunks using one XMM register. */ |
61 | | while (count--) |
62 | | { |
63 | | _mm_store_si128((__m128i*)dptr, xmm0); |
64 | | dptr += 16; |
65 | | _mm_store_si128((__m128i*)dptr, xmm0); |
66 | | dptr += 16; |
67 | | _mm_store_si128((__m128i*)dptr, xmm0); |
68 | | dptr += 16; |
69 | | _mm_store_si128((__m128i*)dptr, xmm0); |
70 | | dptr += 16; |
71 | | _mm_store_si128((__m128i*)dptr, xmm0); |
72 | | dptr += 16; |
73 | | _mm_store_si128((__m128i*)dptr, xmm0); |
74 | | dptr += 16; |
75 | | _mm_store_si128((__m128i*)dptr, xmm0); |
76 | | dptr += 16; |
77 | | _mm_store_si128((__m128i*)dptr, xmm0); |
78 | | dptr += 16; |
79 | | _mm_store_si128((__m128i*)dptr, xmm0); |
80 | | dptr += 16; |
81 | | _mm_store_si128((__m128i*)dptr, xmm0); |
82 | | dptr += 16; |
83 | | _mm_store_si128((__m128i*)dptr, xmm0); |
84 | | dptr += 16; |
85 | | _mm_store_si128((__m128i*)dptr, xmm0); |
86 | | dptr += 16; |
87 | | _mm_store_si128((__m128i*)dptr, xmm0); |
88 | | dptr += 16; |
89 | | _mm_store_si128((__m128i*)dptr, xmm0); |
90 | | dptr += 16; |
91 | | _mm_store_si128((__m128i*)dptr, xmm0); |
92 | | dptr += 16; |
93 | | _mm_store_si128((__m128i*)dptr, xmm0); |
94 | | dptr += 16; |
95 | | } |
96 | | |
97 | | /* Cover 16-byte chunks via SSE register stores. */ |
98 | | count = len >> 4; |
99 | | len -= count << 4; |
100 | | |
101 | | /* Do 16-byte chunks using one XMM register. */ |
102 | | while (count--) |
103 | | { |
104 | | _mm_store_si128((__m128i*)dptr, xmm0); |
105 | | dptr += 16; |
106 | | } |
107 | | |
108 | | /* Do leftover bytes. */ |
109 | | while (len--) |
110 | | *dptr++ = byte; |
111 | | |
112 | | return PRIMITIVES_SUCCESS; |
113 | | } |
114 | | |
115 | | /* ------------------------------------------------------------------------- */ |
116 | | static pstatus_t sse2_set_32u(UINT32 val, UINT32* WINPR_RESTRICT pDst, UINT32 len) |
117 | | { |
118 | | const primitives_t* prim = primitives_get_generic(); |
119 | | UINT32* dptr = pDst; |
120 | | __m128i xmm0; |
121 | | size_t count = 0; |
122 | | |
123 | | /* If really short, just do it here. */ |
124 | | if (len < 32) |
125 | | { |
126 | | while (len--) |
127 | | *dptr++ = val; |
128 | | |
129 | | return PRIMITIVES_SUCCESS; |
130 | | } |
131 | | |
132 | | /* Assure we can reach 16-byte alignment. */ |
133 | | if (((ULONG_PTR)dptr & 0x03) != 0) |
134 | | { |
135 | | return prim->set_32u(val, pDst, len); |
136 | | } |
137 | | |
138 | | /* Seek 16-byte alignment. */ |
139 | | while ((ULONG_PTR)dptr & 0x0f) |
140 | | { |
141 | | *dptr++ = val; |
142 | | |
143 | | if (--len == 0) |
144 | | return PRIMITIVES_SUCCESS; |
145 | | } |
146 | | |
147 | | xmm0 = _mm_set1_epi32(val); |
148 | | /* Cover 256-byte chunks via SSE register stores. */ |
149 | | count = len >> 6; |
150 | | len -= count << 6; |
151 | | |
152 | | /* Do 256-byte chunks using one XMM register. */ |
153 | | while (count--) |
154 | | { |
155 | | _mm_store_si128((__m128i*)dptr, xmm0); |
156 | | dptr += 4; |
157 | | _mm_store_si128((__m128i*)dptr, xmm0); |
158 | | dptr += 4; |
159 | | _mm_store_si128((__m128i*)dptr, xmm0); |
160 | | dptr += 4; |
161 | | _mm_store_si128((__m128i*)dptr, xmm0); |
162 | | dptr += 4; |
163 | | _mm_store_si128((__m128i*)dptr, xmm0); |
164 | | dptr += 4; |
165 | | _mm_store_si128((__m128i*)dptr, xmm0); |
166 | | dptr += 4; |
167 | | _mm_store_si128((__m128i*)dptr, xmm0); |
168 | | dptr += 4; |
169 | | _mm_store_si128((__m128i*)dptr, xmm0); |
170 | | dptr += 4; |
171 | | _mm_store_si128((__m128i*)dptr, xmm0); |
172 | | dptr += 4; |
173 | | _mm_store_si128((__m128i*)dptr, xmm0); |
174 | | dptr += 4; |
175 | | _mm_store_si128((__m128i*)dptr, xmm0); |
176 | | dptr += 4; |
177 | | _mm_store_si128((__m128i*)dptr, xmm0); |
178 | | dptr += 4; |
179 | | _mm_store_si128((__m128i*)dptr, xmm0); |
180 | | dptr += 4; |
181 | | _mm_store_si128((__m128i*)dptr, xmm0); |
182 | | dptr += 4; |
183 | | _mm_store_si128((__m128i*)dptr, xmm0); |
184 | | dptr += 4; |
185 | | _mm_store_si128((__m128i*)dptr, xmm0); |
186 | | dptr += 4; |
187 | | } |
188 | | |
189 | | /* Cover 16-byte chunks via SSE register stores. */ |
190 | | count = len >> 2; |
191 | | len -= count << 2; |
192 | | |
193 | | /* Do 16-byte chunks using one XMM register. */ |
194 | | while (count--) |
195 | | { |
196 | | _mm_store_si128((__m128i*)dptr, xmm0); |
197 | | dptr += 4; |
198 | | } |
199 | | |
200 | | /* Do leftover bytes. */ |
201 | | while (len--) |
202 | | *dptr++ = val; |
203 | | |
204 | | return PRIMITIVES_SUCCESS; |
205 | | } |
206 | | |
207 | | /* ------------------------------------------------------------------------- */ |
208 | | static pstatus_t sse2_set_32s(INT32 val, INT32* WINPR_RESTRICT pDst, UINT32 len) |
209 | | { |
210 | | UINT32 uval = *((UINT32*)&val); |
211 | | return sse2_set_32u(uval, (UINT32*)pDst, len); |
212 | | } |
213 | | #endif |
214 | | |
215 | | /* ------------------------------------------------------------------------- */ |
216 | | void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims) |
217 | 0 | { |
218 | | #if defined(SSE2_ENABLED) |
219 | | generic = primitives_get_generic(); |
220 | | primitives_init_set(prims); |
221 | | /* Pick tuned versions if possible. */ |
222 | | |
223 | | if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE)) |
224 | | { |
225 | | WLog_VRB(PRIM_TAG, "SSE2 optimizations"); |
226 | | prims->set_8u = sse2_set_8u; |
227 | | prims->set_32s = sse2_set_32s; |
228 | | prims->set_32u = sse2_set_32u; |
229 | | } |
230 | | |
231 | | #else |
232 | 0 | WLog_VRB(PRIM_TAG, "undefined WITH_SSE2"); |
233 | 0 | WINPR_UNUSED(prims); |
234 | 0 | #endif |
235 | 0 | } |