/src/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Optimized sign operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | */ |
15 | | |
16 | | #include <freerdp/config.h> |
17 | | |
18 | | #include <freerdp/types.h> |
19 | | #include <freerdp/primitives.h> |
20 | | #include <winpr/sysinfo.h> |
21 | | |
22 | | #include "prim_sign.h" |
23 | | |
24 | | #include "prim_internal.h" |
25 | | #include "prim_avxsse.h" |
26 | | |
27 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
28 | | #include <emmintrin.h> |
29 | | #include <tmmintrin.h> |
30 | | |
31 | | static primitives_t* generic = NULL; |
32 | | |
33 | | /* ------------------------------------------------------------------------- */ |
34 | | static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst, |
35 | | UINT32 ulen) |
36 | 0 | { |
37 | 0 | size_t len = ulen; |
38 | 0 | const INT16* sptr = pSrc; |
39 | 0 | INT16* dptr = pDst; |
40 | 0 | size_t count = 0; |
41 | |
|
42 | 0 | if (len < 16) |
43 | 0 | { |
44 | 0 | return generic->sign_16s(pSrc, pDst, ulen); |
45 | 0 | } |
46 | | |
47 | | /* Check for 16-byte alignment (eventually). */ |
48 | 0 | if ((ULONG_PTR)pDst & 0x01) |
49 | 0 | { |
50 | 0 | return generic->sign_16s(pSrc, pDst, ulen); |
51 | 0 | } |
52 | | |
53 | | /* Seek 16-byte alignment. */ |
54 | 0 | while ((ULONG_PTR)dptr & 0x0f) |
55 | 0 | { |
56 | 0 | INT16 src = *sptr++; |
57 | 0 | *dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0)); |
58 | | |
59 | 0 | if (--len == 0) |
60 | 0 | return PRIMITIVES_SUCCESS; |
61 | 0 | } |
62 | | |
63 | | /* Do 32-short chunks using 8 XMM registers. */ |
64 | 0 | count = len >> 5; /* / 32 */ |
65 | 0 | len -= count << 5; /* * 32 */ |
66 | |
|
67 | 0 | if ((ULONG_PTR)sptr & 0x0f) |
68 | 0 | { |
69 | | /* Unaligned */ |
70 | 0 | while (count--) |
71 | 0 | { |
72 | 0 | __m128i xmm0; |
73 | 0 | __m128i xmm1; |
74 | 0 | __m128i xmm2; |
75 | 0 | __m128i xmm3; |
76 | 0 | __m128i xmm4; |
77 | 0 | __m128i xmm5; |
78 | 0 | __m128i xmm6; |
79 | 0 | __m128i xmm7; |
80 | 0 | xmm0 = _mm_set1_epi16(0x0001U); |
81 | 0 | xmm1 = _mm_set1_epi16(0x0001U); |
82 | 0 | xmm2 = _mm_set1_epi16(0x0001U); |
83 | 0 | xmm3 = _mm_set1_epi16(0x0001U); |
84 | 0 | xmm4 = LOAD_SI128(sptr); |
85 | 0 | sptr += 8; |
86 | 0 | xmm5 = LOAD_SI128(sptr); |
87 | 0 | sptr += 8; |
88 | 0 | xmm6 = LOAD_SI128(sptr); |
89 | 0 | sptr += 8; |
90 | 0 | xmm7 = LOAD_SI128(sptr); |
91 | 0 | sptr += 8; |
92 | 0 | xmm0 = _mm_sign_epi16(xmm0, xmm4); |
93 | 0 | xmm1 = _mm_sign_epi16(xmm1, xmm5); |
94 | 0 | xmm2 = _mm_sign_epi16(xmm2, xmm6); |
95 | 0 | xmm3 = _mm_sign_epi16(xmm3, xmm7); |
96 | 0 | STORE_SI128(dptr, xmm0); |
97 | 0 | dptr += 8; |
98 | 0 | STORE_SI128(dptr, xmm1); |
99 | 0 | dptr += 8; |
100 | 0 | STORE_SI128(dptr, xmm2); |
101 | 0 | dptr += 8; |
102 | 0 | STORE_SI128(dptr, xmm3); |
103 | 0 | dptr += 8; |
104 | 0 | } |
105 | 0 | } |
106 | 0 | else |
107 | 0 | { |
108 | | /* Aligned */ |
109 | 0 | while (count--) |
110 | 0 | { |
111 | 0 | __m128i xmm0; |
112 | 0 | __m128i xmm1; |
113 | 0 | __m128i xmm2; |
114 | 0 | __m128i xmm3; |
115 | 0 | __m128i xmm4; |
116 | 0 | __m128i xmm5; |
117 | 0 | __m128i xmm6; |
118 | 0 | __m128i xmm7; |
119 | 0 | xmm0 = _mm_set1_epi16(0x0001U); |
120 | 0 | xmm1 = _mm_set1_epi16(0x0001U); |
121 | 0 | xmm2 = _mm_set1_epi16(0x0001U); |
122 | 0 | xmm3 = _mm_set1_epi16(0x0001U); |
123 | 0 | xmm4 = LOAD_SI128(sptr); |
124 | 0 | sptr += 8; |
125 | 0 | xmm5 = LOAD_SI128(sptr); |
126 | 0 | sptr += 8; |
127 | 0 | xmm6 = LOAD_SI128(sptr); |
128 | 0 | sptr += 8; |
129 | 0 | xmm7 = LOAD_SI128(sptr); |
130 | 0 | sptr += 8; |
131 | 0 | xmm0 = _mm_sign_epi16(xmm0, xmm4); |
132 | 0 | xmm1 = _mm_sign_epi16(xmm1, xmm5); |
133 | 0 | xmm2 = _mm_sign_epi16(xmm2, xmm6); |
134 | 0 | xmm3 = _mm_sign_epi16(xmm3, xmm7); |
135 | 0 | STORE_SI128(dptr, xmm0); |
136 | 0 | dptr += 8; |
137 | 0 | STORE_SI128(dptr, xmm1); |
138 | 0 | dptr += 8; |
139 | 0 | STORE_SI128(dptr, xmm2); |
140 | 0 | dptr += 8; |
141 | 0 | STORE_SI128(dptr, xmm3); |
142 | 0 | dptr += 8; |
143 | 0 | } |
144 | 0 | } |
145 | | |
146 | | /* Do 8-short chunks using two XMM registers. */ |
147 | 0 | count = len >> 3; |
148 | 0 | len -= count << 3; |
149 | |
|
150 | 0 | while (count--) |
151 | 0 | { |
152 | 0 | __m128i xmm0 = _mm_set1_epi16(0x0001U); |
153 | 0 | __m128i xmm1 = LOAD_SI128(sptr); |
154 | 0 | sptr += 8; |
155 | 0 | xmm0 = _mm_sign_epi16(xmm0, xmm1); |
156 | 0 | STORE_SI128(dptr, xmm0); |
157 | 0 | dptr += 8; |
158 | 0 | } |
159 | | |
160 | | /* Do leftovers. */ |
161 | 0 | while (len--) |
162 | 0 | { |
163 | 0 | INT16 src = *sptr++; |
164 | 0 | *dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0)); |
165 | 0 | } |
166 | | |
167 | 0 | return PRIMITIVES_SUCCESS; |
168 | 0 | } |
169 | | |
170 | | #endif /* SSE_AVX_INTRINSICS_ENABLED */ |
171 | | |
172 | | /* ------------------------------------------------------------------------- */ |
173 | | void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims) |
174 | 0 | { |
175 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
176 | 0 | generic = primitives_get_generic(); |
177 | | |
178 | | /* Pick tuned versions if possible. */ |
179 | | /* I didn't spot an IPP version of this. */ |
180 | |
|
181 | 0 | WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations"); |
182 | 0 | prims->sign_16s = ssse3_sign_16s; |
183 | |
|
184 | | #else |
185 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available"); |
186 | | WINPR_UNUSED(prims); |
187 | | #endif |
188 | 0 | } |