/src/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Shift operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | */ |
15 | | |
16 | | #include <freerdp/config.h> |
17 | | |
18 | | #include <freerdp/types.h> |
19 | | #include <freerdp/primitives.h> |
20 | | #include <winpr/sysinfo.h> |
21 | | |
22 | | #include "prim_shift.h" |
23 | | |
24 | | #include "prim_internal.h" |
25 | | #include "prim_templates.h" |
26 | | |
27 | | #if defined(SSE2_ENABLED) |
28 | | #include <emmintrin.h> |
29 | | #include <pmmintrin.h> |
30 | | |
31 | | static primitives_t* generic = NULL; |
32 | | |
33 | | /* ------------------------------------------------------------------------- */ |
34 | | SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, |
35 | | *dptr++ = (INT16)((UINT16)*sptr++ << val)) |
36 | | /* ------------------------------------------------------------------------- */ |
37 | | SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, |
38 | | *dptr++ = *sptr++ >> val) |
39 | | /* ------------------------------------------------------------------------- */ |
40 | | SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, |
41 | | *dptr++ = (INT16)((UINT16)*sptr++ << val)) |
42 | | /* ------------------------------------------------------------------------- */ |
43 | | SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, |
44 | | *dptr++ = *sptr++ >> val) |
45 | | |
46 | | static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len) |
47 | | { |
48 | | const INT32 shifts = 2; |
49 | | if (val == 0) |
50 | | return PRIMITIVES_SUCCESS; |
51 | | if (val >= 16) |
52 | | return -1; |
53 | | if (len < 16) /* pointless if too small */ |
54 | | return generic->lShiftC_16s_inplace(pSrcDst, val, len); |
55 | | |
56 | | UINT32 offBeatMask = (1 << (shifts - 1)) - 1; |
57 | | if ((ULONG_PTR)pSrcDst & offBeatMask) |
58 | | { |
59 | | /* Incrementing the pointer skips over 16-byte boundary. */ |
60 | | return generic->lShiftC_16s_inplace(pSrcDst, val, len); |
61 | | } |
62 | | /* Get to the 16-byte boundary now. */ |
63 | | const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16); |
64 | | if (rem > 0) |
65 | | { |
66 | | const UINT32 add = 16 - rem; |
67 | | pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add); |
68 | | if (status != PRIMITIVES_SUCCESS) |
69 | | return status; |
70 | | pSrcDst += add; |
71 | | len -= add; |
72 | | } |
73 | | |
74 | | /* Use 8 128-bit SSE registers. */ |
75 | | int count = len >> (8 - shifts); |
76 | | len -= count << (8 - shifts); |
77 | | |
78 | | while (count--) |
79 | | { |
80 | | const __m128i* src = (const __m128i*)pSrcDst; |
81 | | |
82 | | __m128i xmm0 = _mm_load_si128(src++); |
83 | | __m128i xmm1 = _mm_load_si128(src++); |
84 | | __m128i xmm2 = _mm_load_si128(src++); |
85 | | __m128i xmm3 = _mm_load_si128(src++); |
86 | | __m128i xmm4 = _mm_load_si128(src++); |
87 | | __m128i xmm5 = _mm_load_si128(src++); |
88 | | __m128i xmm6 = _mm_load_si128(src++); |
89 | | __m128i xmm7 = _mm_load_si128(src); |
90 | | |
91 | | xmm0 = _mm_slli_epi16(xmm0, val); |
92 | | xmm1 = _mm_slli_epi16(xmm1, val); |
93 | | xmm2 = _mm_slli_epi16(xmm2, val); |
94 | | xmm3 = _mm_slli_epi16(xmm3, val); |
95 | | xmm4 = _mm_slli_epi16(xmm4, val); |
96 | | xmm5 = _mm_slli_epi16(xmm5, val); |
97 | | xmm6 = _mm_slli_epi16(xmm6, val); |
98 | | xmm7 = _mm_slli_epi16(xmm7, val); |
99 | | |
100 | | __m128i* dst = (__m128i*)pSrcDst; |
101 | | |
102 | | _mm_store_si128(dst++, xmm0); |
103 | | _mm_store_si128(dst++, xmm1); |
104 | | _mm_store_si128(dst++, xmm2); |
105 | | _mm_store_si128(dst++, xmm3); |
106 | | _mm_store_si128(dst++, xmm4); |
107 | | _mm_store_si128(dst++, xmm5); |
108 | | _mm_store_si128(dst++, xmm6); |
109 | | _mm_store_si128(dst++, xmm7); |
110 | | |
111 | | pSrcDst = (INT16*)dst; |
112 | | } |
113 | | |
114 | | /* Use a single 128-bit SSE register. */ |
115 | | count = len >> (5 - shifts); |
116 | | len -= count << (5 - shifts); |
117 | | while (count--) |
118 | | { |
119 | | const __m128i* src = (const __m128i*)pSrcDst; |
120 | | __m128i xmm0 = LOAD_SI128(src); |
121 | | |
122 | | xmm0 = _mm_slli_epi16(xmm0, val); |
123 | | |
124 | | __m128i* dst = (__m128i*)pSrcDst; |
125 | | _mm_store_si128(dst++, xmm0); |
126 | | pSrcDst = (INT16*)dst; |
127 | | } |
128 | | |
129 | | /* Finish off the remainder. */ |
130 | | if (len > 0) |
131 | | return generic->lShiftC_16s_inplace(pSrcDst, val, len); |
132 | | |
133 | | return PRIMITIVES_SUCCESS; |
134 | | } |
135 | | #endif |
136 | | |
137 | | /* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s |
138 | | * depending on the sign of val. To avoid using the deprecated inplace |
139 | | * routines, a wrapper can use the src for the dest. |
140 | | */ |
141 | | |
142 | | /* ------------------------------------------------------------------------- */ |
143 | | void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims) |
144 | 0 | { |
145 | | #if defined(SSE2_ENABLED) |
146 | | generic = primitives_get_generic(); |
147 | | primitives_init_shift(prims); |
148 | | |
149 | | if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) && |
150 | | IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) |
151 | | { |
152 | | WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); |
153 | | prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace; |
154 | | prims->lShiftC_16s = sse2_lShiftC_16s; |
155 | | prims->rShiftC_16s = sse2_rShiftC_16s; |
156 | | prims->lShiftC_16u = sse2_lShiftC_16u; |
157 | | prims->rShiftC_16u = sse2_rShiftC_16u; |
158 | | } |
159 | | |
160 | | #else |
161 | 0 | WLog_VRB(PRIM_TAG, "undefined WITH_SSE2"); |
162 | 0 | WINPR_UNUSED(prims); |
163 | 0 | #endif |
164 | 0 | } |