/src/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* FreeRDP: A Remote Desktop Protocol Client |
2 | | * Shift operations. |
3 | | * vi:ts=4 sw=4: |
4 | | * |
5 | | * (c) Copyright 2012 Hewlett-Packard Development Company, L.P. |
6 | | * Licensed under the Apache License, Version 2.0 (the "License"); you may |
7 | | * not use this file except in compliance with the License. You may obtain |
8 | | * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. |
9 | | * Unless required by applicable law or agreed to in writing, software |
10 | | * distributed under the License is distributed on an "AS IS" BASIS, |
11 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
12 | | * or implied. See the License for the specific language governing |
13 | | * permissions and limitations under the License. |
14 | | */ |
15 | | |
16 | | #include <freerdp/config.h> |
17 | | |
18 | | #include <freerdp/types.h> |
19 | | #include <freerdp/primitives.h> |
20 | | #include <winpr/sysinfo.h> |
21 | | |
22 | | #include "prim_shift.h" |
23 | | |
24 | | #include "prim_internal.h" |
25 | | #include "prim_templates.h" |
26 | | |
27 | | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
28 | | #include <emmintrin.h> |
29 | | #include <pmmintrin.h> |
30 | | |
/* Primitives table used as the fallback by the routines in this file.
 * It stays NULL until primitives_init_shift_sse3_int() stores the result of
 * primitives_get_generic() here. */
static primitives_t* generic = NULL;
32 | | |
/* ------------------------------------------------------------------------- */
/* Defines sse2_lShiftC_16s by instantiating SSE3_SCD_ROUTINE (the macro is not
 * defined in this file; presumably it comes from prim_templates.h -- confirm).
 * Arguments, in order: routine name, element type, fallback routine from the
 * generic table, SSE intrinsic (_mm_slli_epi16), a type passed next to the
 * intrinsic (presumably the cast applied to the shift count -- confirm), and
 * the scalar statement for one element.
 * The scalar statement shifts the source value as UINT16, masks the result to
 * 16 bits and converts it back to INT16, so no negative value is left-shifted.
 * NOTE(review): sptr, dptr and val are not declared here; presumably they are
 * locals/parameters introduced by the macro expansion. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16, int16_t,
                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Defines sse2_rShiftC_16s via SSE3_SCD_ROUTINE for INT16 elements, with
 * generic->rShiftC_16s as the fallback and _mm_srai_epi16 (arithmetic right
 * shift of 16-bit lanes) as the vector operation.
 * The scalar statement is a plain `>>` on the signed source element.
 * NOTE(review): `>>` on a negative signed value is implementation-defined in
 * C; the scalar path matches the arithmetic vector shift only on compilers
 * that sign-extend -- confirm this holds for all supported toolchains. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
/* ------------------------------------------------------------------------- */
/* Defines sse2_lShiftC_16u via SSE3_SCD_ROUTINE for UINT16 elements, with
 * generic->lShiftC_16u as the fallback and _mm_slli_epi16 as the vector
 * operation.
 * The scalar statement shifts the source value left and masks the result to
 * the low 16 bits before it is stored.
 * NOTE(review): the fifth argument is int16_t even though the elements are
 * unsigned; presumably it only types the shift count -- confirm against the
 * macro definition. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16, int16_t,
                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))
/* ------------------------------------------------------------------------- */
/* Defines sse2_rShiftC_16u via SSE3_SCD_ROUTINE for UINT16 elements, with
 * generic->rShiftC_16u as the fallback and _mm_srli_epi16 (logical right
 * shift of 16-bit lanes, zero fill) as the vector operation.
 * The scalar statement is a plain `>>` on the unsigned source element. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
45 | | |
46 | | static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen) |
47 | 0 | { |
48 | 0 | size_t len = ulen; |
49 | 0 | const INT32 shifts = 2; |
50 | 0 | if (val == 0) |
51 | 0 | return PRIMITIVES_SUCCESS; |
52 | 0 | if (val >= 16) |
53 | 0 | return -1; |
54 | 0 | if (len < 16) /* pointless if too small */ |
55 | 0 | return generic->lShiftC_16s_inplace(pSrcDst, val, ulen); |
56 | | |
57 | 0 | UINT32 offBeatMask = (1 << (shifts - 1)) - 1; |
58 | 0 | if ((ULONG_PTR)pSrcDst & offBeatMask) |
59 | 0 | { |
60 | | /* Incrementing the pointer skips over 16-byte boundary. */ |
61 | 0 | return generic->lShiftC_16s_inplace(pSrcDst, val, ulen); |
62 | 0 | } |
63 | | /* Get to the 16-byte boundary now. */ |
64 | 0 | const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16); |
65 | 0 | if (rem > 0) |
66 | 0 | { |
67 | 0 | const UINT32 add = 16 - rem; |
68 | 0 | pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add); |
69 | 0 | if (status != PRIMITIVES_SUCCESS) |
70 | 0 | return status; |
71 | 0 | pSrcDst += add; |
72 | 0 | len -= add; |
73 | 0 | } |
74 | | |
75 | | /* Use 8 128-bit SSE registers. */ |
76 | 0 | size_t count = len >> (8 - shifts); |
77 | 0 | len -= count << (8 - shifts); |
78 | |
|
79 | 0 | while (count--) |
80 | 0 | { |
81 | 0 | const __m128i* src = (const __m128i*)pSrcDst; |
82 | |
|
83 | 0 | __m128i xmm0 = LOAD_SI128(src++); |
84 | 0 | __m128i xmm1 = LOAD_SI128(src++); |
85 | 0 | __m128i xmm2 = LOAD_SI128(src++); |
86 | 0 | __m128i xmm3 = LOAD_SI128(src++); |
87 | 0 | __m128i xmm4 = LOAD_SI128(src++); |
88 | 0 | __m128i xmm5 = LOAD_SI128(src++); |
89 | 0 | __m128i xmm6 = LOAD_SI128(src++); |
90 | 0 | __m128i xmm7 = LOAD_SI128(src); |
91 | |
|
92 | 0 | xmm0 = _mm_slli_epi16(xmm0, (int16_t)val); |
93 | 0 | xmm1 = _mm_slli_epi16(xmm1, (int16_t)val); |
94 | 0 | xmm2 = _mm_slli_epi16(xmm2, (int16_t)val); |
95 | 0 | xmm3 = _mm_slli_epi16(xmm3, (int16_t)val); |
96 | 0 | xmm4 = _mm_slli_epi16(xmm4, (int16_t)val); |
97 | 0 | xmm5 = _mm_slli_epi16(xmm5, (int16_t)val); |
98 | 0 | xmm6 = _mm_slli_epi16(xmm6, (int16_t)val); |
99 | 0 | xmm7 = _mm_slli_epi16(xmm7, (int16_t)val); |
100 | |
|
101 | 0 | __m128i* dst = (__m128i*)pSrcDst; |
102 | |
|
103 | 0 | STORE_SI128(dst++, xmm0); |
104 | 0 | STORE_SI128(dst++, xmm1); |
105 | 0 | STORE_SI128(dst++, xmm2); |
106 | 0 | STORE_SI128(dst++, xmm3); |
107 | 0 | STORE_SI128(dst++, xmm4); |
108 | 0 | STORE_SI128(dst++, xmm5); |
109 | 0 | STORE_SI128(dst++, xmm6); |
110 | 0 | STORE_SI128(dst++, xmm7); |
111 | |
|
112 | 0 | pSrcDst = (INT16*)dst; |
113 | 0 | } |
114 | | |
115 | | /* Use a single 128-bit SSE register. */ |
116 | 0 | count = len >> (5 - shifts); |
117 | 0 | len -= count << (5 - shifts); |
118 | 0 | while (count--) |
119 | 0 | { |
120 | 0 | const __m128i* src = (const __m128i*)pSrcDst; |
121 | 0 | __m128i xmm0 = LOAD_SI128(src); |
122 | |
|
123 | 0 | xmm0 = _mm_slli_epi16(xmm0, (int16_t)val); |
124 | |
|
125 | 0 | __m128i* dst = (__m128i*)pSrcDst; |
126 | 0 | STORE_SI128(dst++, xmm0); |
127 | 0 | pSrcDst = (INT16*)dst; |
128 | 0 | } |
129 | | |
130 | | /* Finish off the remainder. */ |
131 | 0 | if (len > 0) |
132 | 0 | return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len)); |
133 | | |
134 | 0 | return PRIMITIVES_SUCCESS; |
135 | 0 | } |
136 | | #endif |
137 | | |
138 | | /* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s |
139 | | * depending on the sign of val. To avoid using the deprecated inplace |
140 | | * routines, a wrapper can use the src for the dest. |
141 | | */ |
142 | | |
143 | | /* ------------------------------------------------------------------------- */ |
144 | | void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims) |
145 | 0 | { |
146 | 0 | #if defined(SSE_AVX_INTRINSICS_ENABLED) |
147 | 0 | generic = primitives_get_generic(); |
148 | |
|
149 | 0 | WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations"); |
150 | 0 | prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace; |
151 | 0 | prims->lShiftC_16s = sse2_lShiftC_16s; |
152 | 0 | prims->rShiftC_16s = sse2_rShiftC_16s; |
153 | 0 | prims->lShiftC_16u = sse2_lShiftC_16u; |
154 | 0 | prims->rShiftC_16u = sse2_rShiftC_16u; |
155 | |
|
156 | | #else |
157 | | WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available"); |
158 | | WINPR_UNUSED(prims); |
159 | | #endif |
160 | 0 | } |