/src/FreeRDP/libfreerdp/primitives/sse/prim_add_sse3.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized add operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_add.h"

#include "prim_internal.h"
#include "prim_templates.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>

static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))

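/*
 * Note: SSE3_SSD_ROUTINE is a template macro defined in prim_templates.h.
 * As a rough sketch only (see that header for the authoritative definition),
 * the invocation above is expected to generate a function compatible with the
 * add_16s function-pointer slot, roughly:
 *
 *   pstatus_t sse3_add_16s(const INT16* pSrc1, const INT16* pSrc2,
 *                          INT16* pDst, UINT32 len);
 *
 * using generic->add_16s for short buffers and unaligned head/tail elements,
 * and _mm_adds_epi16 (saturating 16-bit add) for the aligned bulk.
 */
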
/* In-place variant: both buffers are overwritten with the per-element
 * saturated sum of the two. */
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 ulen)
{
    const int shifts = 2;
    INT16* dptr1 = pSrcDst1;
    INT16* dptr2 = pSrcDst2;

    if (ulen < 16) /* pointless if too small */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);

    UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
    if ((ULONG_PTR)pSrcDst1 & offBeatMask)
    {
        /* An odd start address can never land on a 16-byte boundary by
         * stepping whole INT16 elements, so use the generic routine. */
        return generic->add_16s_inplace(pSrcDst1, pSrcDst2, ulen);
    }

    size_t len = ulen;

    /* Get to the 16-byte boundary now. */
    const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
    if (rem != 0)
    {
        const UINT32 add = 16 - (UINT32)rem;
        pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
        if (status != PRIMITIVES_SUCCESS)
            return status;
        dptr1 += add;
        dptr2 += add;
        len -= add; /* the head elements are done; do not count them again below */
    }
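    /*
     * Worked example of the head alignment step above: with dptr1 ending in
     * 0x...6, rem = 0x6 / sizeof(INT16) = 3 and add = 16 - 3 = 13, so the
     * generic routine handles 13 elements (26 bytes) and the pointers advance
     * to 0x...6 + 26 = 0x...20, a 16-byte boundary (one boundary further than
     * strictly necessary, which is harmless).
     */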
    /* Bulk pass: use 4 128-bit SSE registers. */
    size_t count = len >> (7 - shifts);
    len -= count << (7 - shifts);
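    /*
     * For reference: with shifts == 2, 1 << (7 - shifts) == 32, so each pass of
     * the loops below consumes 32 INT16 values (4 registers of 8 values each);
     * e.g. len == 100 gives count == 3 with 4 values left over.
     */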
    if (((const ULONG_PTR)dptr1 & 0x0f) || ((const ULONG_PTR)dptr2 & 0x0f))
    {
        /* Unaligned loads */
        while (count--)
        {
            const __m128i* vsptr1 = (const __m128i*)dptr1;
            const __m128i* vsptr2 = (const __m128i*)dptr2;
            __m128i* vdptr1 = (__m128i*)dptr1;
            __m128i* vdptr2 = (__m128i*)dptr2;

            __m128i xmm0 = LOAD_SI128(vsptr1++);
            __m128i xmm1 = LOAD_SI128(vsptr1++);
            __m128i xmm2 = LOAD_SI128(vsptr1++);
            __m128i xmm3 = LOAD_SI128(vsptr1++);
            __m128i xmm4 = LOAD_SI128(vsptr2++);
            __m128i xmm5 = LOAD_SI128(vsptr2++);
            __m128i xmm6 = LOAD_SI128(vsptr2++);
            __m128i xmm7 = LOAD_SI128(vsptr2++);

            xmm0 = _mm_adds_epi16(xmm0, xmm4);
            xmm1 = _mm_adds_epi16(xmm1, xmm5);
            xmm2 = _mm_adds_epi16(xmm2, xmm6);
            xmm3 = _mm_adds_epi16(xmm3, xmm7);

            STORE_SI128(vdptr1++, xmm0);
            STORE_SI128(vdptr1++, xmm1);
            STORE_SI128(vdptr1++, xmm2);
            STORE_SI128(vdptr1++, xmm3);

            STORE_SI128(vdptr2++, xmm0);
            STORE_SI128(vdptr2++, xmm1);
            STORE_SI128(vdptr2++, xmm2);
            STORE_SI128(vdptr2++, xmm3);

            dptr1 = (INT16*)vdptr1;
            dptr2 = (INT16*)vdptr2;
        }
    }
    else
    {
        /* Aligned loads */
        while (count--)
        {
            const __m128i* vsptr1 = (const __m128i*)dptr1;
            const __m128i* vsptr2 = (const __m128i*)dptr2;
            __m128i* vdptr1 = (__m128i*)dptr1;
            __m128i* vdptr2 = (__m128i*)dptr2;

            __m128i xmm0 = LOAD_SI128(vsptr1++);
            __m128i xmm1 = LOAD_SI128(vsptr1++);
            __m128i xmm2 = LOAD_SI128(vsptr1++);
            __m128i xmm3 = LOAD_SI128(vsptr1++);
            __m128i xmm4 = LOAD_SI128(vsptr2++);
            __m128i xmm5 = LOAD_SI128(vsptr2++);
            __m128i xmm6 = LOAD_SI128(vsptr2++);
            __m128i xmm7 = LOAD_SI128(vsptr2++);

            xmm0 = _mm_adds_epi16(xmm0, xmm4);
            xmm1 = _mm_adds_epi16(xmm1, xmm5);
            xmm2 = _mm_adds_epi16(xmm2, xmm6);
            xmm3 = _mm_adds_epi16(xmm3, xmm7);

            STORE_SI128(vdptr1++, xmm0);
            STORE_SI128(vdptr1++, xmm1);
            STORE_SI128(vdptr1++, xmm2);
            STORE_SI128(vdptr1++, xmm3);

            STORE_SI128(vdptr2++, xmm0);
            STORE_SI128(vdptr2++, xmm1);
            STORE_SI128(vdptr2++, xmm2);
            STORE_SI128(vdptr2++, xmm3);

            dptr1 = (INT16*)vdptr1;
            dptr2 = (INT16*)vdptr2;
        }
    }
    /* Use a single 128-bit SSE register. */
    count = len >> (5 - shifts);
    len -= count << (5 - shifts);
    while (count--)
    {
        const __m128i* vsptr1 = (const __m128i*)dptr1;
        const __m128i* vsptr2 = (const __m128i*)dptr2;
        __m128i* vdptr1 = (__m128i*)dptr1;
        __m128i* vdptr2 = (__m128i*)dptr2;

        __m128i xmm0 = LOAD_SI128(vsptr1);
        __m128i xmm1 = LOAD_SI128(vsptr2);

        xmm0 = _mm_adds_epi16(xmm0, xmm1);

        STORE_SI128(vdptr1++, xmm0);
        STORE_SI128(vdptr2++, xmm0);

        dptr1 = (INT16*)vdptr1;
        dptr2 = (INT16*)vdptr2;
    }
    /* Finish off the remainder. */
    if (len > 0)
        return generic->add_16s_inplace(dptr1, dptr2, WINPR_ASSERTING_INT_CAST(uint32_t, len));

    return PRIMITIVES_SUCCESS;
}
#endif

/* ------------------------------------------------------------------------- */
void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
    generic = primitives_get_generic();

    WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
    prims->add_16s = sse3_add_16s;
    prims->add_16s_inplace = sse3_add_16s_inplace;
#else
    WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
    WINPR_UNUSED(prims);
#endif
}
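
/*
 * Illustrative usage sketch: callers do not invoke sse3_add_16s_inplace by
 * name; they obtain the primitives dispatch table and call through it, e.g.
 *
 *   primitives_t* prims = primitives_get();
 *   INT16 a[64] = { 0 };
 *   INT16 b[64] = { 0 };
 *   // ... fill a and b ...
 *   prims->add_16s_inplace(a, b, 64); // both arrays now hold the saturated sums
 *
 * primitives_get() returns the optimized table, which points at the SSE
 * routines registered above when the build and CPU support them, and at the
 * generic C implementations otherwise.
 */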