/src/FreeRDP/libfreerdp/primitives/sse/prim_add_sse3.c
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized add operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 *
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_add.h"

#include "prim_internal.h"
#include "prim_templates.h"

#if defined(SSE2_ENABLED)
#include <emmintrin.h>
#include <pmmintrin.h>

static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic->add_16s, _mm_adds_epi16,
                 generic->add_16s(sptr1++, sptr2++, dptr++, 1))
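
/* SSE3_SSD_ROUTINE is a template macro from prim_templates.h; as used here
 * it is assumed to expand into the full sse3_add_16s routine: a two-source,
 * one-destination saturated add built on _mm_adds_epi16, with
 * generic->add_16s handling misaligned heads and leftover tails. */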
37 | | |
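/* In-place variant: on return both buffers hold the saturated sum,
 * pSrcDst1[i] = pSrcDst2[i] = sat(pSrcDst1[i] + pSrcDst2[i]) for 0 <= i < len,
 * assuming the generic fallback implements the same semantics. */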
static pstatus_t sse3_add_16s_inplace(INT16* WINPR_RESTRICT pSrcDst1,
                                      INT16* WINPR_RESTRICT pSrcDst2, UINT32 len)
{
	const int shifts = 2;
	INT16* dptr1 = pSrcDst1;
	INT16* dptr2 = pSrcDst2;

	if (len < 16) /* pointless if too small */
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, len);

	const UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
	if ((ULONG_PTR)pSrcDst1 & offBeatMask)
	{
		/* Not even INT16-aligned: stepping element by element can never
		 * land on a 16-byte boundary, so use the scalar fallback. */
		return generic->add_16s_inplace(pSrcDst1, pSrcDst2, len);
	}
	/* Get to the 16-byte boundary now. */
	const size_t rem = ((UINT_PTR)dptr1 & 0xf) / sizeof(INT16);
	if (rem != 0)
	{
		const size_t add = 16 - rem;
		pstatus_t status = generic->add_16s_inplace(dptr1, dptr2, add);
		if (status != PRIMITIVES_SUCCESS)
			return status;
		dptr1 += add;
		dptr2 += add;
		len -= (UINT32)add; /* the scalar prologue consumed these elements */
	}
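	/* With shifts == 2, each pass of the main loop below consumes
	 * 1 << (7 - shifts) == 32 INT16 values: four 128-bit registers of
	 * eight lanes each. */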
	/* Use 4 128-bit SSE registers. */
	size_t count = len >> (7 - shifts);
	len -= count << (7 - shifts);
	if (((const ULONG_PTR)dptr1 & 0x0f) || ((const ULONG_PTR)dptr2 & 0x0f))
	{
		/* Unaligned loads */
		while (count--)
		{
			const __m128i* vsptr1 = (const __m128i*)dptr1;
			const __m128i* vsptr2 = (const __m128i*)dptr2;
			__m128i* vdptr1 = (__m128i*)dptr1;
			__m128i* vdptr2 = (__m128i*)dptr2;

			__m128i xmm0 = _mm_lddqu_si128(vsptr1++);
			__m128i xmm1 = _mm_lddqu_si128(vsptr1++);
			__m128i xmm2 = _mm_lddqu_si128(vsptr1++);
			__m128i xmm3 = _mm_lddqu_si128(vsptr1++);
			__m128i xmm4 = _mm_lddqu_si128(vsptr2++);
			__m128i xmm5 = _mm_lddqu_si128(vsptr2++);
			__m128i xmm6 = _mm_lddqu_si128(vsptr2++);
			__m128i xmm7 = _mm_lddqu_si128(vsptr2++);

			xmm0 = _mm_adds_epi16(xmm0, xmm4);
			xmm1 = _mm_adds_epi16(xmm1, xmm5);
			xmm2 = _mm_adds_epi16(xmm2, xmm6);
			xmm3 = _mm_adds_epi16(xmm3, xmm7);

			/* dptr1 was aligned by the prologue; dptr2 may not be, so it
			 * needs unaligned stores. */
			_mm_store_si128(vdptr1++, xmm0);
			_mm_store_si128(vdptr1++, xmm1);
			_mm_store_si128(vdptr1++, xmm2);
			_mm_store_si128(vdptr1++, xmm3);

			_mm_storeu_si128(vdptr2++, xmm0);
			_mm_storeu_si128(vdptr2++, xmm1);
			_mm_storeu_si128(vdptr2++, xmm2);
			_mm_storeu_si128(vdptr2++, xmm3);

			dptr1 = (INT16*)vdptr1;
			dptr2 = (INT16*)vdptr2;
		}
	}
	else
	{
		/* Aligned loads */
		while (count--)
		{
			const __m128i* vsptr1 = (const __m128i*)dptr1;
			const __m128i* vsptr2 = (const __m128i*)dptr2;
			__m128i* vdptr1 = (__m128i*)dptr1;
			__m128i* vdptr2 = (__m128i*)dptr2;

			__m128i xmm0 = _mm_load_si128(vsptr1++);
			__m128i xmm1 = _mm_load_si128(vsptr1++);
			__m128i xmm2 = _mm_load_si128(vsptr1++);
			__m128i xmm3 = _mm_load_si128(vsptr1++);
			__m128i xmm4 = _mm_load_si128(vsptr2++);
			__m128i xmm5 = _mm_load_si128(vsptr2++);
			__m128i xmm6 = _mm_load_si128(vsptr2++);
			__m128i xmm7 = _mm_load_si128(vsptr2++);

			xmm0 = _mm_adds_epi16(xmm0, xmm4);
			xmm1 = _mm_adds_epi16(xmm1, xmm5);
			xmm2 = _mm_adds_epi16(xmm2, xmm6);
			xmm3 = _mm_adds_epi16(xmm3, xmm7);

			_mm_store_si128(vdptr1++, xmm0);
			_mm_store_si128(vdptr1++, xmm1);
			_mm_store_si128(vdptr1++, xmm2);
			_mm_store_si128(vdptr1++, xmm3);

			_mm_store_si128(vdptr2++, xmm0);
			_mm_store_si128(vdptr2++, xmm1);
			_mm_store_si128(vdptr2++, xmm2);
			_mm_store_si128(vdptr2++, xmm3);

			dptr1 = (INT16*)vdptr1;
			dptr2 = (INT16*)vdptr2;
		}
	}
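	/* Tail: each pass below handles 1 << (5 - shifts) == 8 INT16 values in
	 * a single register. */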
	/* Use a single 128-bit SSE register. */
	count = len >> (5 - shifts);
	len -= count << (5 - shifts);
	while (count--)
	{
		const __m128i* vsptr1 = (const __m128i*)dptr1;
		const __m128i* vsptr2 = (const __m128i*)dptr2;
		__m128i* vdptr1 = (__m128i*)dptr1;
		__m128i* vdptr2 = (__m128i*)dptr2;

		__m128i xmm0 = LOAD_SI128(vsptr1);
		__m128i xmm1 = LOAD_SI128(vsptr2);

		xmm0 = _mm_adds_epi16(xmm0, xmm1);

		_mm_store_si128(vdptr1++, xmm0);
		/* dptr2 is still not guaranteed to be 16-byte aligned. */
		_mm_storeu_si128(vdptr2++, xmm0);

		dptr1 = (INT16*)vdptr1;
		dptr2 = (INT16*)vdptr2;
	}
	/* Finish off the remainder. */
	if (len > 0)
		return generic->add_16s_inplace(dptr1, dptr2, len);

	return PRIMITIVES_SUCCESS;
}
#endif

/* ------------------------------------------------------------------------- */
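/* Runtime initializer, intended to be invoked by the primitives framework.
 * When the CPU reports both SSE2 and SSE3 (the latter needed for LDDQU),
 * the generic add routines registered by primitives_init_add() are replaced
 * with the SSE implementations above; otherwise they remain in effect. */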
void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
	generic = primitives_get_generic();
	primitives_init_add(prims);

	if (IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) &&
	    IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
	{
		WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");
		prims->add_16s = sse3_add_16s;
		prims->add_16s_inplace = sse3_add_16s_inplace;
	}

#else
	WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
	WINPR_UNUSED(prims);
#endif
}