Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
All execution counts reported for this file are 0: primitives_init_sign_ssse3() was never called during this run. The uncovered lines are marked inline below with /* count: 0 */.
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_sign.h"

#include "prim_internal.h"

#if defined(SSE2_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

/* Scalar fallback table, used for buffers that are too short or too badly
 * aligned for the SIMD path. */
static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 len)
{
  const INT16* sptr = pSrc;
  INT16* dptr = pDst;
  size_t count = 0;

  if (len < 16)
  {
    return generic->sign_16s(pSrc, pDst, len);
  }

  /* If the destination is not even 2-byte aligned, the alignment loop below
   * could never reach a 16-byte boundary, so fall back to the generic
   * version. */
  if ((ULONG_PTR)pDst & 0x01)
  {
    return generic->sign_16s(pSrc, pDst, len);
  }

  /* Advance one short at a time until the destination is 16-byte aligned. */
  while ((ULONG_PTR)dptr & 0x0f)
  {
    INT16 src = *sptr++;
    *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);

    if (--len == 0)
      return PRIMITIVES_SUCCESS;
  }

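  /* The remainder is processed in three passes of decreasing width: 32-short
   * SIMD chunks, 8-short SIMD chunks, then a scalar loop for the leftovers. */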
  /* Do 32-short chunks using 8 XMM registers. */
  count = len >> 5;  /* len / 32 */
  len -= count << 5; /* len % 32 remains */

  if ((ULONG_PTR)sptr & 0x0f)
  {
    /* Unaligned source */
    while (count--)
    {
      __m128i xmm0;
      __m128i xmm1;
      __m128i xmm2;
      __m128i xmm3;
      __m128i xmm4;
      __m128i xmm5;
      __m128i xmm6;
      __m128i xmm7;
      xmm0 = _mm_set1_epi16(0x0001U);
      xmm1 = _mm_set1_epi16(0x0001U);
      xmm2 = _mm_set1_epi16(0x0001U);
      xmm3 = _mm_set1_epi16(0x0001U);
      xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      /* 1 * sign(src): every lane of xmm0..xmm3 becomes -1, 0 or 1. */
      xmm0 = _mm_sign_epi16(xmm0, xmm4);
      xmm1 = _mm_sign_epi16(xmm1, xmm5);
      xmm2 = _mm_sign_epi16(xmm2, xmm6);
      xmm3 = _mm_sign_epi16(xmm3, xmm7);
      _mm_store_si128((__m128i*)dptr, xmm0);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm1);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm2);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm3);
      dptr += 8;
    }
  }
  else
  {
    /* Aligned source */
    while (count--)
    {
      __m128i xmm0;
      __m128i xmm1;
      __m128i xmm2;
      __m128i xmm3;
      __m128i xmm4;
      __m128i xmm5;
      __m128i xmm6;
      __m128i xmm7;
      xmm0 = _mm_set1_epi16(0x0001U);
      xmm1 = _mm_set1_epi16(0x0001U);
      xmm2 = _mm_set1_epi16(0x0001U);
      xmm3 = _mm_set1_epi16(0x0001U);
      xmm4 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm5 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm6 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm7 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm0 = _mm_sign_epi16(xmm0, xmm4);
      xmm1 = _mm_sign_epi16(xmm1, xmm5);
      xmm2 = _mm_sign_epi16(xmm2, xmm6);
      xmm3 = _mm_sign_epi16(xmm3, xmm7);
      _mm_store_si128((__m128i*)dptr, xmm0);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm1);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm2);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm3);
      dptr += 8;
    }
  }

  /* Do 8-short chunks using two XMM registers. */
  count = len >> 3;
  len -= count << 3;

  while (count--)
  {
    __m128i xmm0 = _mm_set1_epi16(0x0001U);
    __m128i xmm1 = LOAD_SI128(sptr);
    sptr += 8;
    xmm0 = _mm_sign_epi16(xmm0, xmm1);
    _mm_store_si128((__m128i*)dptr, xmm0);
    dptr += 8;
  }

  /* Do leftovers. */
  while (len--)
  {
    INT16 src = *sptr++;
    *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
  }

  return PRIMITIVES_SUCCESS;
}

#endif /* SSE2_ENABLED */
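
The heavy lifting above is a single SSSE3 instruction: _mm_sign_epi16(a, b) negates each 16-bit lane of a where the matching lane of b is negative, zeroes it where b is zero, and leaves it unchanged where b is positive. With a constant vector of ones as the first operand, the result is exactly sign(b). A minimal standalone sketch of the trick (illustrative only, not part of the covered file; build with, e.g., gcc -mssse3):

#include <stdio.h>
#include <tmmintrin.h>

int main(void)
{
  short in[8] = { -7, 0, 3, -1, 32767, -32768, 0, 5 };
  short out[8] = { 0 };

  const __m128i ones = _mm_set1_epi16(1);
  const __m128i src = _mm_loadu_si128((const __m128i*)in);
  /* Each lane becomes -1, 0 or +1 according to the sign of the input. */
  const __m128i sign = _mm_sign_epi16(ones, src);
  _mm_storeu_si128((__m128i*)out, sign);

  for (int i = 0; i < 8; i++)
    printf("sign(%6d) = %d\n", in[i], out[i]);

  return 0;
}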

/* ------------------------------------------------------------------------- */
void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
{ /* count: 0 */
#if defined(SSE2_ENABLED)
  generic = primitives_get_generic();
  primitives_init_sign(prims);
  /* Pick tuned versions if possible. */
  /* I didn't spot an IPP version of this. */

  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
      IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
  {
    WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
    prims->sign_16s = ssse3_sign_16s;
  }

#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2"); /* count: 0 */
  WINPR_UNUSED(prims); /* count: 0 */
#endif /* count: 0 */
} /* count: 0 */
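
Callers do not invoke ssse3_sign_16s() directly; they go through the primitives function table, which the init routine above populates based on runtime CPU detection. A hedged usage sketch, assuming a FreeRDP build environment where primitives_get() returns the auto-selected table (sign_16s resolves to the SSSE3 kernel only when the CPU reports SSSE3):

#include <freerdp/primitives.h>

static void sign_16s_example(void)
{
  INT16 src[64];
  INT16 dst[64];

  for (int i = 0; i < 64; i++)
    src[i] = (INT16)(i - 32);

  primitives_t* prims = primitives_get();

  /* Writes -1, 0 or +1 into dst for each element of src; short or badly
   * aligned buffers are routed to the generic C version internally. */
  if (prims->sign_16s(src, dst, 64) != PRIMITIVES_SUCCESS)
  {
    /* handle the error */
  }
}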