Coverage Report

Created: 2023-09-25 06:56

/src/FreeRDP/libfreerdp/primitives/prim_sign_opt.c
Execution count for every line below: 0 (the file is entirely uncovered).
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#ifdef WITH_SSE2
#include <emmintrin.h>
#include <tmmintrin.h>
#endif /* WITH_SSE2 */

#include "prim_internal.h"

/* Generic (non-SIMD) implementations, used as the fallback path below. */
static primitives_t* generic = NULL;

#ifdef WITH_SSE2
/* ------------------------------------------------------------------------- */
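/* sign(x) via SSSE3: _mm_sign_epi16(a, b) negates each 16-bit lane of a
 * where the matching lane of b is negative, zeroes it where b is zero, and
 * leaves it unchanged where b is positive. With a preloaded to all 1s, the
 * result is exactly -1, 0, or 1 per input element.
 */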
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 len)
{
  const INT16* sptr = (const INT16*)pSrc;
  INT16* dptr = (INT16*)pDst;
  size_t count;

  /* Short runs are not worth the vector setup cost. */
  if (len < 16)
  {
    return generic->sign_16s(pSrc, pDst, len);
  }

  /* An odd destination address can never reach 16-byte alignment by
   * stepping through INT16 elements, so hand it to the generic routine. */
  if ((ULONG_PTR)pDst & 0x01)
  {
    return generic->sign_16s(pSrc, pDst, len);
  }

  /* Seek 16-byte alignment of the destination. */
  while ((ULONG_PTR)dptr & 0x0f)
  {
    INT16 src = *sptr++;
    *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0);

    if (--len == 0)
      return PRIMITIVES_SUCCESS;
  }

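  /* From this point dptr is 16-byte aligned, so every store below can be an
   * aligned _mm_store_si128; only the source may remain unaligned, which is
   * why the main loop splits into lddqu (unaligned) and load (aligned)
   * variants on sptr.
   */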
  /* Do 32-short chunks using 8 XMM registers. */
  count = len >> 5;  /* / 32 */
  len -= count << 5; /* * 32 */

  if ((ULONG_PTR)sptr & 0x0f)
  {
    /* Unaligned source */
    while (count--)
    {
      __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
      xmm0 = _mm_set1_epi16(0x0001U);
      xmm1 = _mm_set1_epi16(0x0001U);
      xmm2 = _mm_set1_epi16(0x0001U);
      xmm3 = _mm_set1_epi16(0x0001U);
      xmm4 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm5 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm6 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm7 = _mm_lddqu_si128((const __m128i*)sptr);
      sptr += 8;
      xmm0 = _mm_sign_epi16(xmm0, xmm4);
      xmm1 = _mm_sign_epi16(xmm1, xmm5);
      xmm2 = _mm_sign_epi16(xmm2, xmm6);
      xmm3 = _mm_sign_epi16(xmm3, xmm7);
      _mm_store_si128((__m128i*)dptr, xmm0);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm1);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm2);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm3);
      dptr += 8;
    }
  }
  else
  {
    /* Aligned source */
    while (count--)
    {
      __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
      xmm0 = _mm_set1_epi16(0x0001U);
      xmm1 = _mm_set1_epi16(0x0001U);
      xmm2 = _mm_set1_epi16(0x0001U);
      xmm3 = _mm_set1_epi16(0x0001U);
      xmm4 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm5 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm6 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm7 = _mm_load_si128((const __m128i*)sptr);
      sptr += 8;
      xmm0 = _mm_sign_epi16(xmm0, xmm4);
      xmm1 = _mm_sign_epi16(xmm1, xmm5);
      xmm2 = _mm_sign_epi16(xmm2, xmm6);
      xmm3 = _mm_sign_epi16(xmm3, xmm7);
      _mm_store_si128((__m128i*)dptr, xmm0);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm1);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm2);
      dptr += 8;
      _mm_store_si128((__m128i*)dptr, xmm3);
      dptr += 8;
    }
  }

  /* Do 8-short chunks using two XMM registers. */
  count = len >> 3;  /* / 8 */
  len -= count << 3; /* * 8 */

  while (count--)
  {
    __m128i xmm0 = _mm_set1_epi16(0x0001U);
    __m128i xmm1 = LOAD_SI128(sptr);
    sptr += 8;
    xmm0 = _mm_sign_epi16(xmm0, xmm1);
    _mm_store_si128((__m128i*)dptr, xmm0);
    dptr += 8;
  }

  /* Do leftovers. */
  while (len--)
  {
    INT16 src = *sptr++;
    *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0);
  }

  return PRIMITIVES_SUCCESS;
}
#endif /* WITH_SSE2 */

/* ------------------------------------------------------------------------- */
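/* Runtime dispatch: _mm_sign_epi16 is an SSSE3 instruction (PSIGNW), so the
 * optimized routine is only wired up when the CPU reports SSSE3 support.
 */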
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
{
  generic = primitives_get_generic();
  primitives_init_sign(prims);
  /* Pick tuned versions if possible. */
  /* I didn't spot an IPP version of this. */
#if defined(WITH_SSE2)

  if (IsProcessorFeaturePresentEx(PF_EX_SSSE3) &&
      IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
  {
    prims->sign_16s = ssse3_sign_16s;
  }

#endif
}
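
For reference, a minimal sketch of how this primitive could be driven, e.g. from a test harness (which would also lift the 0 counts above). primitives_get() is FreeRDP's public accessor for the active primitives table; the test_sign_16s name, buffer size, and input pattern are illustrative assumptions, not part of this file.

#include <freerdp/types.h>
#include <freerdp/primitives.h>

int test_sign_16s(void)
{
  INT16 src[64];
  INT16 dst[64];
  primitives_t* prims = primitives_get();

  /* Mix of negative, zero, and positive inputs. */
  for (int i = 0; i < 64; i++)
    src[i] = (INT16)(i - 32);

  if (prims->sign_16s(src, dst, 64) != PRIMITIVES_SUCCESS)
    return -1;

  /* Each output element must be the three-way sign of its input. */
  for (int i = 0; i < 64; i++)
  {
    INT16 expected = (src[i] < 0) ? -1 : ((src[i] > 0) ? 1 : 0);
    if (dst[i] != expected)
      return -1;
  }

  return 0;
}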