Coverage Report

Created: 2025-07-01 06:46

/src/FreeRDP/libfreerdp/primitives/sse/prim_sign_ssse3.c
Every instrumented line below reports an execution count of 0; the entire file is uncovered.
/* FreeRDP: A Remote Desktop Protocol Client
 * Optimized sign operations.
 * vi:ts=4 sw=4:
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

#include <freerdp/config.h>

#include <freerdp/types.h>
#include <freerdp/primitives.h>
#include <winpr/sysinfo.h>

#include "prim_sign.h"

#include "prim_internal.h"
#include "prim_avxsse.h"

#if defined(SSE_AVX_INTRINSICS_ENABLED)
#include <emmintrin.h>
#include <tmmintrin.h>

static primitives_t* generic = NULL;

/* ------------------------------------------------------------------------- */
static pstatus_t ssse3_sign_16s(const INT16* WINPR_RESTRICT pSrc, INT16* WINPR_RESTRICT pDst,
                                UINT32 ulen)
{
  size_t len = ulen;
  const INT16* sptr = pSrc;
  INT16* dptr = pDst;
  size_t count = 0;

  if (len < 16)
  {
    return generic->sign_16s(pSrc, pDst, ulen);
  }

  /* Check for 16-byte alignment (eventually). */
  if ((ULONG_PTR)pDst & 0x01)
  {
    return generic->sign_16s(pSrc, pDst, ulen);
  }

  /* Seek 16-byte alignment. */
  while ((ULONG_PTR)dptr & 0x0f)
  {
    INT16 src = *sptr++;
    *dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? (-1) : ((src > 0) ? 1 : 0));

    if (--len == 0)
      return PRIMITIVES_SUCCESS;
  }

  /* Do 32-short chunks using 8 XMM registers. */
  count = len >> 5;  /* / 32 */
  len -= count << 5; /* * 32 */

  if ((ULONG_PTR)sptr & 0x0f)
  {
    /* Unaligned */
    while (count--)
    {
      __m128i xmm0;
      __m128i xmm1;
      __m128i xmm2;
      __m128i xmm3;
      __m128i xmm4;
      __m128i xmm5;
      __m128i xmm6;
      __m128i xmm7;
      xmm0 = _mm_set1_epi16(0x0001U);
      xmm1 = _mm_set1_epi16(0x0001U);
      xmm2 = _mm_set1_epi16(0x0001U);
      xmm3 = _mm_set1_epi16(0x0001U);
      xmm4 = LOAD_SI128(sptr);
      sptr += 8;
      xmm5 = LOAD_SI128(sptr);
      sptr += 8;
      xmm6 = LOAD_SI128(sptr);
      sptr += 8;
      xmm7 = LOAD_SI128(sptr);
      sptr += 8;
      xmm0 = _mm_sign_epi16(xmm0, xmm4);
      xmm1 = _mm_sign_epi16(xmm1, xmm5);
      xmm2 = _mm_sign_epi16(xmm2, xmm6);
      xmm3 = _mm_sign_epi16(xmm3, xmm7);
      STORE_SI128(dptr, xmm0);
      dptr += 8;
      STORE_SI128(dptr, xmm1);
      dptr += 8;
      STORE_SI128(dptr, xmm2);
      dptr += 8;
      STORE_SI128(dptr, xmm3);
      dptr += 8;
    }
  }
  else
  {
    /* Aligned */
    while (count--)
    {
      __m128i xmm0;
      __m128i xmm1;
      __m128i xmm2;
      __m128i xmm3;
      __m128i xmm4;
      __m128i xmm5;
      __m128i xmm6;
      __m128i xmm7;
      xmm0 = _mm_set1_epi16(0x0001U);
      xmm1 = _mm_set1_epi16(0x0001U);
      xmm2 = _mm_set1_epi16(0x0001U);
      xmm3 = _mm_set1_epi16(0x0001U);
      xmm4 = LOAD_SI128(sptr);
      sptr += 8;
      xmm5 = LOAD_SI128(sptr);
      sptr += 8;
      xmm6 = LOAD_SI128(sptr);
      sptr += 8;
      xmm7 = LOAD_SI128(sptr);
      sptr += 8;
      xmm0 = _mm_sign_epi16(xmm0, xmm4);
      xmm1 = _mm_sign_epi16(xmm1, xmm5);
      xmm2 = _mm_sign_epi16(xmm2, xmm6);
      xmm3 = _mm_sign_epi16(xmm3, xmm7);
      STORE_SI128(dptr, xmm0);
      dptr += 8;
      STORE_SI128(dptr, xmm1);
      dptr += 8;
      STORE_SI128(dptr, xmm2);
      dptr += 8;
      STORE_SI128(dptr, xmm3);
      dptr += 8;
    }
  }

  /* Do 8-short chunks using two XMM registers. */
  count = len >> 3;
  len -= count << 3;

  while (count--)
  {
    __m128i xmm0 = _mm_set1_epi16(0x0001U);
    __m128i xmm1 = LOAD_SI128(sptr);
    sptr += 8;
    xmm0 = _mm_sign_epi16(xmm0, xmm1);
    STORE_SI128(dptr, xmm0);
    dptr += 8;
  }

  /* Do leftovers. */
  while (len--)
  {
    INT16 src = *sptr++;
    *dptr++ = WINPR_ASSERTING_INT_CAST(int16_t, (src < 0) ? -1 : ((src > 0) ? 1 : 0));
  }

  return PRIMITIVES_SUCCESS;
}

#endif /* SSE_AVX_INTRINSICS_ENABLED */

/* ------------------------------------------------------------------------- */
void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
  generic = primitives_get_generic();

  /* Pick tuned versions if possible. */
  /* I didn't spot an IPP version of this. */

  WLog_VRB(PRIM_TAG, "SSE3/SSSE3 optimizations");
  prims->sign_16s = ssse3_sign_16s;

#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSSE3/SSE3 intrinsics not available");
  WINPR_UNUSED(prims);
#endif
}
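
The uncovered hot loop above maps signum onto a single SSSE3 instruction: _mm_sign_epi16(a, b) negates each 16-bit lane of a where the corresponding lane of b is negative, zeroes it where b is zero, and passes it through unchanged where b is positive, so pairing a vector of ones with the input yields -1/0/1 per lane. A minimal standalone sketch of that identity (not part of the file above; assumes an SSSE3-capable toolchain, e.g. gcc -mssse3):

    #include <stdio.h>
    #include <tmmintrin.h> /* SSSE3: _mm_sign_epi16 */

    int main(void)
    {
        const short in[8] = { -32768, -2, -1, 0, 1, 2, 32767, 0 };
        short out[8];

        const __m128i ones = _mm_set1_epi16(1);
        const __m128i x = _mm_loadu_si128((const __m128i*)in);

        /* Per lane: -1 if in[i] < 0, 0 if in[i] == 0, 1 if in[i] > 0. */
        _mm_storeu_si128((__m128i*)out, _mm_sign_epi16(ones, x));

        for (int i = 0; i < 8; i++)
            printf("sign(%6d) = %2d\n", in[i], out[i]);
        return 0;
    }

Note that callers do not invoke ssse3_sign_16s directly: primitives_init_sign_ssse3_int() installs it into the primitives_t dispatch table, and it is reached through the function pointer, e.g. primitives_get()->sign_16s(src, dst, len). That is also why the generic fallback (generic->sign_16s) must be resolved via primitives_get_generic() before the tuned kernel can delegate short or misaligned buffers to it.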