Coverage Report

Created: 2025-07-01 06:46

/src/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Shift operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 */
15
16
#include <freerdp/config.h>
17
18
#include <freerdp/types.h>
19
#include <freerdp/primitives.h>
20
#include <winpr/sysinfo.h>
21
22
#include "prim_shift.h"
23
24
#include "prim_internal.h"
25
#include "prim_templates.h"
26
27
#if defined(SSE_AVX_INTRINSICS_ENABLED)
28
#include <emmintrin.h>
29
#include <pmmintrin.h>
30
31
static primitives_t* generic = NULL;
32
33
/* ------------------------------------------------------------------------- */
34
/*
 * Out-of-place constant-shift routines for 16-bit elements, instantiated
 * through SSE3_SCD_ROUTINE.
 *
 * NOTE(review): the macro is defined outside this section; judging by the call
 * sites its arguments appear to be, in order: generated function name,
 * element type, generic fallback, SSE shift intrinsic, type the shift count is
 * cast to, and the scalar per-element statement - confirm against the macro.
 */

/* ------------------------------------------------------------------------- */
/* Signed left shift: _mm_slli_epi16, scalar form masks the result to 16 bits. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s,
                 _mm_slli_epi16, int16_t,
                 *dptr++ = (INT16)(((UINT16)*sptr++ << val) & 0xFFFF))

/* ------------------------------------------------------------------------- */
/* Signed right shift: _mm_srai_epi16 (arithmetic shift, sign is preserved). */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s,
                 _mm_srai_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)

/* ------------------------------------------------------------------------- */
/* Unsigned left shift: _mm_slli_epi16, scalar form masks the result to 16 bits. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u,
                 _mm_slli_epi16, int16_t,
                 *dptr++ = (((UINT16)*sptr++ << val) & 0xFFFF))

/* ------------------------------------------------------------------------- */
/* Unsigned right shift: _mm_srli_epi16 (logical shift, zeros are shifted in). */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u,
                 _mm_srli_epi16, int16_t,
                 *dptr++ = *sptr++ >> val)
45
46
/**
 * Left-shift every element of a signed 16-bit buffer in place by a constant.
 *
 * Small buffers, odd (non 2-byte aligned) pointers, the leading elements up to
 * the next 16-byte boundary and the trailing remainder are delegated to the
 * generic implementation; everything in between is shifted with
 * _mm_slli_epi16, first eight vectors at a time, then one vector at a time.
 *
 * @param pSrcDst buffer that is read and overwritten
 * @param val     shift count; 0 leaves the buffer untouched, values >= 16 are
 *                rejected
 * @param ulen    number of INT16 elements in pSrcDst
 *
 * @return PRIMITIVES_SUCCESS on success, -1 if val >= 16, otherwise the status
 *         reported by the generic routine that handled (part of) the buffer.
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 ulen)
{
  size_t len = ulen;
  const INT32 shifts = 2;
  if (val == 0)
    return PRIMITIVES_SUCCESS;
  if (val >= 16)
    return -1;
  if (len < 16) /* pointless if too small */
    return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);

  /* shifts == 2 yields a mask of 1, i.e. this tests for an odd address. */
  UINT32 offBeatMask = (1 << (shifts - 1)) - 1;
  if ((ULONG_PTR)pSrcDst & offBeatMask)
  {
    /* Incrementing the pointer skips over 16-byte boundary. */
    return generic->lShiftC_16s_inplace(pSrcDst, val, ulen);
  }

  /* Get to the 16-byte boundary now.
   * rem counts INT16 elements past the previous boundary (0..7), so the
   * distance to the next boundary must be expressed in elements as well:
   * (16 / sizeof(INT16)) - rem. Subtracting rem from the byte count 16 would
   * still end up aligned, but one full vector later than necessary.
   * add is at most 7 here and len is at least 16, so len cannot underflow. */
  const UINT32 perVector = (UINT32)(16 / sizeof(INT16));
  const UINT32 rem = ((UINT_PTR)pSrcDst & 0x0f) / sizeof(INT16);
  if (rem > 0)
  {
    const UINT32 add = perVector - rem;
    pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, add);
    if (status != PRIMITIVES_SUCCESS)
      return status;
    pSrcDst += add;
    len -= add;
  }

  /* Use 8 128-bit SSE registers: 8 vectors of 8 elements, 64 per iteration. */
  size_t count = len >> (8 - shifts);
  len -= count << (8 - shifts);

  while (count--)
  {
    const __m128i* src = (const __m128i*)pSrcDst;

    __m128i xmm0 = LOAD_SI128(src++);
    __m128i xmm1 = LOAD_SI128(src++);
    __m128i xmm2 = LOAD_SI128(src++);
    __m128i xmm3 = LOAD_SI128(src++);
    __m128i xmm4 = LOAD_SI128(src++);
    __m128i xmm5 = LOAD_SI128(src++);
    __m128i xmm6 = LOAD_SI128(src++);
    __m128i xmm7 = LOAD_SI128(src);

    xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);
    xmm1 = _mm_slli_epi16(xmm1, (int16_t)val);
    xmm2 = _mm_slli_epi16(xmm2, (int16_t)val);
    xmm3 = _mm_slli_epi16(xmm3, (int16_t)val);
    xmm4 = _mm_slli_epi16(xmm4, (int16_t)val);
    xmm5 = _mm_slli_epi16(xmm5, (int16_t)val);
    xmm6 = _mm_slli_epi16(xmm6, (int16_t)val);
    xmm7 = _mm_slli_epi16(xmm7, (int16_t)val);

    /* All loads are done before the first store, so writing back to the
     * same addresses is safe. */
    __m128i* dst = (__m128i*)pSrcDst;

    STORE_SI128(dst++, xmm0);
    STORE_SI128(dst++, xmm1);
    STORE_SI128(dst++, xmm2);
    STORE_SI128(dst++, xmm3);
    STORE_SI128(dst++, xmm4);
    STORE_SI128(dst++, xmm5);
    STORE_SI128(dst++, xmm6);
    STORE_SI128(dst++, xmm7);

    pSrcDst = (INT16*)dst;
  }

  /* Use a single 128-bit SSE register: 8 elements per iteration. */
  count = len >> (5 - shifts);
  len -= count << (5 - shifts);
  while (count--)
  {
    const __m128i* src = (const __m128i*)pSrcDst;
    __m128i xmm0 = LOAD_SI128(src);

    xmm0 = _mm_slli_epi16(xmm0, (int16_t)val);

    __m128i* dst = (__m128i*)pSrcDst;
    STORE_SI128(dst++, xmm0);
    pSrcDst = (INT16*)dst;
  }

  /* Finish off the remainder (fewer than 8 elements). */
  if (len > 0)
    return generic->lShiftC_16s_inplace(pSrcDst, val, WINPR_ASSERTING_INT_CAST(uint32_t, len));

  return PRIMITIVES_SUCCESS;
}
136
#endif
137
138
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
139
 * depending on the sign of val.  To avoid using the deprecated inplace
140
 * routines, a wrapper can use the src for the dest.
141
 */
142
143
/* ------------------------------------------------------------------------- */
144
/**
 * Register the SSE shift routines of this file in a primitives table.
 *
 * When SSE_AVX_INTRINSICS_ENABLED is defined, the generic primitives table is
 * stored in the file-level `generic` pointer (the SSE routines delegate to it)
 * and the five shift entries of `prims` are replaced. Otherwise the table is
 * left untouched and only a verbose log message is emitted.
 *
 * @param prims primitives table to update
 */
void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE_AVX_INTRINSICS_ENABLED)
  /* Must be set before any of the routines installed below can run. */
  generic = primitives_get_generic();

  WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");

  /* Out-of-place variants instantiated via SSE3_SCD_ROUTINE. */
  prims->lShiftC_16s = sse2_lShiftC_16s;
  prims->lShiftC_16u = sse2_lShiftC_16u;
  prims->rShiftC_16s = sse2_rShiftC_16s;
  prims->rShiftC_16u = sse2_rShiftC_16u;

  /* Hand-written in-place variant. */
  prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SIMD or SSE3 intrinsics not available");
  WINPR_UNUSED(prims);
#endif
}