Coverage Report

Created: 2024-09-08 06:20

/src/FreeRDP/libfreerdp/primitives/sse/prim_shift_sse3.c
Line
Count
Source (jump to first uncovered line)
1
/* FreeRDP: A Remote Desktop Protocol Client
2
 * Shift operations.
3
 * vi:ts=4 sw=4:
4
 *
5
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
6
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
7
 * not use this file except in compliance with the License. You may obtain
8
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
9
 * Unless required by applicable law or agreed to in writing, software
10
 * distributed under the License is distributed on an "AS IS" BASIS,
11
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
12
 * or implied. See the License for the specific language governing
13
 * permissions and limitations under the License.
14
 */
15
16
#include <freerdp/config.h>
17
18
#include <freerdp/types.h>
19
#include <freerdp/primitives.h>
20
#include <winpr/sysinfo.h>
21
22
#include "prim_shift.h"
23
24
#include "prim_internal.h"
25
#include "prim_templates.h"
26
27
#if defined(SSE2_ENABLED)
28
#include <emmintrin.h>
29
#include <pmmintrin.h>
30
31
/* Primitives table used as the fallback implementation by the routines in
 * this file. It is NULL until primitives_init_shift_sse3() assigns the result
 * of primitives_get_generic() to it. */
static primitives_t* generic = NULL;
32
33
/* ------------------------------------------------------------------------- */
34
/* Instantiates SSE3_SCD_ROUTINE (defined outside this file) to produce
 * sse2_lShiftC_16s, the routine installed as prims->lShiftC_16s below.
 * Arguments: element type INT16, generic->lShiftC_16s, the _mm_slli_epi16
 * intrinsic, and a per-element statement that shifts one sample left by val.
 * The sample is cast to UINT16 before shifting so the shift is applied to an
 * unsigned value, then the result is converted back to INT16.
 * NOTE(review): how the macro uses each argument (presumably fallback routine,
 * vector operation and scalar loop body) should be confirmed against its
 * definition. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic->lShiftC_16s, _mm_slli_epi16,
35
                 *dptr++ = (INT16)((UINT16)*sptr++ << val))
36
/* ------------------------------------------------------------------------- */
37
/* Instantiates SSE3_SCD_ROUTINE (defined outside this file) to produce
 * sse2_rShiftC_16s, the routine installed as prims->rShiftC_16s below.
 * Arguments: element type INT16, generic->rShiftC_16s, the _mm_srai_epi16
 * (arithmetic right shift) intrinsic, and a per-element statement that shifts
 * one sample right by val.
 * NOTE(review): the scalar statement right-shifts a possibly negative signed
 * value, which C leaves implementation-defined; it matches the intrinsic only
 * where the compiler shifts arithmetically - confirm for supported targets. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16s, INT16, generic->rShiftC_16s, _mm_srai_epi16,
38
                 *dptr++ = *sptr++ >> val)
39
/* ------------------------------------------------------------------------- */
40
/* Instantiates SSE3_SCD_ROUTINE (defined outside this file) to produce
 * sse2_lShiftC_16u, the routine installed as prims->lShiftC_16u below.
 * Arguments: element type UINT16, generic->lShiftC_16u, the _mm_slli_epi16
 * intrinsic, and a per-element statement that shifts one sample left by val.
 * The shifted value is truncated with a UINT16 cast, which is a well-defined
 * modulo-2^16 conversion and matches the element type of this instantiation.
 * NOTE(review): assumes the macro declares dptr/sptr with the element type
 * passed as second argument - confirm against the macro definition. */
SSE3_SCD_ROUTINE(sse2_lShiftC_16u, UINT16, generic->lShiftC_16u, _mm_slli_epi16,
                 *dptr++ = (UINT16)((UINT16)*sptr++ << val))
42
/* ------------------------------------------------------------------------- */
43
/* Instantiates SSE3_SCD_ROUTINE (defined outside this file) to produce
 * sse2_rShiftC_16u, the routine installed as prims->rShiftC_16u below.
 * Arguments: element type UINT16, generic->rShiftC_16u, the _mm_srli_epi16
 * (logical right shift) intrinsic, and a per-element statement that shifts
 * one sample right by val.
 * NOTE(review): how the macro uses each argument (presumably fallback routine,
 * vector operation and scalar loop body) should be confirmed against its
 * definition. */
SSE3_SCD_ROUTINE(sse2_rShiftC_16u, UINT16, generic->rShiftC_16u, _mm_srli_epi16,
44
                 *dptr++ = *sptr++ >> val)
45
46
/**
 * Shift every 16-bit sample of a buffer left by a constant, in place (SSE2).
 *
 * The buffer is processed in three stages: a scalar prologue (delegated to the
 * generic implementation) that advances to the next 16-byte boundary, aligned
 * SIMD loops (eight registers per iteration, then one register per iteration),
 * and a scalar epilogue (again delegated) for whatever is left.
 *
 * @param pSrcDst buffer of INT16 samples that is read and overwritten
 * @param val     number of bits to shift by; 0 leaves the buffer untouched
 * @param len     number of INT16 elements in pSrcDst
 *
 * @return PRIMITIVES_SUCCESS on success, -1 if val >= 16, otherwise the status
 *         returned by generic->lShiftC_16s_inplace for a delegated part.
 */
static pstatus_t sse2_lShiftC_16s_inplace(INT16* WINPR_RESTRICT pSrcDst, UINT32 val, UINT32 len)
{
  /* A 128-bit register holds 8 INT16 lanes; the unrolled loop uses 8 registers. */
  const UINT32 lanesPerReg = (UINT32)(16 / sizeof(INT16));
  const UINT32 lanesPerBatch = 8 * lanesPerReg;

  if (val == 0)
    return PRIMITIVES_SUCCESS;
  if (val >= 16)
    return -1;
  if (len < 16) /* too short for the SIMD setup to pay off */
    return generic->lShiftC_16s_inplace(pSrcDst, val, len);

  /* A pointer that is not even INT16-aligned can never be stepped onto a
   * 16-byte boundary in whole elements, so leave it to the generic code. */
  if ((ULONG_PTR)pSrcDst & (sizeof(INT16) - 1))
    return generic->lShiftC_16s_inplace(pSrcDst, val, len);

  /* Scalar prologue: handle the elements in front of the next 16-byte
   * boundary. Both operands are element counts, so 'head' is at most 7 and
   * therefore always smaller than len (which is >= 16 here). */
  const UINT32 misaligned = (UINT32)(((ULONG_PTR)pSrcDst & 0x0f) / sizeof(INT16));
  if (misaligned > 0)
  {
    const UINT32 head = lanesPerReg - misaligned;
    const pstatus_t status = generic->lShiftC_16s_inplace(pSrcDst, val, head);
    if (status != PRIMITIVES_SUCCESS)
      return status;
    pSrcDst += head;
    len -= head;
  }

  /* val is < 16 here, so the conversion to the intrinsic's int count is exact. */
  const int shift = (int)val;

  /* Unrolled loop: eight aligned 128-bit registers (64 samples) per pass.
   * Unsigned counters avoid signed-overflow in the bookkeeping for huge len. */
  const UINT32 batches = len / lanesPerBatch;
  len -= batches * lanesPerBatch;

  for (UINT32 i = 0; i < batches; i++)
  {
    __m128i* block = (__m128i*)pSrcDst;

    __m128i xmm0 = _mm_load_si128(&block[0]);
    __m128i xmm1 = _mm_load_si128(&block[1]);
    __m128i xmm2 = _mm_load_si128(&block[2]);
    __m128i xmm3 = _mm_load_si128(&block[3]);
    __m128i xmm4 = _mm_load_si128(&block[4]);
    __m128i xmm5 = _mm_load_si128(&block[5]);
    __m128i xmm6 = _mm_load_si128(&block[6]);
    __m128i xmm7 = _mm_load_si128(&block[7]);

    xmm0 = _mm_slli_epi16(xmm0, shift);
    xmm1 = _mm_slli_epi16(xmm1, shift);
    xmm2 = _mm_slli_epi16(xmm2, shift);
    xmm3 = _mm_slli_epi16(xmm3, shift);
    xmm4 = _mm_slli_epi16(xmm4, shift);
    xmm5 = _mm_slli_epi16(xmm5, shift);
    xmm6 = _mm_slli_epi16(xmm6, shift);
    xmm7 = _mm_slli_epi16(xmm7, shift);

    _mm_store_si128(&block[0], xmm0);
    _mm_store_si128(&block[1], xmm1);
    _mm_store_si128(&block[2], xmm2);
    _mm_store_si128(&block[3], xmm3);
    _mm_store_si128(&block[4], xmm4);
    _mm_store_si128(&block[5], xmm5);
    _mm_store_si128(&block[6], xmm6);
    _mm_store_si128(&block[7], xmm7);

    pSrcDst += lanesPerBatch;
  }

  /* One aligned 128-bit register (8 samples) per pass. The pointer is still
   * 16-byte aligned because the loop above advances in multiples of 128 bytes. */
  const UINT32 singles = len / lanesPerReg;
  len -= singles * lanesPerReg;

  for (UINT32 i = 0; i < singles; i++)
  {
    __m128i* reg = (__m128i*)pSrcDst;
    _mm_store_si128(reg, _mm_slli_epi16(_mm_load_si128(reg), shift));
    pSrcDst += lanesPerReg;
  }

  /* Scalar epilogue: fewer than 8 samples remain. */
  if (len > 0)
    return generic->lShiftC_16s_inplace(pSrcDst, val, len);

  return PRIMITIVES_SUCCESS;
}
135
#endif
136
137
/* Note: the IPP version will have to call ippLShiftC_16s or ippRShiftC_16s
138
 * depending on the sign of val.  To avoid using the deprecated inplace
139
 * routines, a wrapper can use the src for the dest.
140
 */
141
142
/* ------------------------------------------------------------------------- */
143
/**
 * Install the SSE2/SSE3 shift routines of this file into a primitives table.
 *
 * When SSE2_ENABLED is defined, the file-level generic table is fetched with
 * primitives_get_generic(), primitives_init_shift() is run on prims, and - only
 * if the processor reports both SSE2 and SSE3 - the five shift entry points of
 * prims are pointed at the routines defined above. Without SSE2_ENABLED the
 * function only logs and leaves prims untouched.
 *
 * @param prims primitives table to update
 */
void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
{
#if defined(SSE2_ENABLED)
  generic = primitives_get_generic();
  primitives_init_shift(prims);

  /* Missing either instruction set: leave prims as primitives_init_shift() left it. */
  if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
      !IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
  {
    return;
  }

  WLog_VRB(PRIM_TAG, "SSE2/SSE3 optimizations");

  /* Out-of-place variants. */
  prims->lShiftC_16s = sse2_lShiftC_16s;
  prims->lShiftC_16u = sse2_lShiftC_16u;
  prims->rShiftC_16s = sse2_rShiftC_16s;
  prims->rShiftC_16u = sse2_rShiftC_16u;

  /* In-place variant. */
  prims->lShiftC_16s_inplace = sse2_lShiftC_16s_inplace;
#else
  WLog_VRB(PRIM_TAG, "undefined WITH_SSE2");
  WINPR_UNUSED(prims);
#endif
}