/src/FreeRDP/libfreerdp/primitives/prim_templates.h
/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License. Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#ifdef __GNUC__
#pragma once
#endif

#ifndef FREERDP_LIB_PRIM_TEMPLATES_H
#define FREERDP_LIB_PRIM_TEMPLATES_H

/* These are templates for SSE (potentially NEON) routines that apply a
 * simple SIMD operation over an array of data. Since so much of this
 * code is shared except for the operation itself, these templates are
 * used rather than duplicating code. The naming convention encodes the
 * parameter list: S=Source param; C=Constant; D=Destination.
 * Each macro takes a fallback procedure for data that is too small to
 * vectorize and a "slow way" operation for use at the 16-byte edges.
 */
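
/* For example, an "SCD" routine generated below has the shape
 *
 *     pstatus_t name(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len)
 *
 * i.e. one Source array, one Constant operand, and one Destination array,
 * while an "SSD" routine takes two Source arrays and one Destination array.
 */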

/* SSE3 note: if someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than using LDDQU for unaligned reads.
 */

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It can't easily do that if the value is stored in a variable,
 * so don't save it as an intermediate value.
 */
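
/* For example, with _type_ = INT16 the expression (16 / sizeof(_type_))
 * folds to the constant 8, so each 128-bit load or store below advances
 * its pointer by eight 16-bit elements.
 */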

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 len) \
    { \
        INT32 shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        int count; \
        if (val == 0) \
            return PRIMITIVES_SUCCESS; \
        if (val >= 16) \
            return -1; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* shifts = log2(sizeof(_type_)) + 1, so offBeatMask = sizeof(_type_) - 1. */ \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 8 128-bit SSE registers. */ \
        count = len >> (8 - shifts); \
        len -= count << (8 - shifts); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm5 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm6 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm7 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, val); \
                xmm1 = _op_(xmm1, val); \
                xmm2 = _op_(xmm2, val); \
                xmm3 = _op_(xmm3, val); \
                xmm4 = _op_(xmm4, val); \
                xmm5 = _op_(xmm5, val); \
                xmm6 = _op_(xmm6, val); \
                xmm7 = _op_(xmm7, val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm5 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm6 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm7 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, val); \
                xmm1 = _op_(xmm1, val); \
                xmm2 = _op_(xmm2, val); \
                xmm3 = _op_(xmm3, val); \
                xmm4 = _op_(xmm4, val); \
                xmm5 = _op_(xmm5, val); \
                xmm6 = _op_(xmm6, val); \
                xmm7 = _op_(xmm7, val); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm5); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm6); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm7); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, val); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
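
/* Illustrative use (a sketch, not part of this header): prim_shift_opt.c
 * instantiates this template for the 16-bit shift primitives, roughly as
 * below; the fallback name here is hypothetical (the real one comes from
 * the generic primitives table):
 *
 *     SSE3_SCD_ROUTINE(sse2_lShiftC_16s, INT16, generic_lShiftC_16s,
 *                      _mm_slli_epi16, *dptr++ = (INT16)(*sptr++ << val))
 *
 * This generates sse2_lShiftC_16s(), which shifts each INT16 left by val:
 * _mm_slli_epi16 handles the aligned 128-bit chunks, and the scalar
 * expression handles the unaligned edges and the remainder.
 */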

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload xmm0 with the constant.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr = pSrc; \
        _type_* dptr = pDst; \
        size_t count; \
        __m128i xmm0; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc, val, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            _slowWay_; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        xmm0 = _mm_set1_epi32(val); \
        if ((const ULONG_PTR)sptr & 0x0f) \
        { \
            while (count--) \
            { \
                __m128i xmm1, xmm2, xmm3, xmm4; \
                xmm1 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_lddqu_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            while (count--) \
            { \
                __m128i xmm1, xmm2, xmm3, xmm4; \
                xmm1 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm2 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm3 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm4 = _mm_load_si128((const __m128i*)sptr); \
                sptr += (16 / sizeof(_type_)); \
                xmm1 = _op_(xmm1, xmm0); \
                xmm2 = _op_(xmm2, xmm0); \
                xmm3 = _op_(xmm3, xmm0); \
                xmm4 = _op_(xmm4, xmm0); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm4); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm1 = LOAD_SI128(sptr); \
            sptr += (16 / sizeof(_type_)); \
            xmm1 = _op_(xmm1, xmm0); \
            _mm_store_si128((__m128i*)dptr, xmm1); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
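
/* Illustrative use (a sketch): prim_andor_opt.c instantiates this template
 * for the 32-bit bitwise primitives, roughly as below; the fallback name is
 * hypothetical:
 *
 *     SSE3_SCD_PRE_ROUTINE(sse3_andC_32u, UINT32, generic_andC_32u,
 *                          _mm_and_si128, *dptr++ = *sptr++ & val)
 *
 * The constant is broadcast once into xmm0 by _mm_set1_epi32 and reused for
 * every 128-bit chunk, rather than passed to _op_ on each call as in
 * SSE3_SCD_ROUTINE above.
 */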

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
    static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2, _type_* pDst, UINT32 len) \
    { \
        int shifts = 0; \
        UINT32 offBeatMask; \
        const _type_* sptr1 = pSrc1; \
        const _type_* sptr2 = pSrc2; \
        _type_* dptr = pDst; \
        size_t count; \
        if (len < 16) /* pointless if too small */ \
        { \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        if (sizeof(_type_) == 1) \
            shifts = 1; \
        else if (sizeof(_type_) == 2) \
            shifts = 2; \
        else if (sizeof(_type_) == 4) \
            shifts = 3; \
        else if (sizeof(_type_) == 8) \
            shifts = 4; \
        offBeatMask = (1 << (shifts - 1)) - 1; \
        if ((ULONG_PTR)pDst & offBeatMask) \
        { \
            /* Incrementing the pointer skips over 16-byte boundary. */ \
            return _fallback_(pSrc1, pSrc2, pDst, len); \
        } \
        /* Get to the 16-byte boundary now. */ \
        while ((ULONG_PTR)dptr & 0x0f) \
        { \
            pstatus_t status; \
            status = _slowWay_; \
            if (status != PRIMITIVES_SUCCESS) \
                return status; \
            if (--len == 0) \
                return PRIMITIVES_SUCCESS; \
        } \
        /* Use 4 128-bit SSE registers. */ \
        count = len >> (7 - shifts); \
        len -= count << (7 - shifts); \
        if (((const ULONG_PTR)sptr1 & 0x0f) || ((const ULONG_PTR)sptr2 & 0x0f)) \
        { \
            /* Unaligned loads */ \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm1 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm2 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm3 = _mm_lddqu_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm4 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm5 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm6 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm7 = _mm_lddqu_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        else \
        { \
            /* Aligned loads */ \
            while (count--) \
            { \
                __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; \
                xmm0 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm1 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm2 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm3 = _mm_load_si128((const __m128i*)sptr1); \
                sptr1 += (16 / sizeof(_type_)); \
                xmm4 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm5 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm6 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm7 = _mm_load_si128((const __m128i*)sptr2); \
                sptr2 += (16 / sizeof(_type_)); \
                xmm0 = _op_(xmm0, xmm4); \
                xmm1 = _op_(xmm1, xmm5); \
                xmm2 = _op_(xmm2, xmm6); \
                xmm3 = _op_(xmm3, xmm7); \
                _mm_store_si128((__m128i*)dptr, xmm0); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm1); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm2); \
                dptr += (16 / sizeof(_type_)); \
                _mm_store_si128((__m128i*)dptr, xmm3); \
                dptr += (16 / sizeof(_type_)); \
            } \
        } \
        /* Use a single 128-bit SSE register. */ \
        count = len >> (5 - shifts); \
        len -= count << (5 - shifts); \
        while (count--) \
        { \
            __m128i xmm0, xmm1; \
            xmm0 = LOAD_SI128(sptr1); \
            sptr1 += (16 / sizeof(_type_)); \
            xmm1 = LOAD_SI128(sptr2); \
            sptr2 += (16 / sizeof(_type_)); \
            xmm0 = _op_(xmm0, xmm1); \
            _mm_store_si128((__m128i*)dptr, xmm0); \
            dptr += (16 / sizeof(_type_)); \
        } \
        /* Finish off the remainder. */ \
        while (len--) \
        { \
            _slowWay_; \
        } \
        return PRIMITIVES_SUCCESS; \
    }
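
/* Illustrative use (a sketch; the names below are hypothetical): the
 * two-source form suits primitives such as a saturating 16-bit add:
 *
 *     SSE3_SSD_ROUTINE(sse3_add_16s, INT16, generic_add_16s,
 *                      _mm_adds_epi16,
 *                      generic_add_16s(sptr1++, sptr2++, dptr++, 1))
 *
 * Note that _slowWay_ must be an expression yielding a pstatus_t in this
 * template: the alignment loop above checks its status, unlike in the SCD
 * templates, where _slowWay_ is used as a plain statement.
 */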

#endif /* FREERDP_LIB_PRIM_TEMPLATES_H */