Coverage Report

Created: 2024-09-08 06:37

/src/c-blosc2/blosc/bitshuffle-sse2.c
Line
Count
Source (jump to first uncovered line)
1
/*********************************************************************
2
  Blosc - Blocked Shuffling and Compression Library
3
4
  Copyright (c) 2021  Blosc Development Team <blosc@blosc.org>
5
  https://blosc.org
6
  License: BSD 3-Clause (see LICENSE.txt)
7
8
  See LICENSE.txt for details about copyright and rights to use.
9
**********************************************************************/
10
11
/*********************************************************************
12
  Bitshuffle - Filter for improving compression of typed binary data.
13
14
  Author: Kiyoshi Masui <kiyo@physics.ubc.ca>
15
  Website: https://github.com/kiyo-masui/bitshuffle
16
17
  Note: Adapted for c-blosc by Francesc Alted.
18
19
  See LICENSES/BITSHUFFLE.txt file for details about copyright and
20
  rights to use.
21
**********************************************************************/
22
23
24
#include "bitshuffle-sse2.h"
25
#include "bitshuffle-generic.h"
26
#include <stdlib.h>
27
28
/* Make sure SSE2 is available for the compilation target and compiler. */
29
#if defined(__SSE2__)
30
31
#include <emmintrin.h>
32
33
/* The next is useful for debugging purposes */
34
#if 0
35
#include <stdio.h>
36
#include <string.h>
37
38
39
static void printxmm(__m128i xmm0)
40
{
41
  uint8_t buf[32];
42
43
  ((__m128i *)buf)[0] = xmm0;
44
  printf("%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x,%x\n",
45
          buf[0], buf[1], buf[2], buf[3],
46
          buf[4], buf[5], buf[6], buf[7],
47
          buf[8], buf[9], buf[10], buf[11],
48
          buf[12], buf[13], buf[14], buf[15]);
49
}
50
#endif
51
52
53
/* ---- Worker code that requires SSE2. Intel Pentium 4 (2000) and later. ---- */
54
55
56
/* Transpose bytes within elements for 16 bit elements. */
57
0
int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {
58
59
0
  size_t ii;
60
0
  const char *in_b = (const char*) in;
61
0
  char *out_b = (char*) out;
62
0
  __m128i a0, b0, a1, b1;
63
64
0
  for (ii=0; ii + 15 < size; ii += 16) {
65
0
    a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
66
0
    b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);
67
68
0
    a1 = _mm_unpacklo_epi8(a0, b0);
69
0
    b1 = _mm_unpackhi_epi8(a0, b0);
70
71
0
    a0 = _mm_unpacklo_epi8(a1, b1);
72
0
    b0 = _mm_unpackhi_epi8(a1, b1);
73
74
0
    a1 = _mm_unpacklo_epi8(a0, b0);
75
0
    b1 = _mm_unpackhi_epi8(a0, b0);
76
77
0
    a0 = _mm_unpacklo_epi8(a1, b1);
78
0
    b0 = _mm_unpackhi_epi8(a1, b1);
79
80
0
    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
81
0
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
82
0
  }
83
0
  return bshuf_trans_byte_elem_remainder(in, out, size, 2,
84
0
                                         size - size % 16);
85
0
}
86
87
88
/* Transpose bytes within elements for 32 bit elements. */
89
0
int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {
90
91
0
  size_t ii;
92
0
  const char *in_b;
93
0
  char *out_b;
94
0
  in_b = (const char*) in;
95
0
  out_b = (char*) out;
96
0
  __m128i a0, b0, c0, d0, a1, b1, c1, d1;
97
98
0
  for (ii=0; ii + 15 < size; ii += 16) {
99
0
    a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
100
0
    b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
101
0
    c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
102
0
    d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);
103
104
0
    a1 = _mm_unpacklo_epi8(a0, b0);
105
0
    b1 = _mm_unpackhi_epi8(a0, b0);
106
0
    c1 = _mm_unpacklo_epi8(c0, d0);
107
0
    d1 = _mm_unpackhi_epi8(c0, d0);
108
109
0
    a0 = _mm_unpacklo_epi8(a1, b1);
110
0
    b0 = _mm_unpackhi_epi8(a1, b1);
111
0
    c0 = _mm_unpacklo_epi8(c1, d1);
112
0
    d0 = _mm_unpackhi_epi8(c1, d1);
113
114
0
    a1 = _mm_unpacklo_epi8(a0, b0);
115
0
    b1 = _mm_unpackhi_epi8(a0, b0);
116
0
    c1 = _mm_unpacklo_epi8(c0, d0);
117
0
    d1 = _mm_unpackhi_epi8(c0, d0);
118
119
0
    a0 = _mm_unpacklo_epi64(a1, c1);
120
0
    b0 = _mm_unpackhi_epi64(a1, c1);
121
0
    c0 = _mm_unpacklo_epi64(b1, d1);
122
0
    d0 = _mm_unpackhi_epi64(b1, d1);
123
124
0
    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
125
0
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
126
0
    _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
127
0
    _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
128
0
  }
129
0
  return bshuf_trans_byte_elem_remainder(in, out, size, 4,
130
0
                                         size - size % 16);
131
0
}
132
133
134
/* Transpose bytes within elements for 64 bit elements. */
135
0
int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {
136
137
0
  size_t ii;
138
0
  const char* in_b = (const char*) in;
139
0
  char* out_b = (char*) out;
140
0
  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
141
0
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;
142
143
0
  for (ii=0; ii + 15 < size; ii += 16) {
144
0
    a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
145
0
    b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
146
0
    c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
147
0
    d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
148
0
    e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
149
0
    f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
150
0
    g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
151
0
    h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);
152
153
0
    a1 = _mm_unpacklo_epi8(a0, b0);
154
0
    b1 = _mm_unpackhi_epi8(a0, b0);
155
0
    c1 = _mm_unpacklo_epi8(c0, d0);
156
0
    d1 = _mm_unpackhi_epi8(c0, d0);
157
0
    e1 = _mm_unpacklo_epi8(e0, f0);
158
0
    f1 = _mm_unpackhi_epi8(e0, f0);
159
0
    g1 = _mm_unpacklo_epi8(g0, h0);
160
0
    h1 = _mm_unpackhi_epi8(g0, h0);
161
162
0
    a0 = _mm_unpacklo_epi8(a1, b1);
163
0
    b0 = _mm_unpackhi_epi8(a1, b1);
164
0
    c0 = _mm_unpacklo_epi8(c1, d1);
165
0
    d0 = _mm_unpackhi_epi8(c1, d1);
166
0
    e0 = _mm_unpacklo_epi8(e1, f1);
167
0
    f0 = _mm_unpackhi_epi8(e1, f1);
168
0
    g0 = _mm_unpacklo_epi8(g1, h1);
169
0
    h0 = _mm_unpackhi_epi8(g1, h1);
170
171
0
    a1 = _mm_unpacklo_epi32(a0, c0);
172
0
    b1 = _mm_unpackhi_epi32(a0, c0);
173
0
    c1 = _mm_unpacklo_epi32(b0, d0);
174
0
    d1 = _mm_unpackhi_epi32(b0, d0);
175
0
    e1 = _mm_unpacklo_epi32(e0, g0);
176
0
    f1 = _mm_unpackhi_epi32(e0, g0);
177
0
    g1 = _mm_unpacklo_epi32(f0, h0);
178
0
    h1 = _mm_unpackhi_epi32(f0, h0);
179
180
0
    a0 = _mm_unpacklo_epi64(a1, e1);
181
0
    b0 = _mm_unpackhi_epi64(a1, e1);
182
0
    c0 = _mm_unpacklo_epi64(b1, f1);
183
0
    d0 = _mm_unpackhi_epi64(b1, f1);
184
0
    e0 = _mm_unpacklo_epi64(c1, g1);
185
0
    f0 = _mm_unpackhi_epi64(c1, g1);
186
0
    g0 = _mm_unpacklo_epi64(d1, h1);
187
0
    h0 = _mm_unpackhi_epi64(d1, h1);
188
189
0
    _mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
190
0
    _mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
191
0
    _mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
192
0
    _mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
193
0
    _mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
194
0
    _mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
195
0
    _mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
196
0
    _mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
197
0
  }
198
0
  return bshuf_trans_byte_elem_remainder(in, out, size, 8,
199
0
                                         size - size % 16);
200
0
}
201
202
203
/* Transpose bytes within elements using the best SSE algorithm available. */
204
int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
205
107k
                                  const size_t elem_size) {
206
207
107k
  int64_t count;
208
209
  // Trivial cases: power of 2 bytes.
210
107k
  switch (elem_size) {
211
107k
    case 1:
212
107k
      count = bshuf_copy(in, out, size, elem_size);
213
107k
      return count;
214
0
    case 2:
215
0
      count = bshuf_trans_byte_elem_SSE_16(in, out, size);
216
0
      return count;
217
0
    case 4:
218
0
      count = bshuf_trans_byte_elem_SSE_32(in, out, size);
219
0
      return count;
220
0
    case 8:
221
0
      count = bshuf_trans_byte_elem_SSE_64(in, out, size);
222
0
      return count;
223
107k
  }
224
225
  // Worst case: odd number of bytes. Turns out that this is faster for
226
  // (odd * 2) byte elements as well (hence % 4).
227
0
  if (elem_size % 4) {
228
0
    count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
229
0
    return count;
230
0
  }
231
232
  // Multiple of power of 2: transpose hierarchically.
233
0
  {
234
0
    size_t nchunk_elem;
235
0
    void* tmp_buf = malloc(size * elem_size);
236
0
    if (tmp_buf == NULL) return -1;
237
238
0
    if ((elem_size % 8) == 0) {
239
0
      nchunk_elem = elem_size / 8;
240
0
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
241
0
      count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
242
0
                                           size * nchunk_elem);
243
0
      bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
244
0
    } else if ((elem_size % 4) == 0) {
245
0
      nchunk_elem = elem_size / 4;
246
0
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
247
0
      count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
248
0
                                           size * nchunk_elem);
249
0
      bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
250
0
    } else {
251
      // Not used since scalar algorithm is faster.
252
0
      nchunk_elem = elem_size / 2;
253
0
      TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
254
0
      count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
255
0
                                           size * nchunk_elem);
256
0
      bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
257
0
    }
258
259
0
    free(tmp_buf);
260
0
    return count;
261
0
  }
262
0
}
263
264
265
/* Transpose bits within bytes. */
266
int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
267
0
                                 const size_t elem_size) {
268
269
0
  size_t ii, kk;
270
0
  const char* in_b = (const char*) in;
271
0
  char* out_b = (char*) out;
272
0
  uint16_t* out_ui16;
273
274
0
  int64_t count;
275
276
0
  size_t nbyte = elem_size * size;
277
278
0
  CHECK_MULT_EIGHT(nbyte);
279
280
0
  __m128i xmm;
281
0
  int32_t bt;
282
283
0
  for (ii = 0; ii + 15 < nbyte; ii += 16) {
284
0
    xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
285
0
    for (kk = 0; kk < 8; kk++) {
286
0
      bt = _mm_movemask_epi8(xmm);
287
0
      xmm = _mm_slli_epi16(xmm, 1);
288
0
      out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
289
0
      *out_ui16 = bt;
290
0
    }
291
0
  }
292
0
  count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
293
0
                                         nbyte - nbyte % 16);
294
0
  return count;
295
0
}
296
297
298
/* Transpose bits within elements. */
299
int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
300
0
                                 const size_t elem_size) {
301
302
0
  int64_t count;
303
304
0
  CHECK_MULT_EIGHT(size);
305
306
0
  void* tmp_buf = malloc(size * elem_size);
307
0
  if (tmp_buf == NULL) return -1;
308
309
0
  count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
310
0
  CHECK_ERR_FREE(count, tmp_buf);
311
0
  count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size);
312
0
  CHECK_ERR_FREE(count, tmp_buf);
313
0
  count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
314
315
0
  free(tmp_buf);
316
317
0
  return count;
318
0
}
319
320
321
/* For data organized into a row for each bit (8 * elem_size rows), transpose
322
 * the bytes. */
323
int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
324
21.8k
                                    const size_t elem_size) {
325
326
21.8k
  size_t ii, jj;
327
21.8k
  const char* in_b = (const char*) in;
328
21.8k
  char* out_b = (char*) out;
329
330
21.8k
  CHECK_MULT_EIGHT(size);
331
332
21.8k
  size_t nrows = 8 * elem_size;
333
21.8k
  size_t nbyte_row = size / 8;
334
335
21.8k
  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
336
21.8k
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;
337
21.8k
  __m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
338
339
43.7k
  for (ii = 0; ii + 7 < nrows; ii += 8) {
340
701k
    for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
341
679k
      a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
342
679k
      b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
343
679k
      c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
344
679k
      d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
345
679k
      e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
346
679k
      f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
347
679k
      g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
348
679k
      h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);
349
350
351
679k
      a1 = _mm_unpacklo_epi8(a0, b0);
352
679k
      b1 = _mm_unpacklo_epi8(c0, d0);
353
679k
      c1 = _mm_unpacklo_epi8(e0, f0);
354
679k
      d1 = _mm_unpacklo_epi8(g0, h0);
355
679k
      e1 = _mm_unpackhi_epi8(a0, b0);
356
679k
      f1 = _mm_unpackhi_epi8(c0, d0);
357
679k
      g1 = _mm_unpackhi_epi8(e0, f0);
358
679k
      h1 = _mm_unpackhi_epi8(g0, h0);
359
360
361
679k
      a0 = _mm_unpacklo_epi16(a1, b1);
362
679k
      b0 = _mm_unpacklo_epi16(c1, d1);
363
679k
      c0 = _mm_unpackhi_epi16(a1, b1);
364
679k
      d0 = _mm_unpackhi_epi16(c1, d1);
365
366
679k
      e0 = _mm_unpacklo_epi16(e1, f1);
367
679k
      f0 = _mm_unpacklo_epi16(g1, h1);
368
679k
      g0 = _mm_unpackhi_epi16(e1, f1);
369
679k
      h0 = _mm_unpackhi_epi16(g1, h1);
370
371
372
679k
      a1 = _mm_unpacklo_epi32(a0, b0);
373
679k
      b1 = _mm_unpackhi_epi32(a0, b0);
374
375
679k
      c1 = _mm_unpacklo_epi32(c0, d0);
376
679k
      d1 = _mm_unpackhi_epi32(c0, d0);
377
378
679k
      e1 = _mm_unpacklo_epi32(e0, f0);
379
679k
      f1 = _mm_unpackhi_epi32(e0, f0);
380
381
679k
      g1 = _mm_unpacklo_epi32(g0, h0);
382
679k
      h1 = _mm_unpackhi_epi32(g0, h0);
383
384
      // We don't have a storeh instruction for integers, so interpret
385
      // as a float. Have a storel (_mm_storel_epi64).
386
679k
      as = (__m128 *) &a1;
387
679k
      bs = (__m128 *) &b1;
388
679k
      cs = (__m128 *) &c1;
389
679k
      ds = (__m128 *) &d1;
390
679k
      es = (__m128 *) &e1;
391
679k
      fs = (__m128 *) &f1;
392
679k
      gs = (__m128 *) &g1;
393
679k
      hs = (__m128 *) &h1;
394
395
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
396
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
397
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
398
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
399
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
400
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
401
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
402
679k
      _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);
403
404
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
405
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
406
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
407
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
408
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
409
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
410
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
411
679k
      _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
412
679k
    }
413
26.0k
    for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
414
4.22k
      out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
415
4.22k
      out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
416
4.22k
      out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
417
4.22k
      out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
418
4.22k
      out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
419
4.22k
      out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
420
4.22k
      out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
421
4.22k
      out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
422
4.22k
    }
423
21.8k
  }
424
21.8k
  return size * elem_size;
425
21.8k
}
426
427
428
/* Shuffle bits within the bytes of eight element blocks. */
429
int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
430
21.8k
                                        const size_t elem_size) {
431
432
21.8k
  CHECK_MULT_EIGHT(size);
433
434
  // With a bit of care, this could be written such that it is
435
  // in_buf = out_buf safe.
436
21.8k
  const char* in_b = (const char*) in;
437
21.8k
  uint16_t* out_ui16 = (uint16_t*) out;
438
439
21.8k
  size_t ii, jj, kk;
440
21.8k
  size_t nbyte = elem_size * size;
441
442
21.8k
  __m128i xmm;
443
21.8k
  int32_t bt;
444
445
21.8k
  if (elem_size % 2) {
446
21.8k
    bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
447
21.8k
  } else {
448
0
    for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
449
0
         ii += 8 * elem_size) {
450
0
      for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
451
0
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
452
0
        for (kk = 0; kk < 8; kk++) {
453
0
          bt = _mm_movemask_epi8(xmm);
454
0
          xmm = _mm_slli_epi16(xmm, 1);
455
0
          size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
456
0
          out_ui16[ind / 2] = bt;
457
0
        }
458
0
      }
459
0
    }
460
0
  }
461
21.8k
  return size * elem_size;
462
21.8k
}
463
464
465
/* Untranspose bits within elements. */
466
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
467
0
                                   const size_t elem_size) {
468
469
0
  int64_t count;
470
471
0
  CHECK_MULT_EIGHT(size);
472
473
0
  void* tmp_buf = malloc(size * elem_size);
474
0
  if (tmp_buf == NULL) return -1;
475
476
0
  count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size);
477
0
  CHECK_ERR_FREE(count, tmp_buf);
478
0
  count =  bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size);
479
480
0
  free(tmp_buf);
481
482
0
  return count;
483
0
}
484
485
const bool is_bshuf_SSE = true;
486
487
#else /* defined(__SSE2__) */
488
489
const bool is_bshuf_SSE = false;
490
491
int64_t
492
bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
493
                         const size_t elem_size) {
494
  abort();
495
}
496
497
int64_t
498
bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
499
                           const size_t elem_size) {
500
  abort();
501
}
502
503
#endif /* defined(__SSE2__) */