/src/FreeRDP/libfreerdp/primitives/sse/prim_templates.h

Source (jump to first uncovered line)
/* prim_templates.h
 * vi:ts=4 sw=4
 *
 * (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License. You may obtain
 * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing
 * permissions and limitations under the License.  Algorithms used by
 * this code may be covered by patents by HP, Microsoft, or other parties.
 */

#pragma once

#include "prim_avxsse.h"

/* These are templates for SSE (potentially NEON) routines that apply a
 * simple SSE operation over an array of data.  Since so much of this
 * code is shared except for the operation itself, these templates are
 * used rather than duplicating code.  The naming convention depends on
 * the parameters:  S=Source param; C=Constant; D=Destination.
 * All the macros take a parameter for a fallback procedure, meant for data
 * that is too small (the macro bodies below do not currently reference it),
 * and an operation "the slow way" for use at 16-byte edges.
 */

/* SSE3 note:  If someone needs to support an SSE2 version of these without
 * SSE3 support, an alternative version could be added that merely checks
 * that 16-byte alignment on both destination and source(s) can be
 * achieved, rather than use LDDQU for unaligned reads.
 */

/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
 * It can't easily do that if the value is stored in a variable.
 * So don't save it as an intermediate value.
 */

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 *
 * Expands to
 *   static pstatus_t _name_(const _type_* pSrc, UINT32 val, _type_* pDst, UINT32 ulen)
 * which applies _op_(vector, (_op_type_)val) to ulen elements read from pSrc
 * and stores the results to pDst.
 *
 *   _name_      name of the generated function
 *   _type_      element type of the source and destination arrays
 *   _fallback_  accepted, but not referenced by this template
 *   _op_        operation taking a __m128i and the cast constant
 *   _op_type_   type that val is cast to before it is handed to _op_
 *   _slowWay_   statement executed once per leftover element; it is pasted
 *               into the function body, so it may use sptr, dptr and val,
 *               and it has to advance sptr/dptr itself
 *
 * Returns PRIMITIVES_SUCCESS without touching pDst when val is 0, -1 when
 * val is 16 or more, and PRIMITIVES_SUCCESS after processing otherwise.
 */
#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
  static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
                          _type_* WINPR_RESTRICT pDst, UINT32 ulen) \
  { \
    /* 1 + log2(sizeof(_type_)); stays 0 for any other element size. */ \
    const INT32 shifts = (sizeof(_type_) == 1)   ? 1 \
                         : (sizeof(_type_) == 2) ? 2 \
                         : (sizeof(_type_) == 4) ? 3 \
                         : (sizeof(_type_) == 8) ? 4 \
                                                 : 0; \
    const _type_* sptr = pSrc; \
    _type_* dptr = pDst; \
    size_t len = ulen; \
    if (val == 0) \
      return PRIMITIVES_SUCCESS; \
    if (val >= 16) \
      return -1; \
    /* Runs of eight vectors: load all eight, apply _op_, store all eight. */ \
    size_t runs = len >> (8 - shifts); \
    len -= runs << (8 - shifts); \
    for (; runs > 0; runs--) \
    { \
      __m128i v0 = LOAD_SI128(&sptr[0 * (16 / sizeof(_type_))]); \
      __m128i v1 = LOAD_SI128(&sptr[1 * (16 / sizeof(_type_))]); \
      __m128i v2 = LOAD_SI128(&sptr[2 * (16 / sizeof(_type_))]); \
      __m128i v3 = LOAD_SI128(&sptr[3 * (16 / sizeof(_type_))]); \
      __m128i v4 = LOAD_SI128(&sptr[4 * (16 / sizeof(_type_))]); \
      __m128i v5 = LOAD_SI128(&sptr[5 * (16 / sizeof(_type_))]); \
      __m128i v6 = LOAD_SI128(&sptr[6 * (16 / sizeof(_type_))]); \
      __m128i v7 = LOAD_SI128(&sptr[7 * (16 / sizeof(_type_))]); \
      sptr += 8 * (16 / sizeof(_type_)); \
      v0 = _op_(v0, (_op_type_)val); \
      v1 = _op_(v1, (_op_type_)val); \
      v2 = _op_(v2, (_op_type_)val); \
      v3 = _op_(v3, (_op_type_)val); \
      v4 = _op_(v4, (_op_type_)val); \
      v5 = _op_(v5, (_op_type_)val); \
      v6 = _op_(v6, (_op_type_)val); \
      v7 = _op_(v7, (_op_type_)val); \
      STORE_SI128(&dptr[0 * (16 / sizeof(_type_))], v0); \
      STORE_SI128(&dptr[1 * (16 / sizeof(_type_))], v1); \
      STORE_SI128(&dptr[2 * (16 / sizeof(_type_))], v2); \
      STORE_SI128(&dptr[3 * (16 / sizeof(_type_))], v3); \
      STORE_SI128(&dptr[4 * (16 / sizeof(_type_))], v4); \
      STORE_SI128(&dptr[5 * (16 / sizeof(_type_))], v5); \
      STORE_SI128(&dptr[6 * (16 / sizeof(_type_))], v6); \
      STORE_SI128(&dptr[7 * (16 / sizeof(_type_))], v7); \
      dptr += 8 * (16 / sizeof(_type_)); \
    } \
    /* Whatever still fills a whole vector is handled one vector at a time. */ \
    runs = len >> (5 - shifts); \
    len -= runs << (5 - shifts); \
    for (; runs > 0; runs--) \
    { \
      __m128i v0 = LOAD_SI128(sptr); \
      sptr += (16 / sizeof(_type_)); \
      v0 = _op_(v0, (_op_type_)val); \
      STORE_SI128(dptr, v0); \
      dptr += (16 / sizeof(_type_)); \
    } \
    /* Leftover elements: one _slowWay_ execution each. */ \
    while (len--) \
    { \
      _slowWay_; \
    } \
    return PRIMITIVES_SUCCESS; \
  }

/* ----------------------------------------------------------------------------
 * SCD = Source, Constant, Destination
 * PRE = preload a vector register with the constant.
 *
 * Expands to
 *   static pstatus_t _name_(const _type_* pSrc, _type_ val, _type_* pDst, INT32 ilen)
 * which applies _op_(vector, broadcast-of-val) to ilen elements read from
 * pSrc and stores the results to pDst.  Always returns PRIMITIVES_SUCCESS.
 *
 *   _name_      name of the generated function
 *   _type_      element type of the arrays and of the constant
 *   _fallback_  accepted, but not referenced by this template
 *   _op_        operation taking two __m128i operands
 *   _slowWay_   statement executed once per leftover element; it is pasted
 *               into the function body, so it may use sptr, dptr and val,
 *               and it has to advance sptr/dptr itself
 *
 * ilen is converted to size_t through WINPR_ASSERTING_INT_CAST.
 * NOTE(review): the constant is broadcast with mm_set1_epu32(), which by its
 * name targets 32-bit lanes; presumably only 32-bit element types are meant
 * to be used here - confirm before instantiating with another width.
 */
#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
  static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
                          _type_* WINPR_RESTRICT pDst, INT32 ilen) \
  { \
    size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
    /* 1 + log2(sizeof(_type_)); stays 0 for any other element size. */ \
    const int shifts = (sizeof(_type_) == 1)   ? 1 \
                       : (sizeof(_type_) == 2) ? 2 \
                       : (sizeof(_type_) == 4) ? 3 \
                       : (sizeof(_type_) == 8) ? 4 \
                                               : 0; \
    const _type_* sptr = pSrc; \
    _type_* dptr = pDst; \
    /* Runs of four vectors: load all four, apply _op_, store all four. */ \
    size_t runs = len >> (7 - shifts); \
    len -= runs << (7 - shifts); \
    /* The constant is expanded into a vector once and reused throughout. */ \
    const __m128i vval = mm_set1_epu32(val); \
    for (; runs > 0; runs--) \
    { \
      __m128i v0 = LOAD_SI128(&sptr[0 * (16 / sizeof(_type_))]); \
      __m128i v1 = LOAD_SI128(&sptr[1 * (16 / sizeof(_type_))]); \
      __m128i v2 = LOAD_SI128(&sptr[2 * (16 / sizeof(_type_))]); \
      __m128i v3 = LOAD_SI128(&sptr[3 * (16 / sizeof(_type_))]); \
      sptr += 4 * (16 / sizeof(_type_)); \
      v0 = _op_(v0, vval); \
      v1 = _op_(v1, vval); \
      v2 = _op_(v2, vval); \
      v3 = _op_(v3, vval); \
      STORE_SI128(&dptr[0 * (16 / sizeof(_type_))], v0); \
      STORE_SI128(&dptr[1 * (16 / sizeof(_type_))], v1); \
      STORE_SI128(&dptr[2 * (16 / sizeof(_type_))], v2); \
      STORE_SI128(&dptr[3 * (16 / sizeof(_type_))], v3); \
      dptr += 4 * (16 / sizeof(_type_)); \
    } \
    /* Whatever still fills a whole vector is handled one vector at a time. */ \
    runs = len >> (5 - shifts); \
    len -= runs << (5 - shifts); \
    for (; runs > 0; runs--) \
    { \
      __m128i v0 = LOAD_SI128(sptr); \
      sptr += (16 / sizeof(_type_)); \
      v0 = _op_(v0, vval); \
      STORE_SI128(dptr, v0); \
      dptr += (16 / sizeof(_type_)); \
    } \
    /* Leftover elements: one _slowWay_ execution each. */ \
    for (size_t x = 0; x < len; x++) \
    { \
      _slowWay_; \
    } \
    return PRIMITIVES_SUCCESS; \
  }

/* ----------------------------------------------------------------------------
 * SSD = Source1, Source2, Destination
 *
 * Expands to
 *   static pstatus_t _name_(const _type_* pSrc1, const _type_* pSrc2,
 *                           _type_* pDst, UINT32 ulen)
 * which applies _op_(vector-from-pSrc1, vector-from-pSrc2) to ulen elements
 * and stores the results to pDst.  Always returns PRIMITIVES_SUCCESS.
 *
 *   _name_      name of the generated function
 *   _type_      element type of both sources and the destination
 *   _fallback_  accepted, but not referenced by this template
 *   _op_        operation taking two __m128i operands
 *   _slowWay_   statement executed once per leftover element; it is pasted
 *               into the function body, so it may use sptr1, sptr2 and dptr,
 *               and it has to advance those pointers itself
 */
#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
  static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
                          const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
                          UINT32 ulen) \
  { \
    /* 1 + log2(sizeof(_type_)); stays 0 for any other element size. */ \
    const int shifts = (sizeof(_type_) == 1)   ? 1 \
                       : (sizeof(_type_) == 2) ? 2 \
                       : (sizeof(_type_) == 4) ? 3 \
                       : (sizeof(_type_) == 8) ? 4 \
                                               : 0; \
    const _type_* sptr1 = pSrc1; \
    const _type_* sptr2 = pSrc2; \
    _type_* dptr = pDst; \
    size_t len = ulen; \
    /* Runs of four vectors per source: load all, apply _op_, store all. */ \
    size_t runs = len >> (7 - shifts); \
    len -= runs << (7 - shifts); \
    for (; runs > 0; runs--) \
    { \
      __m128i a0 = LOAD_SI128(&sptr1[0 * (16 / sizeof(_type_))]); \
      __m128i a1 = LOAD_SI128(&sptr1[1 * (16 / sizeof(_type_))]); \
      __m128i a2 = LOAD_SI128(&sptr1[2 * (16 / sizeof(_type_))]); \
      __m128i a3 = LOAD_SI128(&sptr1[3 * (16 / sizeof(_type_))]); \
      sptr1 += 4 * (16 / sizeof(_type_)); \
      const __m128i b0 = LOAD_SI128(&sptr2[0 * (16 / sizeof(_type_))]); \
      const __m128i b1 = LOAD_SI128(&sptr2[1 * (16 / sizeof(_type_))]); \
      const __m128i b2 = LOAD_SI128(&sptr2[2 * (16 / sizeof(_type_))]); \
      const __m128i b3 = LOAD_SI128(&sptr2[3 * (16 / sizeof(_type_))]); \
      sptr2 += 4 * (16 / sizeof(_type_)); \
      a0 = _op_(a0, b0); \
      a1 = _op_(a1, b1); \
      a2 = _op_(a2, b2); \
      a3 = _op_(a3, b3); \
      STORE_SI128(&dptr[0 * (16 / sizeof(_type_))], a0); \
      STORE_SI128(&dptr[1 * (16 / sizeof(_type_))], a1); \
      STORE_SI128(&dptr[2 * (16 / sizeof(_type_))], a2); \
      STORE_SI128(&dptr[3 * (16 / sizeof(_type_))], a3); \
      dptr += 4 * (16 / sizeof(_type_)); \
    } \
    /* Whatever still fills a whole vector is handled one vector at a time. */ \
    runs = len >> (5 - shifts); \
    len -= runs << (5 - shifts); \
    for (; runs > 0; runs--) \
    { \
      __m128i a0 = LOAD_SI128(sptr1); \
      sptr1 += (16 / sizeof(_type_)); \
      const __m128i b0 = LOAD_SI128(sptr2); \
      sptr2 += (16 / sizeof(_type_)); \
      a0 = _op_(a0, b0); \
      STORE_SI128(dptr, a0); \
      dptr += (16 / sizeof(_type_)); \
    } \
    /* Leftover elements: one _slowWay_ execution each. */ \
    while (len--) \
    { \
      _slowWay_; \
    } \
    return PRIMITIVES_SUCCESS; \
  }

Coverage Report

Created: 2025-07-01 06:46

Line	Count	Source (jump to first uncovered line)
1		/* prim_templates.h
2		* vi:ts=4 sw=4
3		*
4		* (c) Copyright 2012 Hewlett-Packard Development Company, L.P.
5		* Licensed under the Apache License, Version 2.0 (the "License"); you may
6		* not use this file except in compliance with the License. You may obtain
7		* a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
8		* Unless required by applicable law or agreed to in writing, software
9		* distributed under the License is distributed on an "AS IS" BASIS,
10		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11		* or implied. See the License for the specific language governing
12		* permissions and limitations under the License. Algorithms used by
13		* this code may be covered by patents by HP, Microsoft, or other parties.
14		*/
15
16		#pragma once
17
18		#include "prim_avxsse.h"
19
20		/* These are prototypes for SSE (potentially NEON) routines that do a
21		* simple SSE operation over an array of data. Since so much of this
22		* code is shared except for the operation itself, these prototypes are
23		* used rather than duplicating code. The naming convention depends on
24		* the parameters: S=Source param; C=Constant; D=Destination.
25		* All the macros have parameters for a fallback procedure if the data
26		* is too small and an operation "the slow way" for use at 16-byte edges.
27		*/
28
29		/* SSE3 note: If someone needs to support an SSE2 version of these without
30		* SSE3 support, an alternative version could be added that merely checks
31		* that 16-byte alignment on both destination and source(s) can be
32		* achieved, rather than use LDDQU for unaligned reads.
33		*/
34
35		/* Note: the compiler is good at turning (16/sizeof(_type_)) into a constant.
36		* It easily can't do that if the value is stored in a variable.
37		* So don't save it as an intermediate value.
38		*/
39
40		/* ----------------------------------------------------------------------------
41		* SCD = Source, Constant, Destination
42		*/
43		#define SSE3_SCD_ROUTINE(_name_, _type_, _fallback_, _op_, _op_type_, _slowWay_) \
44		static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, UINT32 val, \
45		_type_* WINPR_RESTRICT pDst, UINT32 ulen) \
46	0	{ \
47	0	size_t len = ulen; \
48	0	INT32 shifts = 0; \
49	0	const _type_* sptr = pSrc; \
50	0	_type_* dptr = pDst; \
51	0	if (val == 0) \
52	0	return PRIMITIVES_SUCCESS; \
53	0	if (val >= 16) \
54	0	return -1; \
55	0	if (sizeof(_type_) == 1) \
56	0	shifts = 1; \
57	0	else if (sizeof(_type_) == 2) \
58	0	shifts = 2; \
59	0	else if (sizeof(_type_) == 4) \
60	0	shifts = 3; \
61	0	else if (sizeof(_type_) == 8) \
62	0	shifts = 4; \
63	0	/* Use 8 128-bit SSE registers. */ \
64	0	size_t count = len >> (8 - shifts); \
65	0	len -= count << (8 - shifts); \
66	0	\
67	0	while (count--) \
68	0	{ \
69	0	__m128i xmm0 = LOAD_SI128(sptr); \
70	0	sptr += (16 / sizeof(_type_)); \
71	0	__m128i xmm1 = LOAD_SI128(sptr); \
72	0	sptr += (16 / sizeof(_type_)); \
73	0	__m128i xmm2 = LOAD_SI128(sptr); \
74	0	sptr += (16 / sizeof(_type_)); \
75	0	__m128i xmm3 = LOAD_SI128(sptr); \
76	0	sptr += (16 / sizeof(_type_)); \
77	0	__m128i xmm4 = LOAD_SI128(sptr); \
78	0	sptr += (16 / sizeof(_type_)); \
79	0	__m128i xmm5 = LOAD_SI128(sptr); \
80	0	sptr += (16 / sizeof(_type_)); \
81	0	__m128i xmm6 = LOAD_SI128(sptr); \
82	0	sptr += (16 / sizeof(_type_)); \
83	0	__m128i xmm7 = LOAD_SI128(sptr); \
84	0	sptr += (16 / sizeof(_type_)); \
85	0	xmm0 = _op_(xmm0, (_op_type_)val); \
86	0	xmm1 = _op_(xmm1, (_op_type_)val); \
87	0	xmm2 = _op_(xmm2, (_op_type_)val); \
88	0	xmm3 = _op_(xmm3, (_op_type_)val); \
89	0	xmm4 = _op_(xmm4, (_op_type_)val); \
90	0	xmm5 = _op_(xmm5, (_op_type_)val); \
91	0	xmm6 = _op_(xmm6, (_op_type_)val); \
92	0	xmm7 = _op_(xmm7, (_op_type_)val); \
93	0	STORE_SI128(dptr, xmm0); \
94	0	dptr += (16 / sizeof(_type_)); \
95	0	STORE_SI128(dptr, xmm1); \
96	0	dptr += (16 / sizeof(_type_)); \
97	0	STORE_SI128(dptr, xmm2); \
98	0	dptr += (16 / sizeof(_type_)); \
99	0	STORE_SI128(dptr, xmm3); \
100	0	dptr += (16 / sizeof(_type_)); \
101	0	STORE_SI128(dptr, xmm4); \
102	0	dptr += (16 / sizeof(_type_)); \
103	0	STORE_SI128(dptr, xmm5); \
104	0	dptr += (16 / sizeof(_type_)); \
105	0	STORE_SI128(dptr, xmm6); \
106	0	dptr += (16 / sizeof(_type_)); \
107	0	STORE_SI128(dptr, xmm7); \
108	0	dptr += (16 / sizeof(_type_)); \
109	0	} \
110	0	\
111	0	/* Use a single 128-bit SSE register. */ \
112	0	count = len >> (5 - shifts); \
113	0	len -= count << (5 - shifts); \
114	0	while (count--) \
115	0	{ \
116	0	__m128i xmm0 = LOAD_SI128(sptr); \
117	0	sptr += (16 / sizeof(_type_)); \
118	0	xmm0 = _op_(xmm0, (_op_type_)val); \
119	0	STORE_SI128(dptr, xmm0); \
120	0	dptr += (16 / sizeof(_type_)); \
121	0	} \
122	0	/* Finish off the remainder. */ \
123	0	while (len--) \
124	0	{ \
125	0	_slowWay_; \
126	0	} \
127	0	return PRIMITIVES_SUCCESS; \
128	0	} Unexecuted instantiation: prim_shift_sse3.c:sse2_lShiftC_16s Unexecuted instantiation: prim_shift_sse3.c:sse2_rShiftC_16s Unexecuted instantiation: prim_shift_sse3.c:sse2_lShiftC_16u Unexecuted instantiation: prim_shift_sse3.c:sse2_rShiftC_16u
129
130		/* ----------------------------------------------------------------------------
131		* SCD = Source, Constant, Destination
132		* PRE = preload xmm0 with the constant.
133		*/
134		#define SSE3_SCD_PRE_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
135		static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc, _type_ val, \
136		_type_* WINPR_RESTRICT pDst, INT32 ilen) \
137	0	{ \
138	0	size_t len = WINPR_ASSERTING_INT_CAST(size_t, ilen); \
139	0	int shifts = 0; \
140	0	const _type_* sptr = pSrc; \
141	0	_type_* dptr = pDst; \
142	0	__m128i xmm0; \
143	0	if (sizeof(_type_) == 1) \
144	0	shifts = 1; \
145	0	else if (sizeof(_type_) == 2) \
146	0	shifts = 2; \
147	0	else if (sizeof(_type_) == 4) \
148	0	shifts = 3; \
149	0	else if (sizeof(_type_) == 8) \
150	0	shifts = 4; \
151	0	/* Use 4 128-bit SSE registers. */ \
152	0	size_t count = len >> (7 - shifts); \
153	0	len -= count << (7 - shifts); \
154	0	xmm0 = mm_set1_epu32(val); \
155	0	for (size_t x = 0; x < count; x++) \
156	0	{ \
157	0	__m128i xmm1 = LOAD_SI128(sptr); \
158	0	sptr += (16 / sizeof(_type_)); \
159	0	__m128i xmm2 = LOAD_SI128(sptr); \
160	0	sptr += (16 / sizeof(_type_)); \
161	0	__m128i xmm3 = LOAD_SI128(sptr); \
162	0	sptr += (16 / sizeof(_type_)); \
163	0	__m128i xmm4 = LOAD_SI128(sptr); \
164	0	sptr += (16 / sizeof(_type_)); \
165	0	xmm1 = _op_(xmm1, xmm0); \
166	0	xmm2 = _op_(xmm2, xmm0); \
167	0	xmm3 = _op_(xmm3, xmm0); \
168	0	xmm4 = _op_(xmm4, xmm0); \
169	0	STORE_SI128(dptr, xmm1); \
170	0	dptr += (16 / sizeof(_type_)); \
171	0	STORE_SI128(dptr, xmm2); \
172	0	dptr += (16 / sizeof(_type_)); \
173	0	STORE_SI128(dptr, xmm3); \
174	0	dptr += (16 / sizeof(_type_)); \
175	0	STORE_SI128(dptr, xmm4); \
176	0	dptr += (16 / sizeof(_type_)); \
177	0	} \
178	0	/* Use a single 128-bit SSE register. */ \
179	0	count = len >> (5 - shifts); \
180	0	len -= count << (5 - shifts); \
181	0	for (size_t x = 0; x < count; x++) \
182	0	{ \
183	0	__m128i xmm1 = LOAD_SI128(sptr); \
184	0	sptr += (16 / sizeof(_type_)); \
185	0	xmm1 = _op_(xmm1, xmm0); \
186	0	STORE_SI128(dptr, xmm1); \
187	0	dptr += (16 / sizeof(_type_)); \
188	0	} \
189	0	/* Finish off the remainder. */ \
190	0	for (size_t x = 0; x < len; x++) \
191	0	{ \
192	0	_slowWay_; \
193	0	} \
194	0	return PRIMITIVES_SUCCESS; \
195	0	} Unexecuted instantiation: prim_andor_sse3.c:sse3_andC_32u Unexecuted instantiation: prim_andor_sse3.c:sse3_orC_32u
196
197		/* ----------------------------------------------------------------------------
198		* SSD = Source1, Source2, Destination
199		*/
200		#define SSE3_SSD_ROUTINE(_name_, _type_, _fallback_, _op_, _slowWay_) \
201		static pstatus_t _name_(const _type_* WINPR_RESTRICT pSrc1, \
202		const _type_* WINPR_RESTRICT pSrc2, _type_* WINPR_RESTRICT pDst, \
203		UINT32 ulen) \
204	0	{ \
205	0	size_t len = ulen; \
206	0	int shifts = 0; \
207	0	const _type_* sptr1 = pSrc1; \
208	0	const _type_* sptr2 = pSrc2; \
209	0	_type_* dptr = pDst; \
210	0	size_t count; \
211	0	if (sizeof(_type_) == 1) \
212	0	shifts = 1; \
213	0	else if (sizeof(_type_) == 2) \
214	0	shifts = 2; \
215	0	else if (sizeof(_type_) == 4) \
216	0	shifts = 3; \
217	0	else if (sizeof(_type_) == 8) \
218	0	shifts = 4; \
219	0	/* Use 4 128-bit SSE registers. */ \
220	0	count = len >> (7 - shifts); \
221	0	len -= count << (7 - shifts); \
222	0	/* Aligned loads */ \
223	0	while (count--) \
224	0	{ \
225	0	__m128i xmm0 = LOAD_SI128(sptr1); \
226	0	sptr1 += (16 / sizeof(_type_)); \
227	0	__m128i xmm1 = LOAD_SI128(sptr1); \
228	0	sptr1 += (16 / sizeof(_type_)); \
229	0	__m128i xmm2 = LOAD_SI128(sptr1); \
230	0	sptr1 += (16 / sizeof(_type_)); \
231	0	__m128i xmm3 = LOAD_SI128(sptr1); \
232	0	sptr1 += (16 / sizeof(_type_)); \
233	0	__m128i xmm4 = LOAD_SI128(sptr2); \
234	0	sptr2 += (16 / sizeof(_type_)); \
235	0	__m128i xmm5 = LOAD_SI128(sptr2); \
236	0	sptr2 += (16 / sizeof(_type_)); \
237	0	__m128i xmm6 = LOAD_SI128(sptr2); \
238	0	sptr2 += (16 / sizeof(_type_)); \
239	0	__m128i xmm7 = LOAD_SI128(sptr2); \
240	0	sptr2 += (16 / sizeof(_type_)); \
241	0	xmm0 = _op_(xmm0, xmm4); \
242	0	xmm1 = _op_(xmm1, xmm5); \
243	0	xmm2 = _op_(xmm2, xmm6); \
244	0	xmm3 = _op_(xmm3, xmm7); \
245	0	STORE_SI128(dptr, xmm0); \
246	0	dptr += (16 / sizeof(_type_)); \
247	0	STORE_SI128(dptr, xmm1); \
248	0	dptr += (16 / sizeof(_type_)); \
249	0	STORE_SI128(dptr, xmm2); \
250	0	dptr += (16 / sizeof(_type_)); \
251	0	STORE_SI128(dptr, xmm3); \
252	0	dptr += (16 / sizeof(_type_)); \
253	0	} \
254	0	/* Use a single 128-bit SSE register. */ \
255	0	count = len >> (5 - shifts); \
256	0	len -= count << (5 - shifts); \
257	0	while (count--) \
258	0	{ \
259	0	__m128i xmm0 = LOAD_SI128(sptr1); \
260	0	sptr1 += (16 / sizeof(_type_)); \
261	0	__m128i xmm1 = LOAD_SI128(sptr2); \
262	0	sptr2 += (16 / sizeof(_type_)); \
263	0	xmm0 = _op_(xmm0, xmm1); \
264	0	STORE_SI128(dptr, xmm0); \
265	0	dptr += (16 / sizeof(_type_)); \
266	0	} \
267	0	/* Finish off the remainder. */ \
268	0	while (len--) \
269	0	{ \
270	0	_slowWay_; \
271	0	} \
272	0	return PRIMITIVES_SUCCESS; \
273	0	}