/src/Simd/src/Simd/SimdSet.h

Source
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2025 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __SimdSet_h__
#define __SimdSet_h__

#include "Simd/SimdDefs.h"
#include "Simd/SimdConst.h"

namespace Simd
{
    namespace Base
    {
        // Scalar fallback: writes zero to each of the first 'size' 16-bit elements of 'dst'.
        SIMD_INLINE void SetZero(uint16_t* dst, size_t size)
        {
            uint16_t* const end = dst + size;
            while (dst != end)
                *dst++ = 0;
        }
    }

#ifdef SIMD_SSE41_ENABLE
    namespace Sse41
    {
        // Returns a 128-bit vector whose bytes alternate a0, a1, a0, a1, ...
        // (a0 in even byte positions, a1 in odd ones).
        SIMD_INLINE __m128i SetInt8(char a0, char a1)
        {
            const __m128i even = _mm_set1_epi8(a0);
            const __m128i odd = _mm_set1_epi8(a1);
            return _mm_unpacklo_epi8(even, odd);
        }

        // Returns a 128-bit vector whose 16-bit lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m128i SetInt16(short a0, short a1)
        {
            const __m128i even = _mm_set1_epi16(a0);
            const __m128i odd = _mm_set1_epi16(a1);
            return _mm_unpacklo_epi16(even, odd);
        }

        // Returns a 128-bit vector whose 32-bit lanes are { a0, a1, a0, a1 }.
        SIMD_INLINE __m128i SetInt32(int a0, int a1)
        {
            const __m128i even = _mm_set1_epi32(a0);
            const __m128i odd = _mm_set1_epi32(a1);
            return _mm_unpacklo_epi32(even, odd);
        }

        // Returns a 128-bit float vector with lanes { a0, a1, a0, a1 }.
        SIMD_INLINE __m128 SetFloat(float a0, float a1)
        {
            const __m128 even = _mm_set_ps1(a0);
            const __m128 odd = _mm_set_ps1(a1);
            return _mm_unpacklo_ps(even, odd);
        }

        //-------------------------------------------------------------------------------------------------

        // Writes 128 zero bits (eight uint16_t values) to 'dst' with an unaligned store.
        SIMD_INLINE void SetZero(uint16_t* dst)
        {
            const __m128i zero = _mm_setzero_si128();
            __m128i* const out = (__m128i*)dst;
            _mm_storeu_si128(out, zero);
        }
    }
#endif

#ifdef SIMD_AVX2_ENABLE
    namespace Avx2
    {
        // Combines two 128-bit float vectors into one 256-bit vector:
        // a0 becomes the lower half, a1 the upper half.
        SIMD_INLINE __m256 Set(__m128 a0, __m128 a1)
        {
            const __m256 lower = _mm256_castps128_ps256(a0);
            return _mm256_insertf128_ps(lower, a1, 1);
        }

        // Duplicates a 128-bit float vector into both halves of a 256-bit vector.
        SIMD_INLINE __m256 Set(__m128 a)
        {
            const __m256 lower = _mm256_castps128_ps256(a);
            return _mm256_insertf128_ps(lower, a, 1);
        }

        // Returns a 256-bit vector whose bytes alternate a0, a1, a0, a1, ...
        // (a0 in even byte positions, a1 in odd ones).
        SIMD_INLINE __m256i SetInt8(char a0, char a1)
        {
            const __m256i even = _mm256_set1_epi8(a0);
            const __m256i odd = _mm256_set1_epi8(a1);
            return _mm256_unpacklo_epi8(even, odd);
        }

        // Returns a 256-bit vector whose 16-bit lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m256i SetInt16(short a0, short a1)
        {
            const __m256i even = _mm256_set1_epi16(a0);
            const __m256i odd = _mm256_set1_epi16(a1);
            return _mm256_unpacklo_epi16(even, odd);
        }

        // Returns a 256-bit vector whose 32-bit lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m256i SetInt32(int a0, int a1)
        {
            const __m256i even = _mm256_set1_epi32(a0);
            const __m256i odd = _mm256_set1_epi32(a1);
            return _mm256_unpacklo_epi32(even, odd);
        }

        // Returns a 256-bit float vector whose lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m256 SetFloat(float a0, float a1)
        {
            const __m256 even = _mm256_set1_ps(a0);
            const __m256 odd = _mm256_set1_ps(a1);
            return _mm256_unpacklo_ps(even, odd);
        }

        // Combines two 128-bit integer vectors into one 256-bit vector:
        // a0 becomes the lower half, a1 the upper half.
        SIMD_INLINE __m256i Set(__m128i a0, __m128i a1)
        {
            const __m256i lower = _mm256_castsi128_si256(a0);
            return _mm256_inserti128_si256(lower, a1, 1);
        }

        // Duplicates a 128-bit integer vector into both halves of a 256-bit vector.
        SIMD_INLINE __m256i Set(__m128i a)
        {
            const __m256i lower = _mm256_castsi128_si256(a);
            return _mm256_inserti128_si256(lower, a, 1);
        }

        // Builds a vector of A / sizeof(T) elements of type T and loads it as __m256i:
        // elements [0, position) are set to 'first', elements [position, size) to 'second'.
        // 'A' is defined outside this block (presumably the AVX2 register size in bytes).
        // Precondition: position <= size (asserted in debug builds).
        // The fill loop is bounded by the buffer size, so even when asserts are compiled out
        // an oversized 'position' cannot write past the temporary buffer (the result is then
        // simply all 'first').
        template <class T> SIMD_INLINE __m256i SetMask(T first, size_t position, T second)
        {
            const size_t size = A / sizeof(T);
            assert(position <= size);
            T mask[size];
            for (size_t i = 0; i < size; ++i)
                mask[i] = i < position ? first : second;
            return _mm256_loadu_si256((__m256i*)mask);
        }

        //-------------------------------------------------------------------------------------------------

        // Writes 256 zero bits (sixteen uint16_t values) to 'dst' with an unaligned store.
        SIMD_INLINE void SetZero(uint16_t* dst)
        {
            const __m256i zero = _mm256_setzero_si256();
            __m256i* const out = (__m256i*)dst;
            _mm256_storeu_si256(out, zero);
        }

        // Writes two consecutive 256-bit zero vectors (thirty-two uint16_t values in total)
        // starting at 'dst', using unaligned stores.
        SIMD_INLINE void SetZero2(uint16_t* dst)
        {
            __m256i* const out = (__m256i*)dst;
            for (int block = 0; block < 2; ++block)
                _mm256_storeu_si256(out + block, _mm256_setzero_si256());
        }
    }
#endif

#ifdef SIMD_AVX512BW_ENABLE
    namespace Avx512bw
    {
        // Returns a 512-bit vector whose bytes alternate a0, a1, a0, a1, ...
        // (a0 in even byte positions, a1 in odd ones).
        SIMD_INLINE __m512i SetInt8(char a0, char a1)
        {
            const __m512i even = _mm512_set1_epi8(a0);
            const __m512i odd = _mm512_set1_epi8(a1);
            return _mm512_unpacklo_epi8(even, odd);
        }

        // Returns a 512-bit vector whose 16-bit lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m512i SetInt16(short a0, short a1)
        {
            const __m512i even = _mm512_set1_epi16(a0);
            const __m512i odd = _mm512_set1_epi16(a1);
            return _mm512_unpacklo_epi16(even, odd);
        }

        // Returns a 512-bit vector whose 32-bit lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m512i SetInt32(int a0, int a1)
        {
            const __m512i even = _mm512_set1_epi32(a0);
            const __m512i odd = _mm512_set1_epi32(a1);
            return _mm512_unpacklo_epi32(even, odd);
        }

        // Returns a 512-bit float vector whose lanes alternate a0, a1, a0, a1, ...
        SIMD_INLINE __m512 SetFloat(float a0, float a1)
        {
            const __m512 even = _mm512_set1_ps(a0);
            const __m512 odd = _mm512_set1_ps(a1);
            return _mm512_unpacklo_ps(even, odd);
        }

        // Combines two 256-bit float vectors into one 512-bit vector:
        // a0 becomes the lower half, a1 the upper half.
        SIMD_INLINE __m512 Set(__m256 a0, __m256 a1)
        {
            const __m512 lower = _mm512_castps256_ps512(a0);
            return _mm512_insertf32x8(lower, a1, 1);
        }

        // Combines two 256-bit integer vectors into one 512-bit vector:
        // a0 becomes the lower half, a1 the upper half.
        SIMD_INLINE __m512i Set(__m256i a0, __m256i a1)
        {
            const __m512i lower = _mm512_castsi256_si512(a0);
            return _mm512_inserti32x8(lower, a1, 1);
        }

        // Assembles a 512-bit integer vector from four 128-bit parts:
        // a0 fills 128-bit lane 0 (lowest), a1 lane 1, a2 lane 2 and a3 lane 3 (highest).
        SIMD_INLINE __m512i Set(const __m128i& a0, const __m128i& a1, const __m128i& a2, const __m128i& a3)
        {
            __m512i result = _mm512_castsi128_si512(a0);
            result = _mm512_inserti32x4(result, a1, 1);
            result = _mm512_inserti32x4(result, a2, 2);
            result = _mm512_inserti32x4(result, a3, 3);
            return result;
        }

        //-------------------------------------------------------------------------------------------------

        // Writes 0.0f to those of the 16 floats at 'dst' whose bit is set in 'mask'
        // (all 16 by default); elements with a cleared mask bit are left unchanged.
        SIMD_INLINE void SetZero(float* dst, __mmask16 mask = __mmask16(-1))
        {
            const __m512 zero = _mm512_setzero_ps();
            _mm512_mask_storeu_ps(dst, mask, zero);
        }

        //-------------------------------------------------------------------------------------------------

        // Writes zero to those of the 32 uint16_t values at 'dst' whose bit is set in 'mask'
        // (all 32 by default); elements with a cleared mask bit are left unchanged.
        SIMD_INLINE void SetZero(uint16_t* dst, __mmask32 mask = __mmask32(-1))
        {
            const __m512i zero = _mm512_setzero_si512();
            _mm512_mask_storeu_epi16(dst, mask, zero);
        }

        // Zeroes 'size32' uint16_t values at 'dst' in full 512-bit steps (32 elements each),
        // then, if 'tail' is non-zero, zeroes the elements selected by 'tail' in the next
        // 32-element group. The two-argument overload passes 'size32' as a multiple of 32.
        SIMD_INLINE void SetZeros(uint16_t* dst, size_t size32, __mmask32 tail)
        {
            const __m512i zero = _mm512_setzero_si512();
            size_t offset = 0;
            while (offset < size32)
            {
                _mm512_storeu_si512(dst + offset, zero);
                offset += 32;
            }
            if (tail != 0)
                _mm512_mask_storeu_epi16(dst + offset, tail, zero);
        }

        // Zeroes exactly 'size' uint16_t values at 'dst': splits the range into a part that is
        // a multiple of 32 elements and a remainder covered by a mask with the low bits set.
        SIMD_INLINE void SetZeros(uint16_t* dst, size_t size)
        {
            const size_t rest = size % 32;
            const size_t body = size - rest;
            __mmask32 tailMask = 0;
            if (rest)
                tailMask = __mmask32(-1) >> (32 - rest);
            SetZeros(dst, body, tailMask);
        }

        //-------------------------------------------------------------------------------------------------

        // Stores the bytes of 'zero' to those of the 64 bytes at 'dst' whose bit is set in
        // 'mask' (all 64 by default); bytes with a cleared mask bit are left unchanged.
        // 'zero' defaults to an all-zero vector; passing it in lets a caller reuse one register.
        SIMD_INLINE void SetZero(uint8_t* dst, __m512i zero = _mm512_setzero_si512(), __mmask64 mask = __mmask64(-1))
        {
            _mm512_mask_storeu_epi8(dst, mask, zero);
        }

        // Stores 'zero' over 'size64' bytes at 'dst' in full 512-bit steps (64 bytes each),
        // then, if 'tail' is non-zero, stores the bytes selected by 'tail' in the next
        // 64-byte group. The three-argument overload passes 'size64' as a multiple of 64.
        SIMD_INLINE void SetZeros(uint8_t* dst, __m512i zero, size_t size64, __mmask64 tail)
        {
            size_t offset = 0;
            while (offset < size64)
            {
                _mm512_storeu_si512(dst + offset, zero);
                offset += 64;
            }
            if (tail != 0)
                _mm512_mask_storeu_epi8(dst + offset, tail, zero);
        }

        // Stores 'zero' over exactly 'size' bytes at 'dst': splits the range into a part that
        // is a multiple of 64 bytes and a remainder covered by a mask with the low bits set.
        SIMD_INLINE void SetZeros(uint8_t* dst, __m512i zero, size_t size)
        {
            const size_t rest = size % 64;
            const size_t body = size - rest;
            __mmask64 tailMask = 0;
            if (rest)
                tailMask = __mmask64(-1) >> (64 - rest);
            SetZeros(dst, zero, body, tailMask);
        }
    }
#endif

#ifdef SIMD_NEON_ENABLE
    namespace Neon
    {
        // Packs four floats into a NEON vector via a temporary array:
        // a0 lands in lane 0, a1 in lane 1, a2 in lane 2, a3 in lane 3.
        SIMD_INLINE float32x4_t SetF32(float a0, float a1, float a2, float a3)
        {
            float lanes[4];
            lanes[0] = a0;
            lanes[1] = a1;
            lanes[2] = a2;
            lanes[3] = a3;
            return vld1q_f32(lanes);
        }

        // Packs four 32-bit integers into a NEON vector via a temporary array:
        // a0 lands in lane 0, a1 in lane 1, a2 in lane 2, a3 in lane 3.
        SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3)
        {
            int32_t lanes[4];
            lanes[0] = a0;
            lanes[1] = a1;
            lanes[2] = a2;
            lanes[3] = a3;
            return vld1q_s32(lanes);
        }
    }
#endif
}

#endif

Coverage Report

Created: 2025-12-10 07:04

Line	Count	Source
1		/*
2		* Simd Library (http://ermig1979.github.io/Simd).
3		*
4		* Copyright (c) 2011-2025 Yermalayeu Ihar.
5		*
6		* Permission is hereby granted, free of charge, to any person obtaining a copy
7		* of this software and associated documentation files (the "Software"), to deal
8		* in the Software without restriction, including without limitation the rights
9		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10		* copies of the Software, and to permit persons to whom the Software is
11		* furnished to do so, subject to the following conditions:
12		*
13		* The above copyright notice and this permission notice shall be included in
14		* all copies or substantial portions of the Software.
15		*
16		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24		#ifndef __SimdSet_h__
25		#define __SimdSet_h__
26
27		#include "Simd/SimdDefs.h"
28		#include "Simd/SimdConst.h"
29
30		namespace Simd
31		{
32		namespace Base
33		{
34		SIMD_INLINE void SetZero(uint16_t* dst, size_t size)
35	0	{
36	0	for (size_t i = 0; i < size; ++i)
37	0	dst[i] = 0;
38	0	}
39		}
40
41		#ifdef SIMD_SSE41_ENABLE
42		namespace Sse41
43		{
44		SIMD_INLINE __m128i SetInt8(char a0, char a1)
45	0	{
46	0	return _mm_unpacklo_epi8(_mm_set1_epi8(a0), _mm_set1_epi8(a1));
47	0	}
48
49		SIMD_INLINE __m128i SetInt16(short a0, short a1)
50	0	{
51	0	return _mm_unpacklo_epi16(_mm_set1_epi16(a0), _mm_set1_epi16(a1));
52	0	}
53
54		SIMD_INLINE __m128i SetInt32(int a0, int a1)
55	0	{
56	0	return _mm_unpacklo_epi32(_mm_set1_epi32(a0), _mm_set1_epi32(a1));
57	0	}
58
59		SIMD_INLINE __m128 SetFloat(float a0, float a1)
60	0	{
61	0	return _mm_unpacklo_ps(_mm_set_ps1(a0), _mm_set_ps1(a1));
62	0	}
63
64		//-------------------------------------------------------------------------------------------------
65
66		SIMD_INLINE void SetZero(uint16_t* dst)
67	0	{
68	0	_mm_storeu_si128((__m128i*)dst, _mm_setzero_si128());
69	0	}
70		}
71		#endif
72
73		#ifdef SIMD_AVX2_ENABLE
74		namespace Avx2
75		{
76		SIMD_INLINE __m256 Set(__m128 a0, __m128 a1)
77	0	{
78	0	return _mm256_insertf128_ps(_mm256_castps128_ps256(a0), a1, 1);
79	0	}
80
81		SIMD_INLINE __m256 Set(__m128 a)
82	0	{
83	0	return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 1);
84	0	}
85
86		SIMD_INLINE __m256i SetInt8(char a0, char a1)
87	0	{
88	0	return _mm256_unpacklo_epi8(_mm256_set1_epi8(a0), _mm256_set1_epi8(a1));
89	0	}
90
91		SIMD_INLINE __m256i SetInt16(short a0, short a1)
92	0	{
93	0	return _mm256_unpacklo_epi16(_mm256_set1_epi16(a0), _mm256_set1_epi16(a1));
94	0	}
95
96		SIMD_INLINE __m256i SetInt32(int a0, int a1)
97	0	{
98	0	return _mm256_unpacklo_epi32(_mm256_set1_epi32(a0), _mm256_set1_epi32(a1));
99	0	}
100
101		SIMD_INLINE __m256 SetFloat(float a0, float a1)
102	0	{
103	0	return _mm256_unpacklo_ps(_mm256_set1_ps(a0), _mm256_set1_ps(a1));
104	0	}
105
106		SIMD_INLINE __m256i Set(__m128i a0, __m128i a1)
107	0	{
108	0	return _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a1, 1);
109	0	}
110
111		SIMD_INLINE __m256i Set(__m128i a)
112	0	{
113	0	return _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 1);
114	0	}
115
116		template <class T> SIMD_INLINE __m256i SetMask(T first, size_t position, T second)
117	0	{
118	0	const size_t size = A / sizeof(T);
119	0	assert(position <= size);
120	0	T mask[size];
121	0	for (size_t i = 0; i < position; ++i)
122	0	mask[i] = first;
123	0	for (size_t i = position; i < size; ++i)
124	0	mask[i] = second;
125	0	return _mm256_loadu_si256((__m256i*)mask);
126	0	} Unexecuted instantiation: long long __vector(4) Simd::Avx2::SetMask<unsigned char>(unsigned char, unsigned long, unsigned char) Unexecuted instantiation: long long __vector(4) Simd::Avx2::SetMask<unsigned short>(unsigned short, unsigned long, unsigned short)
127
128		//-------------------------------------------------------------------------------------------------
129
130		SIMD_INLINE void SetZero(uint16_t* dst)
131	0	{
132	0	_mm256_storeu_si256((__m256i*)dst, _mm256_setzero_si256());
133	0	}
134
135		SIMD_INLINE void SetZero2(uint16_t* dst)
136	0	{
137	0	_mm256_storeu_si256((__m256i*)dst + 0, _mm256_setzero_si256());
138	0	_mm256_storeu_si256((__m256i*)dst + 1, _mm256_setzero_si256());
139	0	}
140		}
141		#endif
142
143		#ifdef SIMD_AVX512BW_ENABLE
144		namespace Avx512bw
145		{
146		SIMD_INLINE __m512i SetInt8(char a0, char a1)
147	0	{
148	0	return _mm512_unpacklo_epi8(_mm512_set1_epi8(a0), _mm512_set1_epi8(a1));
149	0	}
150
151		SIMD_INLINE __m512i SetInt16(short a0, short a1)
152	0	{
153	0	return _mm512_unpacklo_epi16(_mm512_set1_epi16(a0), _mm512_set1_epi16(a1));
154	0	}
155
156		SIMD_INLINE __m512i SetInt32(int a0, int a1)
157	0	{
158	0	return _mm512_unpacklo_epi32(_mm512_set1_epi32(a0), _mm512_set1_epi32(a1));
159	0	}
160
161		SIMD_INLINE __m512 SetFloat(float a0, float a1)
162	0	{
163	0	return _mm512_unpacklo_ps(_mm512_set1_ps(a0), _mm512_set1_ps(a1));
164	0	}
165
166		SIMD_INLINE __m512 Set(__m256 a0, __m256 a1)
167	0	{
168	0	return _mm512_insertf32x8(_mm512_castps256_ps512(a0), a1, 1);
169	0	}
170
171		SIMD_INLINE __m512i Set(__m256i a0, __m256i a1)
172	0	{
173	0	return _mm512_inserti32x8(_mm512_castsi256_si512(a0), a1, 1);
174	0	}
175
176		SIMD_INLINE __m512i Set(const __m128i& a0, const __m128i& a1, const __m128i& a2, const __m128i& a3)
177	0	{
178	0	return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a0), a1, 1), a2, 2), a3, 3);
179	0	}
180
181		//-------------------------------------------------------------------------------------------------
182
183		SIMD_INLINE void SetZero(float* dst, __mmask16 mask = __mmask16(-1))
184	0	{
185	0	_mm512_mask_storeu_ps(dst, mask, _mm512_setzero_ps());
186	0	}
187
188		//-------------------------------------------------------------------------------------------------
189
190		SIMD_INLINE void SetZero(uint16_t* dst, __mmask32 mask = __mmask32(-1))
191	0	{
192	0	_mm512_mask_storeu_epi16(dst, mask, _mm512_setzero_si512());
193	0	}
194
195		SIMD_INLINE void SetZeros(uint16_t* dst, size_t size32, __mmask32 tail)
196	0	{
197	0	size_t i = 0;
198	0	__m512i zero = _mm512_setzero_si512();
199	0	for (; i < size32; i += 32)
200	0	_mm512_storeu_si512(dst + i, zero);
201	0	if (tail)
202	0	_mm512_mask_storeu_epi16(dst + i, tail, zero);
203	0	}
204
205		SIMD_INLINE void SetZeros(uint16_t* dst, size_t size)
206	0	{
207	0	size_t tail = size & 31;
208	0	SetZeros(dst, size & (~31), tail ? __mmask32(-1) >> (32 - tail) : 0);
209	0	}
210
211		//-------------------------------------------------------------------------------------------------
212
213		SIMD_INLINE void SetZero(uint8_t* dst, __m512i zero = _mm512_setzero_si512(), __mmask64 mask = __mmask64(-1))
214	0	{
215	0	_mm512_mask_storeu_epi8(dst, mask, zero);
216	0	}
217
218		SIMD_INLINE void SetZeros(uint8_t* dst, __m512i zero, size_t size64, __mmask64 tail)
219	0	{
220	0	size_t i = 0;
221	0	for (; i < size64; i += 64)
222	0	_mm512_storeu_si512(dst + i, zero);
223	0	if (tail)
224	0	_mm512_mask_storeu_epi8(dst + i, tail, zero);
225	0	}
226
227		SIMD_INLINE void SetZeros(uint8_t* dst, __m512i zero, size_t size)
228	0	{
229	0	size_t tail = size & 63;
230	0	SetZeros(dst, zero, size & (~63), tail ? __mmask64(-1) >> (64 - tail) : 0);
231	0	}
232		}
233		#endif
234
235		#ifdef SIMD_NEON_ENABLE
236		namespace Neon
237		{
238		SIMD_INLINE float32x4_t SetF32(float a0, float a1, float a2, float a3)
239		{
240		const float a[4] = { a0, a1, a2, a3 };
241		return vld1q_f32(a);
242		}
243
244		SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3)
245		{
246		const int32_t a[4] = { a0, a1, a2, a3 };
247		return vld1q_s32(a);
248		}
249		}
250		#endif
251		}
252
253		#endif