/src/Simd/src/Simd/SimdSet.h
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2025 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #ifndef __SimdSet_h__ |
25 | | #define __SimdSet_h__ |
26 | | |
27 | | #include "Simd/SimdDefs.h" |
28 | | #include "Simd/SimdConst.h" |
29 | | |
30 | | namespace Simd |
31 | | { |
namespace Base
{
    // Portable scalar fallback: writes zero to the `size` consecutive
    // uint16_t elements that start at `dst`. Does nothing when size == 0.
    SIMD_INLINE void SetZero(uint16_t* dst, size_t size)
    {
        for (uint16_t* const end = dst + size; dst != end; ++dst)
            *dst = 0;
    }
}
40 | | |
41 | | #ifdef SIMD_SSE41_ENABLE |
42 | | namespace Sse41 |
43 | | { |
44 | | SIMD_INLINE __m128i SetInt8(char a0, char a1) |
45 | 0 | { |
46 | 0 | return _mm_unpacklo_epi8(_mm_set1_epi8(a0), _mm_set1_epi8(a1)); |
47 | 0 | } |
48 | | |
49 | | SIMD_INLINE __m128i SetInt16(short a0, short a1) |
50 | 0 | { |
51 | 0 | return _mm_unpacklo_epi16(_mm_set1_epi16(a0), _mm_set1_epi16(a1)); |
52 | 0 | } |
53 | | |
54 | | SIMD_INLINE __m128i SetInt32(int a0, int a1) |
55 | 0 | { |
56 | 0 | return _mm_unpacklo_epi32(_mm_set1_epi32(a0), _mm_set1_epi32(a1)); |
57 | 0 | } |
58 | | |
59 | | SIMD_INLINE __m128 SetFloat(float a0, float a1) |
60 | 0 | { |
61 | 0 | return _mm_unpacklo_ps(_mm_set_ps1(a0), _mm_set_ps1(a1)); |
62 | 0 | } |
63 | | |
64 | | //------------------------------------------------------------------------------------------------- |
65 | | |
66 | | SIMD_INLINE void SetZero(uint16_t* dst) |
67 | 0 | { |
68 | 0 | _mm_storeu_si128((__m128i*)dst, _mm_setzero_si128()); |
69 | 0 | } |
70 | | } |
71 | | #endif |
72 | | |
73 | | #ifdef SIMD_AVX2_ENABLE |
74 | | namespace Avx2 |
75 | | { |
76 | | SIMD_INLINE __m256 Set(__m128 a0, __m128 a1) |
77 | 0 | { |
78 | 0 | return _mm256_insertf128_ps(_mm256_castps128_ps256(a0), a1, 1); |
79 | 0 | } |
80 | | |
81 | | SIMD_INLINE __m256 Set(__m128 a) |
82 | 0 | { |
83 | 0 | return _mm256_insertf128_ps(_mm256_castps128_ps256(a), a, 1); |
84 | 0 | } |
85 | | |
86 | | SIMD_INLINE __m256i SetInt8(char a0, char a1) |
87 | 0 | { |
88 | 0 | return _mm256_unpacklo_epi8(_mm256_set1_epi8(a0), _mm256_set1_epi8(a1)); |
89 | 0 | } |
90 | | |
91 | | SIMD_INLINE __m256i SetInt16(short a0, short a1) |
92 | 0 | { |
93 | 0 | return _mm256_unpacklo_epi16(_mm256_set1_epi16(a0), _mm256_set1_epi16(a1)); |
94 | 0 | } |
95 | | |
96 | | SIMD_INLINE __m256i SetInt32(int a0, int a1) |
97 | 0 | { |
98 | 0 | return _mm256_unpacklo_epi32(_mm256_set1_epi32(a0), _mm256_set1_epi32(a1)); |
99 | 0 | } |
100 | | |
101 | | SIMD_INLINE __m256 SetFloat(float a0, float a1) |
102 | 0 | { |
103 | 0 | return _mm256_unpacklo_ps(_mm256_set1_ps(a0), _mm256_set1_ps(a1)); |
104 | 0 | } |
105 | | |
106 | | SIMD_INLINE __m256i Set(__m128i a0, __m128i a1) |
107 | 0 | { |
108 | 0 | return _mm256_inserti128_si256(_mm256_castsi128_si256(a0), a1, 1); |
109 | 0 | } |
110 | | |
111 | | SIMD_INLINE __m256i Set(__m128i a) |
112 | 0 | { |
113 | 0 | return _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 1); |
114 | 0 | } |
115 | | |
116 | | template <class T> SIMD_INLINE __m256i SetMask(T first, size_t position, T second) |
117 | 0 | { |
118 | 0 | const size_t size = A / sizeof(T); |
119 | 0 | assert(position <= size); |
120 | 0 | T mask[size]; |
121 | 0 | for (size_t i = 0; i < position; ++i) |
122 | 0 | mask[i] = first; |
123 | 0 | for (size_t i = position; i < size; ++i) |
124 | 0 | mask[i] = second; |
125 | 0 | return _mm256_loadu_si256((__m256i*)mask); |
126 | 0 | } Unexecuted instantiation: long long __vector(4) Simd::Avx2::SetMask<unsigned char>(unsigned char, unsigned long, unsigned char) Unexecuted instantiation: long long __vector(4) Simd::Avx2::SetMask<unsigned short>(unsigned short, unsigned long, unsigned short) |
127 | | |
128 | | //------------------------------------------------------------------------------------------------- |
129 | | |
130 | | SIMD_INLINE void SetZero(uint16_t* dst) |
131 | 0 | { |
132 | 0 | _mm256_storeu_si256((__m256i*)dst, _mm256_setzero_si256()); |
133 | 0 | } |
134 | | |
135 | | SIMD_INLINE void SetZero2(uint16_t* dst) |
136 | 0 | { |
137 | 0 | _mm256_storeu_si256((__m256i*)dst + 0, _mm256_setzero_si256()); |
138 | 0 | _mm256_storeu_si256((__m256i*)dst + 1, _mm256_setzero_si256()); |
139 | 0 | } |
140 | | } |
141 | | #endif |
142 | | |
143 | | #ifdef SIMD_AVX512BW_ENABLE |
144 | | namespace Avx512bw |
145 | | { |
146 | | SIMD_INLINE __m512i SetInt8(char a0, char a1) |
147 | 0 | { |
148 | 0 | return _mm512_unpacklo_epi8(_mm512_set1_epi8(a0), _mm512_set1_epi8(a1)); |
149 | 0 | } |
150 | | |
151 | | SIMD_INLINE __m512i SetInt16(short a0, short a1) |
152 | 0 | { |
153 | 0 | return _mm512_unpacklo_epi16(_mm512_set1_epi16(a0), _mm512_set1_epi16(a1)); |
154 | 0 | } |
155 | | |
156 | | SIMD_INLINE __m512i SetInt32(int a0, int a1) |
157 | 0 | { |
158 | 0 | return _mm512_unpacklo_epi32(_mm512_set1_epi32(a0), _mm512_set1_epi32(a1)); |
159 | 0 | } |
160 | | |
161 | | SIMD_INLINE __m512 SetFloat(float a0, float a1) |
162 | 0 | { |
163 | 0 | return _mm512_unpacklo_ps(_mm512_set1_ps(a0), _mm512_set1_ps(a1)); |
164 | 0 | } |
165 | | |
166 | | SIMD_INLINE __m512 Set(__m256 a0, __m256 a1) |
167 | 0 | { |
168 | 0 | return _mm512_insertf32x8(_mm512_castps256_ps512(a0), a1, 1); |
169 | 0 | } |
170 | | |
171 | | SIMD_INLINE __m512i Set(__m256i a0, __m256i a1) |
172 | 0 | { |
173 | 0 | return _mm512_inserti32x8(_mm512_castsi256_si512(a0), a1, 1); |
174 | 0 | } |
175 | | |
176 | | SIMD_INLINE __m512i Set(const __m128i& a0, const __m128i& a1, const __m128i& a2, const __m128i& a3) |
177 | 0 | { |
178 | 0 | return _mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(a0), a1, 1), a2, 2), a3, 3); |
179 | 0 | } |
180 | | |
181 | | //------------------------------------------------------------------------------------------------- |
182 | | |
183 | | SIMD_INLINE void SetZero(float* dst, __mmask16 mask = __mmask16(-1)) |
184 | 0 | { |
185 | 0 | _mm512_mask_storeu_ps(dst, mask, _mm512_setzero_ps()); |
186 | 0 | } |
187 | | |
188 | | //------------------------------------------------------------------------------------------------- |
189 | | |
190 | | SIMD_INLINE void SetZero(uint16_t* dst, __mmask32 mask = __mmask32(-1)) |
191 | 0 | { |
192 | 0 | _mm512_mask_storeu_epi16(dst, mask, _mm512_setzero_si512()); |
193 | 0 | } |
194 | | |
195 | | SIMD_INLINE void SetZeros(uint16_t* dst, size_t size32, __mmask32 tail) |
196 | 0 | { |
197 | 0 | size_t i = 0; |
198 | 0 | __m512i zero = _mm512_setzero_si512(); |
199 | 0 | for (; i < size32; i += 32) |
200 | 0 | _mm512_storeu_si512(dst + i, zero); |
201 | 0 | if (tail) |
202 | 0 | _mm512_mask_storeu_epi16(dst + i, tail, zero); |
203 | 0 | } |
204 | | |
205 | | SIMD_INLINE void SetZeros(uint16_t* dst, size_t size) |
206 | 0 | { |
207 | 0 | size_t tail = size & 31; |
208 | 0 | SetZeros(dst, size & (~31), tail ? __mmask32(-1) >> (32 - tail) : 0); |
209 | 0 | } |
210 | | |
211 | | //------------------------------------------------------------------------------------------------- |
212 | | |
213 | | SIMD_INLINE void SetZero(uint8_t* dst, __m512i zero = _mm512_setzero_si512(), __mmask64 mask = __mmask64(-1)) |
214 | 0 | { |
215 | 0 | _mm512_mask_storeu_epi8(dst, mask, zero); |
216 | 0 | } |
217 | | |
218 | | SIMD_INLINE void SetZeros(uint8_t* dst, __m512i zero, size_t size64, __mmask64 tail) |
219 | 0 | { |
220 | 0 | size_t i = 0; |
221 | 0 | for (; i < size64; i += 64) |
222 | 0 | _mm512_storeu_si512(dst + i, zero); |
223 | 0 | if (tail) |
224 | 0 | _mm512_mask_storeu_epi8(dst + i, tail, zero); |
225 | 0 | } |
226 | | |
227 | | SIMD_INLINE void SetZeros(uint8_t* dst, __m512i zero, size_t size) |
228 | 0 | { |
229 | 0 | size_t tail = size & 63; |
230 | 0 | SetZeros(dst, zero, size & (~63), tail ? __mmask64(-1) >> (64 - tail) : 0); |
231 | 0 | } |
232 | | } |
233 | | #endif |
234 | | |
235 | | #ifdef SIMD_NEON_ENABLE |
236 | | namespace Neon |
237 | | { |
238 | | SIMD_INLINE float32x4_t SetF32(float a0, float a1, float a2, float a3) |
239 | | { |
240 | | const float a[4] = { a0, a1, a2, a3 }; |
241 | | return vld1q_f32(a); |
242 | | } |
243 | | |
244 | | SIMD_INLINE int32x4_t SetI32(int32_t a0, int32_t a1, int32_t a2, int32_t a3) |
245 | | { |
246 | | const int32_t a[4] = { a0, a1, a2, a3 }; |
247 | | return vld1q_s32(a); |
248 | | } |
249 | | } |
250 | | #endif |
251 | | } |
252 | | |
253 | | #endif |