/src/gdal/third_party/libdivide/libdivide.h
Line | Count | Source (jump to first uncovered line) |
1 | | // libdivide.h - Optimized integer division |
2 | | // https://libdivide.com |
3 | | // |
4 | | // Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com> |
5 | | // Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com> |
6 | | // |
7 | | // libdivide is dual-licensed under the Boost or zlib licenses. |
8 | | // You may use libdivide under the terms of either of these. |
9 | | // See LICENSE.txt for more details. |
10 | | |
11 | | #ifndef LIBDIVIDE_H |
12 | | #define LIBDIVIDE_H |
13 | | |
14 | | // *** Version numbers are auto generated - do not edit *** |
15 | | #define LIBDIVIDE_VERSION "5.2.0" |
16 | | #define LIBDIVIDE_VERSION_MAJOR 5 |
17 | | #define LIBDIVIDE_VERSION_MINOR 2 |
18 | | #define LIBDIVIDE_VERSION_PATCH 0 |
19 | | |
20 | | #include <stdint.h> |
21 | | |
22 | | #if !defined(__AVR__) && __STDC_HOSTED__ != 0 |
23 | | #include <stdio.h> |
24 | | #include <stdlib.h> |
25 | | #endif |
26 | | |
27 | | #if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \ |
28 | | (defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L)) |
29 | | #include <limits.h> |
30 | | #include <type_traits> |
31 | | #define LIBDIVIDE_VC_CXX20 |
32 | | #endif |
33 | | |
34 | | #if defined(LIBDIVIDE_SSE2) |
35 | | #include <emmintrin.h> |
36 | | #endif |
37 | | |
38 | | #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) |
39 | | #include <immintrin.h> |
40 | | #endif |
41 | | |
42 | | #if defined(LIBDIVIDE_NEON) |
43 | | #include <arm_neon.h> |
44 | | #endif |
45 | | |
46 | | // Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics |
47 | | #if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \ |
48 | | (defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)) |
49 | | #define LIBDIVIDE_MULH_INTRINSICS |
50 | | #endif |
51 | | |
52 | | #if defined(_MSC_VER) |
53 | | #if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__) |
54 | | #include <intrin.h> |
55 | | #endif |
56 | | #ifndef __clang__ |
57 | | #pragma warning(push) |
58 | | // 4146: unary minus operator applied to unsigned type, result still unsigned |
59 | | #pragma warning(disable : 4146) |
60 | | |
61 | | // 4204: nonstandard extension used : non-constant aggregate initializer |
62 | | #pragma warning(disable : 4204) |
63 | | #endif |
64 | | #define LIBDIVIDE_VC |
65 | | #endif |
66 | | |
67 | | #if !defined(__has_builtin) |
68 | | #define __has_builtin(x) 0 |
69 | | #endif |
70 | | |
71 | | #if defined(__SIZEOF_INT128__) |
72 | | #define HAS_INT128_T |
73 | | // clang-cl on Windows does not yet support 128-bit division |
74 | | #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) |
75 | | #define HAS_INT128_DIV |
76 | | #endif |
77 | | #endif |
78 | | |
79 | | #if defined(__x86_64__) || defined(_M_X64) |
80 | | #define LIBDIVIDE_X86_64 |
81 | | #endif |
82 | | |
83 | | #if defined(__i386__) |
84 | | #define LIBDIVIDE_i386 |
85 | | #endif |
86 | | |
87 | | #if defined(__GNUC__) || defined(__clang__) |
88 | | #define LIBDIVIDE_GCC_STYLE_ASM |
89 | | #endif |
90 | | |
91 | | #if defined(__cplusplus) || defined(LIBDIVIDE_VC) |
92 | 0 | #define LIBDIVIDE_FUNCTION __FUNCTION__ |
93 | | #else |
94 | | #define LIBDIVIDE_FUNCTION __func__ |
95 | | #endif |
96 | | |
97 | | // Set up forced inlining if possible. |
98 | | // We need both the attribute and keyword to avoid "might not be inlineable" warnings. |
99 | | #ifdef __has_attribute |
100 | | #if __has_attribute(always_inline) |
101 | | #define LIBDIVIDE_INLINE __attribute__((always_inline)) inline |
102 | | #endif |
103 | | #endif |
104 | | #ifndef LIBDIVIDE_INLINE |
105 | | #ifdef _MSC_VER |
106 | | #define LIBDIVIDE_INLINE __forceinline |
107 | | #else |
108 | | #define LIBDIVIDE_INLINE inline |
109 | | #endif |
110 | | #endif |
111 | | |
112 | | #if defined(__AVR__) || __STDC_HOSTED__ == 0 |
113 | | #define LIBDIVIDE_ERROR(msg) |
114 | | #else |
115 | | #define LIBDIVIDE_ERROR(msg) \ |
116 | 0 | do { \ |
117 | 0 | fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ |
118 | 0 | abort(); \ |
119 | 0 | } while (0) |
120 | | #endif |
121 | | |
122 | | #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0 |
123 | | #define LIBDIVIDE_ASSERT(x) \ |
124 | | do { \ |
125 | | if (!(x)) { \ |
126 | | fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \ |
127 | | LIBDIVIDE_FUNCTION, #x); \ |
128 | | abort(); \ |
129 | | } \ |
130 | | } while (0) |
131 | | #else |
132 | | #define LIBDIVIDE_ASSERT(x) |
133 | | #endif |
134 | | |
135 | | #ifdef __cplusplus |
136 | | |
137 | | // For constexpr zero initialization, c++11 might handle things ok, |
138 | | // but just limit to at least c++14 to ensure we don't break anyone's code: |
139 | | |
140 | | // Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr |
141 | | #if defined(__cpp_constexpr) && (__cpp_constexpr >= 201304L) |
142 | | #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE |
143 | | |
144 | | // Supposedly, MSVC might not implement feature test macros right: |
145 | | // https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c |
146 | | // so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS |
147 | | // 2017 15.0 (for extended constexpr support: |
148 | | // https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170) |
149 | | #elif (defined(_MSC_VER) && _MSC_VER >= 1910) && (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) |
150 | | #define LIBDIVIDE_CONSTEXPR constexpr LIBDIVIDE_INLINE |
151 | | |
152 | | #else |
153 | | #define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE |
154 | | #endif |
155 | | |
156 | | namespace libdivide { |
157 | | #endif |
158 | | |
159 | | #if defined(_MSC_VER) && !defined(__clang__) |
160 | | #if defined(LIBDIVIDE_VC_CXX20) |
161 | | static LIBDIVIDE_CONSTEXPR int __builtin_clz(unsigned x) { |
162 | | if (std::is_constant_evaluated()) { |
163 | | for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) { |
164 | | if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i; |
165 | | } |
166 | | return sizeof(x) * CHAR_BIT; |
167 | | } |
168 | | #else |
169 | | static LIBDIVIDE_INLINE int __builtin_clz(unsigned x) { |
170 | | #endif |
171 | | #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) |
172 | | return (int)_CountLeadingZeros(x); |
173 | | #elif defined(__AVX2__) || defined(__LZCNT__) |
174 | | return (int)_lzcnt_u32(x); |
175 | | #else |
176 | | unsigned long r; |
177 | | _BitScanReverse(&r, x); |
178 | | return (int)(r ^ 31); |
179 | | #endif |
180 | | } |
181 | | |
182 | | #if defined(LIBDIVIDE_VC_CXX20) |
183 | | static LIBDIVIDE_CONSTEXPR int __builtin_clzll(unsigned long long x) { |
184 | | if (std::is_constant_evaluated()) { |
185 | | for (int i = 0; i < sizeof(x) * CHAR_BIT; ++i) { |
186 | | if (x >> (sizeof(x) * CHAR_BIT - 1 - i)) return i; |
187 | | } |
188 | | return sizeof(x) * CHAR_BIT; |
189 | | } |
190 | | #else |
191 | | static LIBDIVIDE_INLINE int __builtin_clzll(unsigned long long x) { |
192 | | #endif |
193 | | #if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) |
194 | | return (int)_CountLeadingZeros64(x); |
195 | | #elif defined(_WIN64) |
196 | | #if defined(__AVX2__) || defined(__LZCNT__) |
197 | | return (int)_lzcnt_u64(x); |
198 | | #else |
199 | | unsigned long r; |
200 | | _BitScanReverse64(&r, x); |
201 | | return (int)(r ^ 63); |
202 | | #endif |
203 | | #else |
204 | | int l = __builtin_clz((unsigned)x) + 32; |
205 | | int h = __builtin_clz((unsigned)(x >> 32)); |
206 | | return !!((unsigned)(x >> 32)) ? h : l; |
207 | | #endif |
208 | | } |
209 | | #endif // defined(_MSC_VER) && !defined(__clang__) |
210 | | |
211 | | // pack divider structs to prevent compilers from padding. |
212 | | // This reduces memory usage by up to 43% when using a large |
213 | | // array of libdivide dividers and improves performance |
214 | | // by up to 10% because of reduced memory bandwidth. |
215 | | #pragma pack(push, 1) |
216 | | |
217 | | struct libdivide_u16_t { |
218 | | uint16_t magic; |
219 | | uint8_t more; |
220 | | }; |
221 | | |
222 | | struct libdivide_s16_t { |
223 | | int16_t magic; |
224 | | uint8_t more; |
225 | | }; |
226 | | |
227 | | struct libdivide_u32_t { |
228 | | uint32_t magic; |
229 | | uint8_t more; |
230 | | }; |
231 | | |
232 | | struct libdivide_s32_t { |
233 | | int32_t magic; |
234 | | uint8_t more; |
235 | | }; |
236 | | |
237 | | struct libdivide_u64_t { |
238 | | uint64_t magic; |
239 | | uint8_t more; |
240 | | }; |
241 | | |
242 | | struct libdivide_s64_t { |
243 | | int64_t magic; |
244 | | uint8_t more; |
245 | | }; |
246 | | |
247 | | struct libdivide_u16_branchfree_t { |
248 | | uint16_t magic; |
249 | | uint8_t more; |
250 | | }; |
251 | | |
252 | | struct libdivide_s16_branchfree_t { |
253 | | int16_t magic; |
254 | | uint8_t more; |
255 | | }; |
256 | | |
257 | | struct libdivide_u32_branchfree_t { |
258 | | uint32_t magic; |
259 | | uint8_t more; |
260 | | }; |
261 | | |
262 | | struct libdivide_s32_branchfree_t { |
263 | | int32_t magic; |
264 | | uint8_t more; |
265 | | }; |
266 | | |
267 | | struct libdivide_u64_branchfree_t { |
268 | | uint64_t magic; |
269 | | uint8_t more; |
270 | | }; |
271 | | |
272 | | struct libdivide_s64_branchfree_t { |
273 | | int64_t magic; |
274 | | uint8_t more; |
275 | | }; |
276 | | |
277 | | #pragma pack(pop) |
278 | | |
279 | | // Explanation of the "more" field: |
280 | | // |
281 | | // * Bits 0-5 is the shift value (for shift path or mult path). |
282 | | // * Bit 6 is the add indicator for mult path. |
283 | | // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative |
284 | | // divisor indicator so that we can efficiently use sign extension to |
285 | | // create a bitmask with all bits set to 1 (if the divisor is negative) |
286 | | // or 0 (if the divisor is positive). |
287 | | // |
288 | | // u32: [0-4] shift value |
289 | | // [5] ignored |
290 | | // [6] add indicator |
291 | | // magic number of 0 indicates shift path |
292 | | // |
293 | | // s32: [0-4] shift value |
294 | | // [5] ignored |
295 | | // [6] add indicator |
296 | | // [7] indicates negative divisor |
297 | | // magic number of 0 indicates shift path |
298 | | // |
299 | | // u64: [0-5] shift value |
300 | | // [6] add indicator |
301 | | // magic number of 0 indicates shift path |
302 | | // |
303 | | // s64: [0-5] shift value |
304 | | // [6] add indicator |
305 | | // [7] indicates negative divisor |
306 | | // magic number of 0 indicates shift path |
307 | | // |
308 | | // In s32 and s64 branchfree modes, the magic number is negated according to |
309 | | // whether the divisor is negated. In branchfree strategy, it is not negated. |
310 | | |
311 | | enum { |
312 | | LIBDIVIDE_16_SHIFT_MASK = 0x1F, |
313 | | LIBDIVIDE_32_SHIFT_MASK = 0x1F, |
314 | | LIBDIVIDE_64_SHIFT_MASK = 0x3F, |
315 | | LIBDIVIDE_ADD_MARKER = 0x40, |
316 | | LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 |
317 | | }; |
318 | | |
319 | | static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); |
320 | | static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); |
321 | | static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); |
322 | | static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); |
323 | | static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); |
324 | | static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); |
325 | | |
326 | | static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d); |
327 | | static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); |
328 | | static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); |
329 | | static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); |
330 | | static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); |
331 | | static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); |
332 | | |
333 | | static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( |
334 | | int16_t numer, int16_t magic, uint8_t more); |
335 | | static LIBDIVIDE_INLINE int16_t libdivide_s16_do( |
336 | | int16_t numer, const struct libdivide_s16_t *denom); |
337 | | static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( |
338 | | uint16_t numer, uint16_t magic, uint8_t more); |
339 | | static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( |
340 | | uint16_t numer, const struct libdivide_u16_t *denom); |
341 | | static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw( |
342 | | int32_t numer, int32_t magic, uint8_t more); |
343 | | static LIBDIVIDE_INLINE int32_t libdivide_s32_do( |
344 | | int32_t numer, const struct libdivide_s32_t *denom); |
345 | | static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw( |
346 | | uint32_t numer, uint32_t magic, uint8_t more); |
347 | | static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( |
348 | | uint32_t numer, const struct libdivide_u32_t *denom); |
349 | | static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw( |
350 | | int64_t numer, int64_t magic, uint8_t more); |
351 | | static LIBDIVIDE_INLINE int64_t libdivide_s64_do( |
352 | | int64_t numer, const struct libdivide_s64_t *denom); |
353 | | static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw( |
354 | | uint64_t numer, uint64_t magic, uint8_t more); |
355 | | static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( |
356 | | uint64_t numer, const struct libdivide_u64_t *denom); |
357 | | |
358 | | static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( |
359 | | int16_t numer, const struct libdivide_s16_branchfree_t *denom); |
360 | | static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( |
361 | | uint16_t numer, const struct libdivide_u16_branchfree_t *denom); |
362 | | static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( |
363 | | int32_t numer, const struct libdivide_s32_branchfree_t *denom); |
364 | | static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( |
365 | | uint32_t numer, const struct libdivide_u32_branchfree_t *denom); |
366 | | static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( |
367 | | int64_t numer, const struct libdivide_s64_branchfree_t *denom); |
368 | | static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( |
369 | | uint64_t numer, const struct libdivide_u64_branchfree_t *denom); |
370 | | |
371 | | static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom); |
372 | | static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom); |
373 | | static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); |
374 | | static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); |
375 | | static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); |
376 | | static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); |
377 | | |
378 | | static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( |
379 | | const struct libdivide_s16_branchfree_t *denom); |
380 | | static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( |
381 | | const struct libdivide_u16_branchfree_t *denom); |
382 | | static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( |
383 | | const struct libdivide_s32_branchfree_t *denom); |
384 | | static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( |
385 | | const struct libdivide_u32_branchfree_t *denom); |
386 | | static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( |
387 | | const struct libdivide_s64_branchfree_t *denom); |
388 | | static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( |
389 | | const struct libdivide_u64_branchfree_t *denom); |
390 | | |
391 | | //////// Internal Utility Functions |
392 | | |
393 | 0 | static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { |
394 | 0 | uint32_t xl = x, yl = y; |
395 | 0 | uint32_t rl = xl * yl; |
396 | 0 | return (uint16_t)(rl >> 16); |
397 | 0 | } |
398 | | |
399 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { |
400 | 0 | int32_t xl = x, yl = y; |
401 | 0 | int32_t rl = xl * yl; |
402 | 0 | // needs to be arithmetic shift |
403 | 0 | return (int16_t)(rl >> 16); |
404 | 0 | } |
405 | | |
406 | 0 | static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { |
407 | 0 | uint64_t xl = x, yl = y; |
408 | 0 | uint64_t rl = xl * yl; |
409 | 0 | return (uint32_t)(rl >> 32); |
410 | 0 | } |
411 | | |
412 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { |
413 | 0 | int64_t xl = x, yl = y; |
414 | 0 | int64_t rl = xl * yl; |
415 | 0 | // needs to be arithmetic shift |
416 | 0 | return (int32_t)(rl >> 32); |
417 | 0 | } |
418 | | |
419 | 0 | static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { |
420 | 0 | #if defined(LIBDIVIDE_MULH_INTRINSICS) |
421 | 0 | return __umulh(x, y); |
422 | 0 | #elif defined(HAS_INT128_T) |
423 | 0 | __uint128_t xl = x, yl = y; |
424 | 0 | __uint128_t rl = xl * yl; |
425 | 0 | return (uint64_t)(rl >> 64); |
426 | 0 | #else |
427 | 0 | // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) |
428 | 0 | uint32_t mask = 0xFFFFFFFF; |
429 | 0 | uint32_t x0 = (uint32_t)(x & mask); |
430 | 0 | uint32_t x1 = (uint32_t)(x >> 32); |
431 | 0 | uint32_t y0 = (uint32_t)(y & mask); |
432 | 0 | uint32_t y1 = (uint32_t)(y >> 32); |
433 | 0 | uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); |
434 | 0 | uint64_t x0y1 = x0 * (uint64_t)y1; |
435 | 0 | uint64_t x1y0 = x1 * (uint64_t)y0; |
436 | 0 | uint64_t x1y1 = x1 * (uint64_t)y1; |
437 | 0 | uint64_t temp = x1y0 + x0y0_hi; |
438 | 0 | uint64_t temp_lo = temp & mask; |
439 | 0 | uint64_t temp_hi = temp >> 32; |
440 | 0 |
|
441 | 0 | return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); |
442 | 0 | #endif |
443 | 0 | } |
444 | | |
445 | 0 | static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { |
446 | 0 | #if defined(LIBDIVIDE_MULH_INTRINSICS) |
447 | 0 | return __mulh(x, y); |
448 | 0 | #elif defined(HAS_INT128_T) |
449 | 0 | __int128_t xl = x, yl = y; |
450 | 0 | __int128_t rl = xl * yl; |
451 | 0 | return (int64_t)(rl >> 64); |
452 | 0 | #else |
453 | 0 | // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) |
454 | 0 | uint32_t mask = 0xFFFFFFFF; |
455 | 0 | uint32_t x0 = (uint32_t)(x & mask); |
456 | 0 | uint32_t y0 = (uint32_t)(y & mask); |
457 | 0 | int32_t x1 = (int32_t)(x >> 32); |
458 | 0 | int32_t y1 = (int32_t)(y >> 32); |
459 | 0 | uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); |
460 | 0 | int64_t t = x1 * (int64_t)y0 + x0y0_hi; |
461 | 0 | int64_t w1 = x0 * (int64_t)y1 + (t & mask); |
462 | 0 |
|
463 | 0 | return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32); |
464 | 0 | #endif |
465 | 0 | } |
466 | | |
467 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { |
468 | | #if defined(__AVR__) |
469 | | // Fast way to count leading zeros |
470 | | // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. |
471 | | return __builtin_clz(val); |
472 | | #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER) |
473 | | // Fast way to count leading zeros |
474 | 0 | return (int16_t)(__builtin_clz(val) - 16); |
475 | | #else |
476 | | if (val == 0) return 16; |
477 | | int16_t result = 4; |
478 | | uint16_t hi = 0xFU << 12; |
479 | | while ((val & hi) == 0) { |
480 | | hi >>= 4; |
481 | | result += 4; |
482 | | } |
483 | | while (val & hi) { |
484 | | result -= 1; |
485 | | hi <<= 1; |
486 | | } |
487 | | return result; |
488 | | #endif |
489 | 0 | } |
490 | | |
491 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { |
492 | | #if defined(__AVR__) |
493 | | // Fast way to count leading zeros |
494 | | return __builtin_clzl(val); |
495 | | #elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER) |
496 | | // Fast way to count leading zeros |
497 | 0 | return __builtin_clz(val); |
498 | | #else |
499 | | if (val == 0) return 32; |
500 | | int32_t result = 8; |
501 | | uint32_t hi = 0xFFU << 24; |
502 | | while ((val & hi) == 0) { |
503 | | hi >>= 8; |
504 | | result += 8; |
505 | | } |
506 | | while (val & hi) { |
507 | | result -= 1; |
508 | | hi <<= 1; |
509 | | } |
510 | | return result; |
511 | | #endif |
512 | 0 | } |
513 | | |
514 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { |
515 | 0 | #if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER) |
516 | 0 | // Fast way to count leading zeros |
517 | 0 | return __builtin_clzll(val); |
518 | 0 | #else |
519 | 0 | uint32_t hi = val >> 32; |
520 | 0 | uint32_t lo = val & 0xFFFFFFFF; |
521 | 0 | if (hi != 0) return libdivide_count_leading_zeros32(hi); |
522 | 0 | return 32 + libdivide_count_leading_zeros32(lo); |
523 | 0 | #endif |
524 | 0 | } |
525 | | |
526 | | // libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit |
527 | | // uint {v}. The result must fit in 16 bits. |
528 | | // Returns the quotient directly and the remainder in *r |
529 | | static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( |
530 | 0 | uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) { |
531 | 0 | uint32_t n = ((uint32_t)u1 << 16) | u0; |
532 | 0 | uint16_t result = (uint16_t)(n / v); |
533 | 0 | *r = (uint16_t)(n - result * (uint32_t)v); |
534 | 0 | return result; |
535 | 0 | } |
536 | | |
537 | | // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit |
538 | | // uint {v}. The result must fit in 32 bits. |
539 | | // Returns the quotient directly and the remainder in *r |
540 | | static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( |
541 | 0 | uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { |
542 | 0 | #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) |
543 | 0 | uint32_t result; |
544 | 0 | __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); |
545 | 0 | return result; |
546 | | #else |
547 | | uint64_t n = ((uint64_t)u1 << 32) | u0; |
548 | | uint32_t result = (uint32_t)(n / v); |
549 | | *r = (uint32_t)(n - result * (uint64_t)v); |
550 | | return result; |
551 | | #endif |
552 | 0 | } |
553 | | |
554 | | // libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The |
555 | | // result must fit in 64 bits. Returns the quotient directly and the remainder in *r |
556 | | static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( |
557 | 0 | uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { |
558 | 0 | // N.B. resist the temptation to use __uint128_t here. |
559 | 0 | // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than |
560 | 0 | // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because |
561 | 0 | // it's not LIBDIVIDE_INLINEd. |
562 | 0 | #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) |
563 | 0 | uint64_t result; |
564 | 0 | __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); |
565 | 0 | return result; |
566 | 0 | #else |
567 | 0 | // We work in base 2**32. |
568 | 0 | // A uint32 holds a single digit. A uint64 holds two digits. |
569 | 0 | // Our numerator is conceptually [num3, num2, num1, num0]. |
570 | 0 | // Our denominator is [den1, den0]. |
571 | 0 | const uint64_t b = ((uint64_t)1 << 32); |
572 | 0 |
|
573 | 0 | // The high and low digits of our computed quotient. |
574 | 0 | uint32_t q1; |
575 | 0 | uint32_t q0; |
576 | 0 |
|
577 | 0 | // The normalization shift factor. |
578 | 0 | int shift; |
579 | 0 |
|
580 | 0 | // The high and low digits of our denominator (after normalizing). |
581 | 0 | // Also the low 2 digits of our numerator (after normalizing). |
582 | 0 | uint32_t den1; |
583 | 0 | uint32_t den0; |
584 | 0 | uint32_t num1; |
585 | 0 | uint32_t num0; |
586 | 0 |
|
587 | 0 | // A partial remainder. |
588 | 0 | uint64_t rem; |
589 | 0 |
|
590 | 0 | // The estimated quotient, and its corresponding remainder (unrelated to true remainder). |
591 | 0 | uint64_t qhat; |
592 | 0 | uint64_t rhat; |
593 | 0 |
|
594 | 0 | // Variables used to correct the estimated quotient. |
595 | 0 | uint64_t c1; |
596 | 0 | uint64_t c2; |
597 | 0 |
|
598 | 0 | // Check for overflow and divide by 0. |
599 | 0 | if (numhi >= den) { |
600 | 0 | if (r) *r = ~0ull; |
601 | 0 | return ~0ull; |
602 | 0 | } |
603 | 0 |
|
604 | 0 | // Determine the normalization factor. We multiply den by this, so that its leading digit is at |
605 | 0 | // least half b. In binary this means just shifting left by the number of leading zeros, so that |
606 | 0 | // there's a 1 in the MSB. |
607 | 0 | // We also shift numer by the same amount. This cannot overflow because numhi < den. |
608 | 0 | // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting |
609 | 0 | // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is |
610 | 0 | // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. |
611 | 0 | shift = libdivide_count_leading_zeros64(den); |
612 | 0 | den <<= shift; |
613 | 0 | numhi <<= shift; |
614 | 0 | numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63); |
615 | 0 | numlo <<= shift; |
616 | 0 |
|
617 | 0 | // Extract the low digits of the numerator and both digits of the denominator. |
618 | 0 | num1 = (uint32_t)(numlo >> 32); |
619 | 0 | num0 = (uint32_t)(numlo & 0xFFFFFFFFu); |
620 | 0 | den1 = (uint32_t)(den >> 32); |
621 | 0 | den0 = (uint32_t)(den & 0xFFFFFFFFu); |
622 | 0 |
|
623 | 0 | // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. |
624 | 0 | // Estimate q1 as [n3 n2] / [d1], and then correct it. |
625 | 0 | // Note while qhat may be 2 digits, q1 is always 1 digit. |
626 | 0 | qhat = numhi / den1; |
627 | 0 | rhat = numhi % den1; |
628 | 0 | c1 = qhat * den0; |
629 | 0 | c2 = rhat * b + num1; |
630 | 0 | if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; |
631 | 0 | q1 = (uint32_t)qhat; |
632 | 0 |
|
633 | 0 | // Compute the true (partial) remainder. |
634 | 0 | rem = numhi * b + num1 - q1 * den; |
635 | 0 |
|
636 | 0 | // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. |
637 | 0 | // Estimate q0 as [rem1 rem0] / [d1] and correct it. |
638 | 0 | qhat = rem / den1; |
639 | 0 | rhat = rem % den1; |
640 | 0 | c1 = qhat * den0; |
641 | 0 | c2 = rhat * b + num0; |
642 | 0 | if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; |
643 | 0 | q0 = (uint32_t)qhat; |
644 | 0 |
|
645 | 0 | // Return remainder if requested. |
646 | 0 | if (r) *r = (rem * b + num0 - q0 * den) >> shift; |
647 | 0 | return ((uint64_t)q1 << 32) | q0; |
648 | 0 | #endif |
649 | 0 | } |
650 | | |
651 | | #if !(defined(HAS_INT128_T) && \ |
652 | | defined(HAS_INT128_DIV)) |
653 | | |
654 | | // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) |
655 | | static LIBDIVIDE_INLINE void libdivide_u128_shift( |
656 | | uint64_t *u1, uint64_t *u0, int32_t signed_shift) { |
657 | | if (signed_shift > 0) { |
658 | | uint32_t shift = signed_shift; |
659 | | *u1 <<= shift; |
660 | | *u1 |= *u0 >> (64 - shift); |
661 | | *u0 <<= shift; |
662 | | } else if (signed_shift < 0) { |
663 | | uint32_t shift = -signed_shift; |
664 | | *u0 >>= shift; |
665 | | *u0 |= *u1 << (64 - shift); |
666 | | *u1 >>= shift; |
667 | | } |
668 | | } |
669 | | |
670 | | #endif |
671 | | |
672 | | // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. |
673 | | static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( |
674 | 0 | uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { |
675 | 0 | #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) |
676 | 0 | __uint128_t ufull = u_hi; |
677 | 0 | __uint128_t vfull = v_hi; |
678 | 0 | ufull = (ufull << 64) | u_lo; |
679 | 0 | vfull = (vfull << 64) | v_lo; |
680 | 0 | uint64_t res = (uint64_t)(ufull / vfull); |
681 | 0 | __uint128_t remainder = ufull - (vfull * res); |
682 | 0 | *r_lo = (uint64_t)remainder; |
683 | 0 | *r_hi = (uint64_t)(remainder >> 64); |
684 | 0 | return res; |
685 | 0 | #else |
686 | 0 | // Adapted from "Unsigned Doubleword Division" in Hacker's Delight |
687 | 0 | // We want to compute u / v |
688 | 0 | typedef struct { |
689 | 0 | uint64_t hi; |
690 | 0 | uint64_t lo; |
691 | 0 | } u128_t; |
692 | 0 | u128_t u = {u_hi, u_lo}; |
693 | 0 | u128_t v = {v_hi, v_lo}; |
694 | 0 |
|
695 | 0 | if (v.hi == 0) { |
696 | 0 | // divisor v is a 64 bit value, so we just need one 128/64 division |
697 | 0 | // Note that we are simpler than Hacker's Delight here, because we know |
698 | 0 | // the quotient fits in 64 bits whereas Hacker's Delight demands a full |
699 | 0 | // 128 bit quotient |
700 | 0 | *r_hi = 0; |
701 | 0 | return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo); |
702 | 0 | } |
703 | 0 | // Here v >= 2**64 |
704 | 0 | // We know that v.hi != 0, so count leading zeros is OK |
705 | 0 | // We have 0 <= n <= 63 |
706 | 0 | uint32_t n = libdivide_count_leading_zeros64(v.hi); |
707 | 0 |
|
708 | 0 | // Normalize the divisor so its MSB is 1 |
709 | 0 | u128_t v1t = v; |
710 | 0 | libdivide_u128_shift(&v1t.hi, &v1t.lo, n); |
711 | 0 | uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 |
712 | 0 |
|
713 | 0 | // To ensure no overflow |
714 | 0 | u128_t u1 = u; |
715 | 0 | libdivide_u128_shift(&u1.hi, &u1.lo, -1); |
716 | 0 |
|
717 | 0 | // Get quotient from divide unsigned insn. |
718 | 0 | uint64_t rem_ignored; |
719 | 0 | uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored); |
720 | 0 |
|
721 | 0 | // Undo normalization and division of u by 2. |
722 | 0 | u128_t q0 = {0, q1}; |
723 | 0 | libdivide_u128_shift(&q0.hi, &q0.lo, n); |
724 | 0 | libdivide_u128_shift(&q0.hi, &q0.lo, -63); |
725 | 0 |
|
726 | 0 | // Make q0 correct or too small by 1 |
727 | 0 | // Equivalent to `if (q0 != 0) q0 = q0 - 1;` |
728 | 0 | if (q0.hi != 0 || q0.lo != 0) { |
729 | 0 | q0.hi -= (q0.lo == 0); // borrow |
730 | 0 | q0.lo -= 1; |
731 | 0 | } |
732 | 0 |
|
733 | 0 | // Now q0 is correct. |
734 | 0 | // Compute q0 * v as q0v |
735 | 0 | // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo) |
736 | 0 | // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) + |
737 | 0 | // (q0.lo * v.hi << 64) + q0.lo * v.lo) |
738 | 0 | // Each term is 128 bit |
739 | 0 | // High half of full product (upper 128 bits!) are dropped |
740 | 0 | u128_t q0v = {0, 0}; |
741 | 0 | q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo); |
742 | 0 | q0v.lo = q0.lo * v.lo; |
743 | 0 |
|
744 | 0 | // Compute u - q0v as u_q0v |
745 | 0 | // This is the remainder |
746 | 0 | u128_t u_q0v = u; |
747 | 0 | u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow |
748 | 0 | u_q0v.lo -= q0v.lo; |
749 | 0 |
|
750 | 0 | // Check if u_q0v >= v |
751 | 0 | // This checks if our remainder is larger than the divisor |
752 | 0 | if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { |
753 | 0 | // Increment q0 |
754 | 0 | q0.lo += 1; |
755 | 0 | q0.hi += (q0.lo == 0); // carry |
756 | 0 |
|
757 | 0 | // Subtract v from remainder |
758 | 0 | u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); |
759 | 0 | u_q0v.lo -= v.lo; |
760 | 0 | } |
761 | 0 |
|
762 | 0 | *r_hi = u_q0v.hi; |
763 | 0 | *r_lo = u_q0v.lo; |
764 | 0 |
|
765 | 0 | LIBDIVIDE_ASSERT(q0.hi == 0); |
766 | 0 | return q0.lo; |
767 | 0 | #endif |
768 | 0 | } |
769 | | |
770 | | ////////// UINT16 |
771 | | |
772 | | static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( |
773 | 0 | uint16_t d, int branchfree) { |
774 | 0 | if (d == 0) { |
775 | 0 | LIBDIVIDE_ERROR("divider must be != 0"); |
776 | 0 | } |
777 | | |
778 | 0 | struct libdivide_u16_t result; |
779 | 0 | uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); |
780 | | |
781 | | // Power of 2 |
782 | 0 | if ((d & (d - 1)) == 0) { |
783 | | // We need to subtract 1 from the shift value in case of an unsigned |
784 | | // branchfree divider because there is a hardcoded right shift by 1 |
785 | | // in its division algorithm. Because of this we also need to add back |
786 | | // 1 in its recovery algorithm. |
787 | 0 | result.magic = 0; |
788 | 0 | result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); |
789 | 0 | } else { |
790 | 0 | uint8_t more; |
791 | 0 | uint16_t rem, proposed_m; |
792 | 0 | proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); |
793 | |
|
794 | 0 | LIBDIVIDE_ASSERT(rem > 0 && rem < d); |
795 | 0 | const uint16_t e = d - rem; |
796 | | |
797 | | // This power works if e < 2**floor_log_2_d. |
798 | 0 | if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { |
799 | | // This power works |
800 | 0 | more = floor_log_2_d; |
801 | 0 | } else { |
802 | | // We have to use the general 17-bit algorithm. We need to compute |
803 | | // (2**power) / d. However, we already have (2**(power-1))/d and |
804 | | // its remainder. By doubling both, and then correcting the |
805 | | // remainder, we can compute the larger division. |
806 | | // don't care about overflow here - in fact, we expect it |
807 | 0 | proposed_m += proposed_m; |
808 | 0 | const uint16_t twice_rem = rem + rem; |
809 | 0 | if (twice_rem >= d || twice_rem < rem) proposed_m += 1; |
810 | 0 | more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; |
811 | 0 | } |
812 | 0 | result.magic = 1 + proposed_m; |
813 | 0 | result.more = more; |
814 | | // result.more's shift should in general be ceil_log_2_d. But if we |
815 | | // used the smaller power, we subtract one from the shift because we're |
816 | | // using the smaller power. If we're using the larger power, we |
817 | | // subtract one from the shift because it's taken care of by the add |
818 | | // indicator. So floor_log_2_d happens to be correct in both cases. |
819 | 0 | } |
820 | 0 | return result; |
821 | 0 | } |
822 | | |
823 | 0 | static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { |
824 | 0 | return libdivide_internal_u16_gen(d, 0); |
825 | 0 | } |
826 | | |
827 | 0 | static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { |
828 | 0 | if (d == 1) { |
829 | 0 | LIBDIVIDE_ERROR("branchfree divider must be != 1"); |
830 | 0 | } |
831 | 0 | struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); |
832 | 0 | struct libdivide_u16_branchfree_t ret = { |
833 | 0 | tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)}; |
834 | 0 | return ret; |
835 | 0 | } |
836 | | |
837 | | // The original libdivide_u16_do takes a const pointer. However, this cannot be used |
838 | | // with a compile time constant libdivide_u16_t: it will generate a warning about |
839 | | // taking the address of a temporary. Hence this overload. |
840 | 0 | static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { |
841 | 0 | if (!magic) { |
842 | 0 | return numer >> more; |
843 | 0 | } else { |
844 | 0 | uint16_t q = libdivide_mullhi_u16(numer, magic); |
845 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
846 | 0 | uint16_t t = ((numer - q) >> 1) + q; |
847 | 0 | return t >> (more & LIBDIVIDE_16_SHIFT_MASK); |
848 | 0 | } else { |
849 | 0 | // All upper bits are 0, |
850 | 0 | // don't need to mask them off. |
851 | 0 | return q >> more; |
852 | 0 | } |
853 | 0 | } |
854 | 0 | } |
855 | | |
856 | 0 | static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) { |
857 | 0 | return libdivide_u16_do_raw(numer, denom->magic, denom->more); |
858 | 0 | } |
859 | | |
860 | | static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( |
861 | 0 | uint16_t numer, const struct libdivide_u16_branchfree_t *denom) { |
862 | 0 | uint16_t q = libdivide_mullhi_u16(numer, denom->magic); |
863 | 0 | uint16_t t = ((numer - q) >> 1) + q; |
864 | 0 | return t >> denom->more; |
865 | 0 | } |
866 | | |
867 | 0 | static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { |
868 | 0 | uint8_t more = denom->more; |
869 | 0 | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
870 | 0 |
|
871 | 0 | if (!denom->magic) { |
872 | 0 | return (uint16_t)1 << shift; |
873 | 0 | } else if (!(more & LIBDIVIDE_ADD_MARKER)) { |
874 | 0 | // We compute q = n/d = n*m / 2^(16 + shift) |
875 | 0 | // Therefore we have d = 2^(16 + shift) / m |
876 | 0 | // We need to ceil it. |
877 | 0 | // We know d is not a power of 2, so m is not a power of 2, |
878 | 0 | // so we can just add 1 to the floor |
879 | 0 | uint16_t hi_dividend = (uint16_t)1 << shift; |
880 | 0 | uint16_t rem_ignored; |
881 | 0 | return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); |
882 | 0 | } else { |
883 | 0 | // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). |
884 | 0 | // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now |
885 | 0 | // Also note that shift may be as high as 15, so shift + 1 will |
886 | 0 | // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and |
887 | 0 | // then double the quotient and remainder. |
888 | 0 | uint32_t half_n = (uint32_t)1 << (16 + shift); |
889 | 0 | uint32_t d = ((uint32_t)1 << 16) | denom->magic; |
890 | 0 | // Note that the quotient is guaranteed <= 16 bits, but the remainder |
891 | 0 | // may need 17! |
892 | 0 | uint16_t half_q = (uint16_t)(half_n / d); |
893 | 0 | uint32_t rem = half_n % d; |
894 | 0 | // We computed 2^(16+shift)/(m+2^16) |
895 | 0 | // Need to double it, and then add 1 to the quotient if doubling th |
896 | 0 | // remainder would increase the quotient. |
897 | 0 | // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits |
898 | 0 | uint16_t full_q = half_q + half_q + ((rem << 1) >= d); |
899 | 0 |
|
900 | 0 | // We rounded down in gen (hence +1) |
901 | 0 | return full_q + 1; |
902 | 0 | } |
903 | 0 | } |
904 | | |
905 | 0 | static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { |
906 | 0 | uint8_t more = denom->more; |
907 | 0 | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
908 | 0 |
|
909 | 0 | if (!denom->magic) { |
910 | 0 | return (uint16_t)1 << (shift + 1); |
911 | 0 | } else { |
912 | 0 | // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). |
913 | 0 | // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now |
914 | 0 | // Also note that shift may be as high as 15, so shift + 1 will |
915 | 0 | // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and |
916 | 0 | // then double the quotient and remainder. |
917 | 0 | uint32_t half_n = (uint32_t)1 << (16 + shift); |
918 | 0 | uint32_t d = ((uint32_t)1 << 16) | denom->magic; |
919 | 0 | // Note that the quotient is guaranteed <= 16 bits, but the remainder |
920 | 0 | // may need 17! |
921 | 0 | uint16_t half_q = (uint16_t)(half_n / d); |
922 | 0 | uint32_t rem = half_n % d; |
923 | 0 | // We computed 2^(16+shift)/(m+2^16) |
924 | 0 | // Need to double it, and then add 1 to the quotient if doubling th |
925 | 0 | // remainder would increase the quotient. |
926 | 0 | // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits |
927 | 0 | uint16_t full_q = half_q + half_q + ((rem << 1) >= d); |
928 | 0 |
|
929 | 0 | // We rounded down in gen (hence +1) |
930 | 0 | return full_q + 1; |
931 | 0 | } |
932 | 0 | } |
933 | | |
934 | | ////////// UINT32 |
935 | | |
936 | | static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( |
937 | 0 | uint32_t d, int branchfree) { |
938 | 0 | if (d == 0) { |
939 | 0 | LIBDIVIDE_ERROR("divider must be != 0"); |
940 | 0 | } |
941 | | |
942 | 0 | struct libdivide_u32_t result; |
943 | 0 | uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); |
944 | | |
945 | | // Power of 2 |
946 | 0 | if ((d & (d - 1)) == 0) { |
947 | | // We need to subtract 1 from the shift value in case of an unsigned |
948 | | // branchfree divider because there is a hardcoded right shift by 1 |
949 | | // in its division algorithm. Because of this we also need to add back |
950 | | // 1 in its recovery algorithm. |
951 | 0 | result.magic = 0; |
952 | 0 | result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); |
953 | 0 | } else { |
954 | 0 | uint8_t more; |
955 | 0 | uint32_t rem, proposed_m; |
956 | 0 | proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); |
957 | |
|
958 | 0 | LIBDIVIDE_ASSERT(rem > 0 && rem < d); |
959 | 0 | const uint32_t e = d - rem; |
960 | | |
961 | | // This power works if e < 2**floor_log_2_d. |
962 | 0 | if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { |
963 | | // This power works |
964 | 0 | more = (uint8_t)floor_log_2_d; |
965 | 0 | } else { |
966 | | // We have to use the general 33-bit algorithm. We need to compute |
967 | | // (2**power) / d. However, we already have (2**(power-1))/d and |
968 | | // its remainder. By doubling both, and then correcting the |
969 | | // remainder, we can compute the larger division. |
970 | | // don't care about overflow here - in fact, we expect it |
971 | 0 | proposed_m += proposed_m; |
972 | 0 | const uint32_t twice_rem = rem + rem; |
973 | 0 | if (twice_rem >= d || twice_rem < rem) proposed_m += 1; |
974 | 0 | more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); |
975 | 0 | } |
976 | 0 | result.magic = 1 + proposed_m; |
977 | 0 | result.more = more; |
978 | | // result.more's shift should in general be ceil_log_2_d. But if we |
979 | | // used the smaller power, we subtract one from the shift because we're |
980 | | // using the smaller power. If we're using the larger power, we |
981 | | // subtract one from the shift because it's taken care of by the add |
982 | | // indicator. So floor_log_2_d happens to be correct in both cases. |
983 | 0 | } |
984 | 0 | return result; |
985 | 0 | } |
986 | | |
987 | 0 | static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { |
988 | 0 | return libdivide_internal_u32_gen(d, 0); |
989 | 0 | } |
990 | | |
991 | 0 | static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { |
992 | 0 | if (d == 1) { |
993 | 0 | LIBDIVIDE_ERROR("branchfree divider must be != 1"); |
994 | 0 | } |
995 | 0 | struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1); |
996 | 0 | struct libdivide_u32_branchfree_t ret = { |
997 | 0 | tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)}; |
998 | 0 | return ret; |
999 | 0 | } |
1000 | | |
1001 | 0 | static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) { |
1002 | 0 | if (!magic) { |
1003 | 0 | return numer >> more; |
1004 | 0 | } else { |
1005 | 0 | uint32_t q = libdivide_mullhi_u32(numer, magic); |
1006 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
1007 | 0 | uint32_t t = ((numer - q) >> 1) + q; |
1008 | 0 | return t >> (more & LIBDIVIDE_32_SHIFT_MASK); |
1009 | 0 | } else { |
1010 | 0 | // All upper bits are 0, |
1011 | 0 | // don't need to mask them off. |
1012 | 0 | return q >> more; |
1013 | 0 | } |
1014 | 0 | } |
1015 | 0 | } |
1016 | | |
1017 | 0 | static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { |
1018 | 0 | return libdivide_u32_do_raw(numer, denom->magic, denom->more); |
1019 | 0 | } |
1020 | | |
1021 | | static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( |
1022 | 0 | uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { |
1023 | 0 | uint32_t q = libdivide_mullhi_u32(numer, denom->magic); |
1024 | 0 | uint32_t t = ((numer - q) >> 1) + q; |
1025 | 0 | return t >> denom->more; |
1026 | 0 | } |
1027 | | |
1028 | 0 | static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { |
1029 | 0 | uint8_t more = denom->more; |
1030 | 0 | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1031 | 0 |
|
1032 | 0 | if (!denom->magic) { |
1033 | 0 | return (uint32_t)1 << shift; |
1034 | 0 | } else if (!(more & LIBDIVIDE_ADD_MARKER)) { |
1035 | 0 | // We compute q = n/d = n*m / 2^(32 + shift) |
1036 | 0 | // Therefore we have d = 2^(32 + shift) / m |
1037 | 0 | // We need to ceil it. |
1038 | 0 | // We know d is not a power of 2, so m is not a power of 2, |
1039 | 0 | // so we can just add 1 to the floor |
1040 | 0 | uint32_t hi_dividend = (uint32_t)1 << shift; |
1041 | 0 | uint32_t rem_ignored; |
1042 | 0 | return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); |
1043 | 0 | } else { |
1044 | 0 | // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). |
1045 | 0 | // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now |
1046 | 0 | // Also note that shift may be as high as 31, so shift + 1 will |
1047 | 0 | // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and |
1048 | 0 | // then double the quotient and remainder. |
1049 | 0 | uint64_t half_n = (uint64_t)1 << (32 + shift); |
1050 | 0 | uint64_t d = ((uint64_t)1 << 32) | denom->magic; |
1051 | 0 | // Note that the quotient is guaranteed <= 32 bits, but the remainder |
1052 | 0 | // may need 33! |
1053 | 0 | uint32_t half_q = (uint32_t)(half_n / d); |
1054 | 0 | uint64_t rem = half_n % d; |
1055 | 0 | // We computed 2^(32+shift)/(m+2^32) |
1056 | 0 | // Need to double it, and then add 1 to the quotient if doubling th |
1057 | 0 | // remainder would increase the quotient. |
1058 | 0 | // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits |
1059 | 0 | uint32_t full_q = half_q + half_q + ((rem << 1) >= d); |
1060 | 0 |
|
1061 | 0 | // We rounded down in gen (hence +1) |
1062 | 0 | return full_q + 1; |
1063 | 0 | } |
1064 | 0 | } |
1065 | | |
1066 | 0 | static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) { |
1067 | 0 | uint8_t more = denom->more; |
1068 | 0 | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1069 | 0 |
|
1070 | 0 | if (!denom->magic) { |
1071 | 0 | return (uint32_t)1 << (shift + 1); |
1072 | 0 | } else { |
1073 | 0 | // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). |
1074 | 0 | // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now |
1075 | 0 | // Also note that shift may be as high as 31, so shift + 1 will |
1076 | 0 | // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and |
1077 | 0 | // then double the quotient and remainder. |
1078 | 0 | uint64_t half_n = (uint64_t)1 << (32 + shift); |
1079 | 0 | uint64_t d = ((uint64_t)1 << 32) | denom->magic; |
1080 | 0 | // Note that the quotient is guaranteed <= 32 bits, but the remainder |
1081 | 0 | // may need 33! |
1082 | 0 | uint32_t half_q = (uint32_t)(half_n / d); |
1083 | 0 | uint64_t rem = half_n % d; |
1084 | 0 | // We computed 2^(32+shift)/(m+2^32) |
1085 | 0 | // Need to double it, and then add 1 to the quotient if doubling th |
1086 | 0 | // remainder would increase the quotient. |
1087 | 0 | // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits |
1088 | 0 | uint32_t full_q = half_q + half_q + ((rem << 1) >= d); |
1089 | 0 |
|
1090 | 0 | // We rounded down in gen (hence +1) |
1091 | 0 | return full_q + 1; |
1092 | 0 | } |
1093 | 0 | } |
1094 | | |
1095 | | ////////// UINT64 |
1096 | | |
1097 | | static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( |
1098 | 0 | uint64_t d, int branchfree) { |
1099 | 0 | if (d == 0) { |
1100 | 0 | LIBDIVIDE_ERROR("divider must be != 0"); |
1101 | 0 | } |
1102 | 0 |
|
1103 | 0 | struct libdivide_u64_t result; |
1104 | 0 | uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); |
1105 | 0 |
|
1106 | 0 | // Power of 2 |
1107 | 0 | if ((d & (d - 1)) == 0) { |
1108 | 0 | // We need to subtract 1 from the shift value in case of an unsigned |
1109 | 0 | // branchfree divider because there is a hardcoded right shift by 1 |
1110 | 0 | // in its division algorithm. Because of this we also need to add back |
1111 | 0 | // 1 in its recovery algorithm. |
1112 | 0 | result.magic = 0; |
1113 | 0 | result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); |
1114 | 0 | } else { |
1115 | 0 | uint64_t proposed_m, rem; |
1116 | 0 | uint8_t more; |
1117 | 0 | // (1 << (64 + floor_log_2_d)) / d |
1118 | 0 | proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); |
1119 | 0 |
|
1120 | 0 | LIBDIVIDE_ASSERT(rem > 0 && rem < d); |
1121 | 0 | const uint64_t e = d - rem; |
1122 | 0 |
|
1123 | 0 | // This power works if e < 2**floor_log_2_d. |
1124 | 0 | if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { |
1125 | 0 | // This power works |
1126 | 0 | more = (uint8_t)floor_log_2_d; |
1127 | 0 | } else { |
1128 | 0 | // We have to use the general 65-bit algorithm. We need to compute |
1129 | 0 | // (2**power) / d. However, we already have (2**(power-1))/d and |
1130 | 0 | // its remainder. By doubling both, and then correcting the |
1131 | 0 | // remainder, we can compute the larger division. |
1132 | 0 | // don't care about overflow here - in fact, we expect it |
1133 | 0 | proposed_m += proposed_m; |
1134 | 0 | const uint64_t twice_rem = rem + rem; |
1135 | 0 | if (twice_rem >= d || twice_rem < rem) proposed_m += 1; |
1136 | 0 | more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); |
1137 | 0 | } |
1138 | 0 | result.magic = 1 + proposed_m; |
1139 | 0 | result.more = more; |
1140 | 0 | // result.more's shift should in general be ceil_log_2_d. But if we |
1141 | 0 | // used the smaller power, we subtract one from the shift because we're |
1142 | 0 | // using the smaller power. If we're using the larger power, we |
1143 | 0 | // subtract one from the shift because it's taken care of by the add |
1144 | 0 | // indicator. So floor_log_2_d happens to be correct in both cases, |
1145 | 0 | // which is why we do it outside of the if statement. |
1146 | 0 | } |
1147 | 0 | return result; |
1148 | 0 | } |
1149 | | |
1150 | 0 | static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { |
1151 | 0 | return libdivide_internal_u64_gen(d, 0); |
1152 | 0 | } |
1153 | | |
1154 | 0 | static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { |
1155 | 0 | if (d == 1) { |
1156 | 0 | LIBDIVIDE_ERROR("branchfree divider must be != 1"); |
1157 | 0 | } |
1158 | 0 | struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); |
1159 | 0 | struct libdivide_u64_branchfree_t ret = { |
1160 | 0 | tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; |
1161 | 0 | return ret; |
1162 | 0 | } |
1163 | | |
1164 | 0 | static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) { |
1165 | 0 | if (!magic) { |
1166 | 0 | return numer >> more; |
1167 | 0 | } else { |
1168 | 0 | uint64_t q = libdivide_mullhi_u64(numer, magic); |
1169 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
1170 | 0 | uint64_t t = ((numer - q) >> 1) + q; |
1171 | 0 | return t >> (more & LIBDIVIDE_64_SHIFT_MASK); |
1172 | 0 | } else { |
1173 | 0 | // All upper bits are 0, |
1174 | 0 | // don't need to mask them off. |
1175 | 0 | return q >> more; |
1176 | 0 | } |
1177 | 0 | } |
1178 | 0 | } |
1179 | | |
1180 | 0 | static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { |
1181 | 0 | return libdivide_u64_do_raw(numer, denom->magic, denom->more); |
1182 | 0 | } |
1183 | | |
1184 | | static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( |
1185 | 0 | uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { |
1186 | 0 | uint64_t q = libdivide_mullhi_u64(numer, denom->magic); |
1187 | 0 | uint64_t t = ((numer - q) >> 1) + q; |
1188 | 0 | return t >> denom->more; |
1189 | 0 | } |
1190 | | |
1191 | 0 | static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { |
1192 | 0 | uint8_t more = denom->more; |
1193 | 0 | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
1194 | 0 |
|
1195 | 0 | if (!denom->magic) { |
1196 | 0 | return (uint64_t)1 << shift; |
1197 | 0 | } else if (!(more & LIBDIVIDE_ADD_MARKER)) { |
1198 | 0 | // We compute q = n/d = n*m / 2^(64 + shift) |
1199 | 0 | // Therefore we have d = 2^(64 + shift) / m |
1200 | 0 | // We need to ceil it. |
1201 | 0 | // We know d is not a power of 2, so m is not a power of 2, |
1202 | 0 | // so we can just add 1 to the floor |
1203 | 0 | uint64_t hi_dividend = (uint64_t)1 << shift; |
1204 | 0 | uint64_t rem_ignored; |
1205 | 0 | return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); |
1206 | 0 | } else { |
1207 | 0 | // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). |
1208 | 0 | // Notice (m + 2^64) is a 65 bit number. This gets hairy. See |
1209 | 0 | // libdivide_u32_recover for more on what we do here. |
1210 | 0 | // TODO: do something better than 128 bit math |
1211 | 0 |
|
1212 | 0 | // Full n is a (potentially) 129 bit value |
1213 | 0 | // half_n is a 128 bit value |
1214 | 0 | // Compute the hi half of half_n. Low half is 0. |
1215 | 0 | uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; |
1216 | 0 | // d is a 65 bit value. The high bit is always set to 1. |
1217 | 0 | const uint64_t d_hi = 1, d_lo = denom->magic; |
1218 | 0 | // Note that the quotient is guaranteed <= 64 bits, |
1219 | 0 | // but the remainder may need 65! |
1220 | 0 | uint64_t r_hi, r_lo; |
1221 | 0 | uint64_t half_q = |
1222 | 0 | libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); |
1223 | 0 | // We computed 2^(64+shift)/(m+2^64) |
1224 | 0 | // Double the remainder ('dr') and check if that is larger than d |
1225 | 0 | // Note that d is a 65 bit value, so r1 is small and so r1 + r1 |
1226 | 0 | // cannot overflow |
1227 | 0 | uint64_t dr_lo = r_lo + r_lo; |
1228 | 0 | uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry |
1229 | 0 | int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); |
1230 | 0 | uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); |
1231 | 0 | return full_q + 1; |
1232 | 0 | } |
1233 | 0 | } |
1234 | | |
1235 | 0 | static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { |
1236 | 0 | uint8_t more = denom->more; |
1237 | 0 | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
1238 | 0 |
|
1239 | 0 | if (!denom->magic) { |
1240 | 0 | return (uint64_t)1 << (shift + 1); |
1241 | 0 | } else { |
1242 | 0 | // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). |
1243 | 0 | // Notice (m + 2^64) is a 65 bit number. This gets hairy. See |
1244 | 0 | // libdivide_u32_recover for more on what we do here. |
1245 | 0 | // TODO: do something better than 128 bit math |
1246 | 0 |
|
1247 | 0 | // Full n is a (potentially) 129 bit value |
1248 | 0 | // half_n is a 128 bit value |
1249 | 0 | // Compute the hi half of half_n. Low half is 0. |
1250 | 0 | uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; |
1251 | 0 | // d is a 65 bit value. The high bit is always set to 1. |
1252 | 0 | const uint64_t d_hi = 1, d_lo = denom->magic; |
1253 | 0 | // Note that the quotient is guaranteed <= 64 bits, |
1254 | 0 | // but the remainder may need 65! |
1255 | 0 | uint64_t r_hi, r_lo; |
1256 | 0 | uint64_t half_q = |
1257 | 0 | libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); |
1258 | 0 | // We computed 2^(64+shift)/(m+2^64) |
1259 | 0 | // Double the remainder ('dr') and check if that is larger than d |
1260 | 0 | // Note that d is a 65 bit value, so r1 is small and so r1 + r1 |
1261 | 0 | // cannot overflow |
1262 | 0 | uint64_t dr_lo = r_lo + r_lo; |
1263 | 0 | uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry |
1264 | 0 | int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); |
1265 | 0 | uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); |
1266 | 0 | return full_q + 1; |
1267 | 0 | } |
1268 | 0 | } |
1269 | | |
1270 | | ////////// SINT16 |
1271 | | |
1272 | | static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( |
1273 | 0 | int16_t d, int branchfree) { |
1274 | 0 | if (d == 0) { |
1275 | 0 | LIBDIVIDE_ERROR("divider must be != 0"); |
1276 | 0 | } |
1277 | 0 |
|
1278 | 0 | struct libdivide_s16_t result; |
1279 | 0 |
|
1280 | 0 | // If d is a power of 2, or negative a power of 2, we have to use a shift. |
1281 | 0 | // This is especially important because the magic algorithm fails for -1. |
1282 | 0 | // To check if d is a power of 2 or its inverse, it suffices to check |
1283 | 0 | // whether its absolute value has exactly one bit set. This works even for |
1284 | 0 | // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set |
1285 | 0 | // and is a power of 2. |
1286 | 0 | uint16_t ud = (uint16_t)d; |
1287 | 0 | uint16_t absD = (d < 0) ? -ud : ud; |
1288 | 0 | uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); |
1289 | 0 | // check if exactly one bit is set, |
1290 | 0 | // don't care if absD is 0 since that's divide by zero |
1291 | 0 | if ((absD & (absD - 1)) == 0) { |
1292 | 0 | // Branchfree and normal paths are exactly the same |
1293 | 0 | result.magic = 0; |
1294 | 0 | result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); |
1295 | 0 | } else { |
1296 | 0 | LIBDIVIDE_ASSERT(floor_log_2_d >= 1); |
1297 | 0 |
|
1298 | 0 | uint8_t more; |
1299 | 0 | // the dividend here is 2**(floor_log_2_d + 31), so the low 16 bit word |
1300 | 0 | // is 0 and the high word is floor_log_2_d - 1 |
1301 | 0 | uint16_t rem, proposed_m; |
1302 | 0 | proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); |
1303 | 0 | const uint16_t e = absD - rem; |
1304 | 0 |
|
1305 | 0 | // We are going to start with a power of floor_log_2_d - 1. |
1306 | 0 | // This works if works if e < 2**floor_log_2_d. |
1307 | 0 | if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { |
1308 | 0 | // This power works |
1309 | 0 | more = (uint8_t)(floor_log_2_d - 1); |
1310 | 0 | } else { |
1311 | 0 | // We need to go one higher. This should not make proposed_m |
1312 | 0 | // overflow, but it will make it negative when interpreted as an |
1313 | 0 | // int16_t. |
1314 | 0 | proposed_m += proposed_m; |
1315 | 0 | const uint16_t twice_rem = rem + rem; |
1316 | 0 | if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; |
1317 | 0 | more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); |
1318 | 0 | } |
1319 | 0 |
|
1320 | 0 | proposed_m += 1; |
1321 | 0 | int16_t magic = (int16_t)proposed_m; |
1322 | 0 |
|
1323 | 0 | // Mark if we are negative. Note we only negate the magic number in the |
1324 | 0 | // branchfull case. |
1325 | 0 | if (d < 0) { |
1326 | 0 | more |= LIBDIVIDE_NEGATIVE_DIVISOR; |
1327 | 0 | if (!branchfree) { |
1328 | 0 | magic = -magic; |
1329 | 0 | } |
1330 | 0 | } |
1331 | 0 |
|
1332 | 0 | result.more = more; |
1333 | 0 | result.magic = magic; |
1334 | 0 | } |
1335 | 0 | return result; |
1336 | 0 | } |
1337 | | |
1338 | 0 | static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) { |
1339 | 0 | return libdivide_internal_s16_gen(d, 0); |
1340 | 0 | } |
1341 | | |
1342 | 0 | static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { |
1343 | 0 | struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); |
1344 | 0 | struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; |
1345 | 0 | return result; |
1346 | 0 | } |
1347 | | |
1348 | | // The original libdivide_s16_do takes a const pointer. However, this cannot be used |
1349 | | // with a compile time constant libdivide_s16_t: it will generate a warning about |
1350 | | // taking the address of a temporary. Hence this overload. |
1351 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { |
1352 | 0 | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
1353 | 0 |
|
1354 | 0 | if (!magic) { |
1355 | 0 | uint16_t sign = (int8_t)more >> 7; |
1356 | 0 | uint16_t mask = ((uint16_t)1 << shift) - 1; |
1357 | 0 | uint16_t uq = numer + ((numer >> 15) & mask); |
1358 | 0 | int16_t q = (int16_t)uq; |
1359 | 0 | q >>= shift; |
1360 | 0 | q = (q ^ sign) - sign; |
1361 | 0 | return q; |
1362 | 0 | } else { |
1363 | 0 | uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic); |
1364 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
1365 | 0 | // must be arithmetic shift and then sign extend |
1366 | 0 | int16_t sign = (int8_t)more >> 7; |
1367 | 0 | // q += (more < 0 ? -numer : numer) |
1368 | 0 | // cast required to avoid UB |
1369 | 0 | uq += ((uint16_t)numer ^ sign) - sign; |
1370 | 0 | } |
1371 | 0 | int16_t q = (int16_t)uq; |
1372 | 0 | q >>= shift; |
1373 | 0 | q += (q < 0); |
1374 | 0 | return q; |
1375 | 0 | } |
1376 | 0 | } |
1377 | | |
1378 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { |
1379 | 0 | return libdivide_s16_do_raw(numer, denom->magic, denom->more); |
1380 | 0 | } |
1381 | | |
1382 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { |
1383 | 0 | uint8_t more = denom->more; |
1384 | 0 | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
1385 | 0 | // must be arithmetic shift and then sign extend |
1386 | 0 | int16_t sign = (int8_t)more >> 7; |
1387 | 0 | int16_t magic = denom->magic; |
1388 | 0 | int16_t q = libdivide_mullhi_s16(numer, magic); |
1389 | 0 | q += numer; |
1390 | 0 |
|
1391 | 0 | // If q is non-negative, we have nothing to do |
1392 | 0 | // If q is negative, we want to add either (2**shift)-1 if d is a power of |
1393 | 0 | // 2, or (2**shift) if it is not a power of 2 |
1394 | 0 | uint16_t is_power_of_2 = (magic == 0); |
1395 | 0 | uint16_t q_sign = (uint16_t)(q >> 15); |
1396 | 0 | q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); |
1397 | 0 |
|
1398 | 0 | // Now arithmetic right shift |
1399 | 0 | q >>= shift; |
1400 | 0 | // Negate if needed |
1401 | 0 | q = (q ^ sign) - sign; |
1402 | 0 |
|
1403 | 0 | return q; |
1404 | 0 | } |
1405 | | |
1406 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { |
1407 | 0 | uint8_t more = denom->more; |
1408 | 0 | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
1409 | 0 | if (!denom->magic) { |
1410 | 0 | uint16_t absD = (uint16_t)1 << shift; |
1411 | 0 | if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { |
1412 | 0 | absD = -absD; |
1413 | 0 | } |
1414 | 0 | return (int16_t)absD; |
1415 | 0 | } else { |
1416 | 0 | // Unsigned math is much easier |
1417 | 0 | // We negate the magic number only in the branchfull case, and we don't |
1418 | 0 | // know which case we're in. However we have enough information to |
1419 | 0 | // determine the correct sign of the magic number. The divisor was |
1420 | 0 | // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, |
1421 | 0 | // the magic number's sign is opposite that of the divisor. |
1422 | 0 | // We want to compute the positive magic number. |
1423 | 0 | int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); |
1424 | 0 | int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; |
1425 | 0 |
|
1426 | 0 | // Handle the power of 2 case (including branchfree) |
1427 | 0 | if (denom->magic == 0) { |
1428 | 0 | int16_t result = (uint16_t)1 << shift; |
1429 | 0 | return negative_divisor ? -result : result; |
1430 | 0 | } |
1431 | 0 |
|
1432 | 0 | uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic); |
1433 | 0 | uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 |
1434 | 0 | uint16_t q = (uint16_t)(n / d); |
1435 | 0 | int16_t result = (int16_t)q; |
1436 | 0 | result += 1; |
1437 | 0 | return negative_divisor ? -result : result; |
1438 | 0 | } |
1439 | 0 | } |
1440 | | |
1441 | 0 | static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { |
1442 | 0 | const struct libdivide_s16_t den = {denom->magic, denom->more}; |
1443 | 0 | return libdivide_s16_recover(&den); |
1444 | 0 | } |
1445 | | |
1446 | | ////////// SINT32 |
1447 | | |
1448 | | static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( |
1449 | 0 | int32_t d, int branchfree) { |
1450 | 0 | if (d == 0) { |
1451 | 0 | LIBDIVIDE_ERROR("divider must be != 0"); |
1452 | 0 | } |
1453 | 0 |
|
1454 | 0 | struct libdivide_s32_t result; |
1455 | 0 |
|
1456 | 0 | // If d is a power of 2, or negative a power of 2, we have to use a shift. |
1457 | 0 | // This is especially important because the magic algorithm fails for -1. |
1458 | 0 | // To check if d is a power of 2 or its inverse, it suffices to check |
1459 | 0 | // whether its absolute value has exactly one bit set. This works even for |
1460 | 0 | // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set |
1461 | 0 | // and is a power of 2. |
1462 | 0 | uint32_t ud = (uint32_t)d; |
1463 | 0 | uint32_t absD = (d < 0) ? -ud : ud; |
1464 | 0 | uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD); |
1465 | 0 | // check if exactly one bit is set, |
1466 | 0 | // don't care if absD is 0 since that's divide by zero |
1467 | 0 | if ((absD & (absD - 1)) == 0) { |
1468 | 0 | // Branchfree and normal paths are exactly the same |
1469 | 0 | result.magic = 0; |
1470 | 0 | result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); |
1471 | 0 | } else { |
1472 | 0 | LIBDIVIDE_ASSERT(floor_log_2_d >= 1); |
1473 | 0 |
|
1474 | 0 | uint8_t more; |
1475 | 0 | // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word |
1476 | 0 | // is 0 and the high word is floor_log_2_d - 1 |
1477 | 0 | uint32_t rem, proposed_m; |
1478 | 0 | proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); |
1479 | 0 | const uint32_t e = absD - rem; |
1480 | 0 |
|
1481 | 0 | // We are going to start with a power of floor_log_2_d - 1. |
1482 | 0 | // This works if works if e < 2**floor_log_2_d. |
1483 | 0 | if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { |
1484 | 0 | // This power works |
1485 | 0 | more = (uint8_t)(floor_log_2_d - 1); |
1486 | 0 | } else { |
1487 | 0 | // We need to go one higher. This should not make proposed_m |
1488 | 0 | // overflow, but it will make it negative when interpreted as an |
1489 | 0 | // int32_t. |
1490 | 0 | proposed_m += proposed_m; |
1491 | 0 | const uint32_t twice_rem = rem + rem; |
1492 | 0 | if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; |
1493 | 0 | more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); |
1494 | 0 | } |
1495 | 0 |
|
1496 | 0 | proposed_m += 1; |
1497 | 0 | int32_t magic = (int32_t)proposed_m; |
1498 | 0 |
|
1499 | 0 | // Mark if we are negative. Note we only negate the magic number in the |
1500 | 0 | // branchfull case. |
1501 | 0 | if (d < 0) { |
1502 | 0 | more |= LIBDIVIDE_NEGATIVE_DIVISOR; |
1503 | 0 | if (!branchfree) { |
1504 | 0 | magic = -magic; |
1505 | 0 | } |
1506 | 0 | } |
1507 | 0 |
|
1508 | 0 | result.more = more; |
1509 | 0 | result.magic = magic; |
1510 | 0 | } |
1511 | 0 | return result; |
1512 | 0 | } |
1513 | | |
1514 | 0 | static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) { |
1515 | 0 | return libdivide_internal_s32_gen(d, 0); |
1516 | 0 | } |
1517 | | |
1518 | 0 | static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) { |
1519 | 0 | struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1); |
1520 | 0 | struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more}; |
1521 | 0 | return result; |
1522 | 0 | } |
1523 | | |
1524 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) { |
1525 | 0 | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1526 | 0 |
|
1527 | 0 | if (!magic) { |
1528 | 0 | uint32_t sign = (int8_t)more >> 7; |
1529 | 0 | uint32_t mask = ((uint32_t)1 << shift) - 1; |
1530 | 0 | uint32_t uq = numer + ((numer >> 31) & mask); |
1531 | 0 | int32_t q = (int32_t)uq; |
1532 | 0 | q >>= shift; |
1533 | 0 | q = (q ^ sign) - sign; |
1534 | 0 | return q; |
1535 | 0 | } else { |
1536 | 0 | uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic); |
1537 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
1538 | 0 | // must be arithmetic shift and then sign extend |
1539 | 0 | int32_t sign = (int8_t)more >> 7; |
1540 | 0 | // q += (more < 0 ? -numer : numer) |
1541 | 0 | // cast required to avoid UB |
1542 | 0 | uq += ((uint32_t)numer ^ sign) - sign; |
1543 | 0 | } |
1544 | 0 | int32_t q = (int32_t)uq; |
1545 | 0 | q >>= shift; |
1546 | 0 | q += (q < 0); |
1547 | 0 | return q; |
1548 | 0 | } |
1549 | 0 | } |
1550 | | |
1551 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { |
1552 | 0 | return libdivide_s32_do_raw(numer, denom->magic, denom->more); |
1553 | 0 | } |
1554 | | |
1555 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { |
1556 | 0 | uint8_t more = denom->more; |
1557 | 0 | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1558 | 0 | // must be arithmetic shift and then sign extend |
1559 | 0 | int32_t sign = (int8_t)more >> 7; |
1560 | 0 | int32_t magic = denom->magic; |
1561 | 0 | int32_t q = libdivide_mullhi_s32(numer, magic); |
1562 | 0 | q += numer; |
1563 | 0 |
|
1564 | 0 | // If q is non-negative, we have nothing to do |
1565 | 0 | // If q is negative, we want to add either (2**shift)-1 if d is a power of |
1566 | 0 | // 2, or (2**shift) if it is not a power of 2 |
1567 | 0 | uint32_t is_power_of_2 = (magic == 0); |
1568 | 0 | uint32_t q_sign = (uint32_t)(q >> 31); |
1569 | 0 | q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); |
1570 | 0 |
|
1571 | 0 | // Now arithmetic right shift |
1572 | 0 | q >>= shift; |
1573 | 0 | // Negate if needed |
1574 | 0 | q = (q ^ sign) - sign; |
1575 | 0 |
|
1576 | 0 | return q; |
1577 | 0 | } |
1578 | | |
1579 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { |
1580 | 0 | uint8_t more = denom->more; |
1581 | 0 | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1582 | 0 | if (!denom->magic) { |
1583 | 0 | uint32_t absD = (uint32_t)1 << shift; |
1584 | 0 | if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { |
1585 | 0 | absD = -absD; |
1586 | 0 | } |
1587 | 0 | return (int32_t)absD; |
1588 | 0 | } else { |
1589 | 0 | // Unsigned math is much easier |
1590 | 0 | // We negate the magic number only in the branchfull case, and we don't |
1591 | 0 | // know which case we're in. However we have enough information to |
1592 | 0 | // determine the correct sign of the magic number. The divisor was |
1593 | 0 | // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, |
1594 | 0 | // the magic number's sign is opposite that of the divisor. |
1595 | 0 | // We want to compute the positive magic number. |
1596 | 0 | int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); |
1597 | 0 | int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; |
1598 | 0 |
|
1599 | 0 | // Handle the power of 2 case (including branchfree) |
1600 | 0 | if (denom->magic == 0) { |
1601 | 0 | int32_t result = (uint32_t)1 << shift; |
1602 | 0 | return negative_divisor ? -result : result; |
1603 | 0 | } |
1604 | 0 |
|
1605 | 0 | uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); |
1606 | 0 | uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 |
1607 | 0 | uint32_t q = (uint32_t)(n / d); |
1608 | 0 | int32_t result = (int32_t)q; |
1609 | 0 | result += 1; |
1610 | 0 | return negative_divisor ? -result : result; |
1611 | 0 | } |
1612 | 0 | } |
1613 | | |
1614 | 0 | static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { |
1615 | 0 | const struct libdivide_s32_t den = {denom->magic, denom->more}; |
1616 | 0 | return libdivide_s32_recover(&den); |
1617 | 0 | } |
1618 | | |
1619 | | ////////// SINT64 |
1620 | | |
1621 | | static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( |
1622 | 0 | int64_t d, int branchfree) { |
1623 | 0 | if (d == 0) { |
1624 | 0 | LIBDIVIDE_ERROR("divider must be != 0"); |
1625 | 0 | } |
1626 | 0 |
|
1627 | 0 | struct libdivide_s64_t result; |
1628 | 0 |
|
1629 | 0 | // If d is a power of 2, or negative a power of 2, we have to use a shift. |
1630 | 0 | // This is especially important because the magic algorithm fails for -1. |
1631 | 0 | // To check if d is a power of 2 or its inverse, it suffices to check |
1632 | 0 | // whether its absolute value has exactly one bit set. This works even for |
1633 | 0 | // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set |
1634 | 0 | // and is a power of 2. |
1635 | 0 | uint64_t ud = (uint64_t)d; |
1636 | 0 | uint64_t absD = (d < 0) ? -ud : ud; |
1637 | 0 | uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); |
1638 | 0 | // check if exactly one bit is set, |
1639 | 0 | // don't care if absD is 0 since that's divide by zero |
1640 | 0 | if ((absD & (absD - 1)) == 0) { |
1641 | 0 | // Branchfree and non-branchfree cases are the same |
1642 | 0 | result.magic = 0; |
1643 | 0 | result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); |
1644 | 0 | } else { |
1645 | 0 | // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word |
1646 | 0 | // is 0 and the high word is floor_log_2_d - 1 |
1647 | 0 | uint8_t more; |
1648 | 0 | uint64_t rem, proposed_m; |
1649 | 0 | proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); |
1650 | 0 | const uint64_t e = absD - rem; |
1651 | 0 |
|
1652 | 0 | // We are going to start with a power of floor_log_2_d - 1. |
1653 | 0 | // This works if works if e < 2**floor_log_2_d. |
1654 | 0 | if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { |
1655 | 0 | // This power works |
1656 | 0 | more = (uint8_t)(floor_log_2_d - 1); |
1657 | 0 | } else { |
1658 | 0 | // We need to go one higher. This should not make proposed_m |
1659 | 0 | // overflow, but it will make it negative when interpreted as an |
1660 | 0 | // int32_t. |
1661 | 0 | proposed_m += proposed_m; |
1662 | 0 | const uint64_t twice_rem = rem + rem; |
1663 | 0 | if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; |
1664 | 0 | // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we |
1665 | 0 | // also set ADD_MARKER this is an annoying optimization that |
1666 | 0 | // enables algorithm #4 to avoid the mask. However we always set it |
1667 | 0 | // in the branchfree case |
1668 | 0 | more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); |
1669 | 0 | } |
1670 | 0 | proposed_m += 1; |
1671 | 0 | int64_t magic = (int64_t)proposed_m; |
1672 | 0 |
|
1673 | 0 | // Mark if we are negative |
1674 | 0 | if (d < 0) { |
1675 | 0 | more |= LIBDIVIDE_NEGATIVE_DIVISOR; |
1676 | 0 | if (!branchfree) { |
1677 | 0 | magic = -magic; |
1678 | 0 | } |
1679 | 0 | } |
1680 | 0 |
|
1681 | 0 | result.more = more; |
1682 | 0 | result.magic = magic; |
1683 | 0 | } |
1684 | 0 | return result; |
1685 | 0 | } |
1686 | | |
1687 | 0 | static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) { |
1688 | 0 | return libdivide_internal_s64_gen(d, 0); |
1689 | 0 | } |
1690 | | |
1691 | 0 | static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) { |
1692 | 0 | struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1); |
1693 | 0 | struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more}; |
1694 | 0 | return ret; |
1695 | 0 | } |
1696 | | |
1697 | 0 | static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) { |
1698 | 0 | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
1699 | 0 |
|
1700 | 0 | if (!magic) { // shift path |
1701 | 0 | uint64_t mask = ((uint64_t)1 << shift) - 1; |
1702 | 0 | uint64_t uq = numer + ((numer >> 63) & mask); |
1703 | 0 | int64_t q = (int64_t)uq; |
1704 | 0 | q >>= shift; |
1705 | 0 | // must be arithmetic shift and then sign-extend |
1706 | 0 | int64_t sign = (int8_t)more >> 7; |
1707 | 0 | q = (q ^ sign) - sign; |
1708 | 0 | return q; |
1709 | 0 | } else { |
1710 | 0 | uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic); |
1711 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
1712 | 0 | // must be arithmetic shift and then sign extend |
1713 | 0 | int64_t sign = (int8_t)more >> 7; |
1714 | 0 | // q += (more < 0 ? -numer : numer) |
1715 | 0 | // cast required to avoid UB |
1716 | 0 | uq += ((uint64_t)numer ^ sign) - sign; |
1717 | 0 | } |
1718 | 0 | int64_t q = (int64_t)uq; |
1719 | 0 | q >>= shift; |
1720 | 0 | q += (q < 0); |
1721 | 0 | return q; |
1722 | 0 | } |
1723 | 0 | } |
1724 | | |
1725 | 0 | static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { |
1726 | 0 | return libdivide_s64_do_raw(numer, denom->magic, denom->more); |
1727 | 0 | } |
1728 | | |
1729 | 0 | static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { |
1730 | 0 | uint8_t more = denom->more; |
1731 | 0 | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
1732 | 0 | // must be arithmetic shift and then sign extend |
1733 | 0 | int64_t sign = (int8_t)more >> 7; |
1734 | 0 | int64_t magic = denom->magic; |
1735 | 0 | int64_t q = libdivide_mullhi_s64(numer, magic); |
1736 | 0 | q += numer; |
1737 | 0 |
|
1738 | 0 | // If q is non-negative, we have nothing to do. |
1739 | 0 | // If q is negative, we want to add either (2**shift)-1 if d is a power of |
1740 | 0 | // 2, or (2**shift) if it is not a power of 2. |
1741 | 0 | uint64_t is_power_of_2 = (magic == 0); |
1742 | 0 | uint64_t q_sign = (uint64_t)(q >> 63); |
1743 | 0 | q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); |
1744 | 0 |
|
1745 | 0 | // Arithmetic right shift |
1746 | 0 | q >>= shift; |
1747 | 0 | // Negate if needed |
1748 | 0 | q = (q ^ sign) - sign; |
1749 | 0 |
|
1750 | 0 | return q; |
1751 | 0 | } |
1752 | | |
1753 | 0 | static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { |
1754 | 0 | uint8_t more = denom->more; |
1755 | 0 | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
1756 | 0 | if (denom->magic == 0) { // shift path |
1757 | 0 | uint64_t absD = (uint64_t)1 << shift; |
1758 | 0 | if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { |
1759 | 0 | absD = -absD; |
1760 | 0 | } |
1761 | 0 | return (int64_t)absD; |
1762 | 0 | } else { |
1763 | 0 | // Unsigned math is much easier |
1764 | 0 | int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); |
1765 | 0 | int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; |
1766 | 0 |
|
1767 | 0 | uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); |
1768 | 0 | uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; |
1769 | 0 | uint64_t rem_ignored; |
1770 | 0 | uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); |
1771 | 0 | int64_t result = (int64_t)(q + 1); |
1772 | 0 | if (negative_divisor) { |
1773 | 0 | result = -result; |
1774 | 0 | } |
1775 | 0 | return result; |
1776 | 0 | } |
1777 | 0 | } |
1778 | | |
1779 | 0 | static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { |
1780 | 0 | const struct libdivide_s64_t den = {denom->magic, denom->more}; |
1781 | 0 | return libdivide_s64_recover(&den); |
1782 | 0 | } |
1783 | | |
1784 | | // Simplest possible vector type division: treat the vector type as an array |
1785 | | // of underlying native type. |
1786 | | // |
1787 | | // Use a union to read a vector via pointer-to-integer, without violating strict |
1788 | | // aliasing. |
1789 | | #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ |
1790 | | const size_t count = sizeof(VecT) / sizeof(IntT); \ |
1791 | | union type_pun_vec { \ |
1792 | | VecT vec; \ |
1793 | | IntT arr[sizeof(VecT) / sizeof(IntT)]; \ |
1794 | | }; \ |
1795 | | union type_pun_vec result; \ |
1796 | | union type_pun_vec input; \ |
1797 | | input.vec = numers; \ |
1798 | | for (size_t loop = 0; loop < count; ++loop) { \ |
1799 | | result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \ |
1800 | | } \ |
1801 | | return result.vec; |
1802 | | |
1803 | | #if defined(LIBDIVIDE_NEON) |
1804 | | |
1805 | | static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128( |
1806 | | uint16x8_t numers, const struct libdivide_u16_t *denom); |
1807 | | static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128( |
1808 | | int16x8_t numers, const struct libdivide_s16_t *denom); |
1809 | | static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128( |
1810 | | uint32x4_t numers, const struct libdivide_u32_t *denom); |
1811 | | static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128( |
1812 | | int32x4_t numers, const struct libdivide_s32_t *denom); |
1813 | | static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128( |
1814 | | uint64x2_t numers, const struct libdivide_u64_t *denom); |
1815 | | static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128( |
1816 | | int64x2_t numers, const struct libdivide_s64_t *denom); |
1817 | | |
1818 | | static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128( |
1819 | | uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom); |
1820 | | static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128( |
1821 | | int16x8_t numers, const struct libdivide_s16_branchfree_t *denom); |
1822 | | static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128( |
1823 | | uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom); |
1824 | | static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128( |
1825 | | int32x4_t numers, const struct libdivide_s32_branchfree_t *denom); |
1826 | | static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128( |
1827 | | uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom); |
1828 | | static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128( |
1829 | | int64x2_t numers, const struct libdivide_s64_branchfree_t *denom); |
1830 | | |
1831 | | //////// Internal Utility Functions |
1832 | | |
1833 | | // Logical right shift by runtime value. |
1834 | | // NEON implements right shift as left shits by negative values. |
1835 | | static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) { |
1836 | | int32_t wamt = (int32_t)(amt); |
1837 | | return vshlq_u32(v, vdupq_n_s32(-wamt)); |
1838 | | } |
1839 | | |
1840 | | static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { |
1841 | | int64_t wamt = (int64_t)(amt); |
1842 | | return vshlq_u64(v, vdupq_n_s64(-wamt)); |
1843 | | } |
1844 | | |
1845 | | // Arithmetic right shift by runtime value. |
1846 | | static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { |
1847 | | int32_t wamt = (int32_t)(amt); |
1848 | | return vshlq_s32(v, vdupq_n_s32(-wamt)); |
1849 | | } |
1850 | | |
1851 | | static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { |
1852 | | int64_t wamt = (int64_t)(amt); |
1853 | | return vshlq_s64(v, vdupq_n_s64(-wamt)); |
1854 | | } |
1855 | | |
1856 | | static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } |
1857 | | |
1858 | | static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { |
1859 | | // Desire is [x0, x1, x2, x3] |
1860 | | uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1] |
1861 | | uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); //[_, x2, _, x3] |
1862 | | return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3] |
1863 | | } |
1864 | | |
1865 | | static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { |
1866 | | int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1] |
1867 | | int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); //[_, x2, _, x3] |
1868 | | return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3] |
1869 | | } |
1870 | | |
1871 | | static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { |
1872 | | // full 128 bits product is: |
1873 | | // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) |
1874 | | // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. |
1875 | | |
1876 | | // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits. |
1877 | | uint64x2_t y = vdupq_n_u64(sy); |
1878 | | uint32x2_t x0 = vmovn_u64(x); |
1879 | | uint32x2_t y0 = vmovn_u64(y); |
1880 | | uint32x2_t x1 = vshrn_n_u64(x, 32); |
1881 | | uint32x2_t y1 = vshrn_n_u64(y, 32); |
1882 | | |
1883 | | // Compute x0*y0. |
1884 | | uint64x2_t x0y0 = vmull_u32(x0, y0); |
1885 | | uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32); |
1886 | | |
1887 | | // Compute other intermediate products. |
1888 | | uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0); // temp = x0y0_hi + x1*y0; |
1889 | | // We want to split temp into its low 32 bits and high 32 bits, both |
1890 | | // in the low half of 64 bit registers. |
1891 | | // Use shifts to avoid needing a reg for the mask. |
1892 | | uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32); // temp_lo = temp & 0xFFFFFFFF; |
1893 | | uint64x2_t temp_hi = vshrq_n_u64(temp, 32); // temp_hi = temp >> 32; |
1894 | | |
1895 | | temp_lo = vmlal_u32(temp_lo, x0, y1); // temp_lo += x0*y0 |
1896 | | temp_lo = vshrq_n_u64(temp_lo, 32); // temp_lo >>= 32 |
1897 | | temp_hi = vmlal_u32(temp_hi, x1, y1); // temp_hi += x1*y1 |
1898 | | uint64x2_t result = vaddq_u64(temp_hi, temp_lo); |
1899 | | return result; |
1900 | | } |
1901 | | |
1902 | | static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { |
1903 | | int64x2_t p = vreinterpretq_s64_u64( |
1904 | | libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); |
1905 | | int64x2_t y = vdupq_n_s64(sy); |
1906 | | int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); |
1907 | | int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); |
1908 | | p = vsubq_s64(p, t1); |
1909 | | p = vsubq_s64(p, t2); |
1910 | | return p; |
1911 | | } |
1912 | | |
1913 | | ////////// UINT16 |
1914 | | |
1915 | | uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){ |
1916 | | SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)} |
1917 | | |
1918 | | uint16x8_t libdivide_u16_branchfree_do_vec128( |
1919 | | uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){ |
1920 | | SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)} |
1921 | | |
1922 | | ////////// UINT32 |
1923 | | |
1924 | | uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { |
1925 | | uint8_t more = denom->more; |
1926 | | if (!denom->magic) { |
1927 | | return libdivide_u32_neon_srl(numers, more); |
1928 | | } else { |
1929 | | uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); |
1930 | | if (more & LIBDIVIDE_ADD_MARKER) { |
1931 | | // uint32_t t = ((numer - q) >> 1) + q; |
1932 | | // return t >> denom->shift; |
1933 | | // Note we can use halving-subtract to avoid the shift. |
1934 | | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1935 | | uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); |
1936 | | return libdivide_u32_neon_srl(t, shift); |
1937 | | } else { |
1938 | | return libdivide_u32_neon_srl(q, more); |
1939 | | } |
1940 | | } |
1941 | | } |
1942 | | |
1943 | | uint32x4_t libdivide_u32_branchfree_do_vec128( |
1944 | | uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) { |
1945 | | uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic); |
1946 | | uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q); |
1947 | | return libdivide_u32_neon_srl(t, denom->more); |
1948 | | } |
1949 | | |
1950 | | ////////// UINT64 |
1951 | | |
1952 | | uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) { |
1953 | | uint8_t more = denom->more; |
1954 | | if (!denom->magic) { |
1955 | | return libdivide_u64_neon_srl(numers, more); |
1956 | | } else { |
1957 | | uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); |
1958 | | if (more & LIBDIVIDE_ADD_MARKER) { |
1959 | | // uint32_t t = ((numer - q) >> 1) + q; |
1960 | | // return t >> denom->shift; |
1961 | | // No 64-bit halving subtracts in NEON :( |
1962 | | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
1963 | | uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); |
1964 | | return libdivide_u64_neon_srl(t, shift); |
1965 | | } else { |
1966 | | return libdivide_u64_neon_srl(q, more); |
1967 | | } |
1968 | | } |
1969 | | } |
1970 | | |
1971 | | uint64x2_t libdivide_u64_branchfree_do_vec128( |
1972 | | uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) { |
1973 | | uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic); |
1974 | | uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q); |
1975 | | return libdivide_u64_neon_srl(t, denom->more); |
1976 | | } |
1977 | | |
1978 | | ////////// SINT16 |
1979 | | |
1980 | | int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){ |
1981 | | SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)} |
1982 | | |
1983 | | int16x8_t libdivide_s16_branchfree_do_vec128( |
1984 | | int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){ |
1985 | | SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)} |
1986 | | |
1987 | | ////////// SINT32 |
1988 | | |
1989 | | int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { |
1990 | | uint8_t more = denom->more; |
1991 | | if (!denom->magic) { |
1992 | | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
1993 | | uint32_t mask = ((uint32_t)1 << shift) - 1; |
1994 | | int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); |
1995 | | // q = numer + ((numer >> 31) & roundToZeroTweak); |
1996 | | int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); |
1997 | | q = libdivide_s32_neon_sra(q, shift); |
1998 | | int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); |
1999 | | // q = (q ^ sign) - sign; |
2000 | | q = vsubq_s32(veorq_s32(q, sign), sign); |
2001 | | return q; |
2002 | | } else { |
2003 | | int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic); |
2004 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2005 | | // must be arithmetic shift |
2006 | | int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); |
2007 | | // q += ((numer ^ sign) - sign); |
2008 | | q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign)); |
2009 | | } |
2010 | | // q >>= shift |
2011 | | q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK); |
2012 | | q = vaddq_s32( |
2013 | | q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31))); // q += (q < 0) |
2014 | | return q; |
2015 | | } |
2016 | | } |
2017 | | |
2018 | | int32x4_t libdivide_s32_branchfree_do_vec128( |
2019 | | int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) { |
2020 | | int32_t magic = denom->magic; |
2021 | | uint8_t more = denom->more; |
2022 | | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2023 | | // must be arithmetic shift |
2024 | | int32x4_t sign = vdupq_n_s32((int8_t)more >> 7); |
2025 | | int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic); |
2026 | | q = vaddq_s32(q, numers); // q += numers |
2027 | | |
2028 | | // If q is non-negative, we have nothing to do |
2029 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2030 | | // a power of 2, or (2**shift) if it is not a power of 2 |
2031 | | uint32_t is_power_of_2 = (magic == 0); |
2032 | | int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 |
2033 | | int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); |
2034 | | q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) |
2035 | | q = libdivide_s32_neon_sra(q, shift); // q >>= shift |
2036 | | q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign |
2037 | | return q; |
2038 | | } |
2039 | | |
2040 | | ////////// SINT64 |
2041 | | |
2042 | | int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) { |
2043 | | uint8_t more = denom->more; |
2044 | | int64_t magic = denom->magic; |
2045 | | if (magic == 0) { // shift path |
2046 | | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2047 | | uint64_t mask = ((uint64_t)1 << shift) - 1; |
2048 | | int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend |
2049 | | // q = numer + ((numer >> 63) & roundToZeroTweak); |
2050 | | int64x2_t q = |
2051 | | vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak)); |
2052 | | q = libdivide_s64_neon_sra(q, shift); |
2053 | | // q = (q ^ sign) - sign; |
2054 | | int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7)); |
2055 | | q = vsubq_s64(veorq_s64(q, sign), sign); |
2056 | | return q; |
2057 | | } else { |
2058 | | int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); |
2059 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2060 | | // must be arithmetic shift |
2061 | | int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: no need to widen |
2062 | | // q += ((numer ^ sign) - sign); |
2063 | | q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign)); |
2064 | | } |
2065 | | // q >>= denom->mult_path.shift |
2066 | | q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK); |
2067 | | q = vaddq_s64( |
2068 | | q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63))); // q += (q < 0) |
2069 | | return q; |
2070 | | } |
2071 | | } |
2072 | | |
2073 | | int64x2_t libdivide_s64_branchfree_do_vec128( |
2074 | | int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) { |
2075 | | int64_t magic = denom->magic; |
2076 | | uint8_t more = denom->more; |
2077 | | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2078 | | // must be arithmetic shift |
2079 | | int64x2_t sign = vdupq_n_s64((int8_t)more >> 7); // TODO: avoid sign extend |
2080 | | |
2081 | | // libdivide_mullhi_s64(numers, magic); |
2082 | | int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic); |
2083 | | q = vaddq_s64(q, numers); // q += numers |
2084 | | |
2085 | | // If q is non-negative, we have nothing to do. |
2086 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2087 | | // a power of 2, or (2**shift) if it is not a power of 2. |
2088 | | uint32_t is_power_of_2 = (magic == 0); |
2089 | | int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 |
2090 | | int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); |
2091 | | q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) |
2092 | | q = libdivide_s64_neon_sra(q, shift); // q >>= shift |
2093 | | q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign |
2094 | | return q; |
2095 | | } |
2096 | | |
2097 | | #endif |
2098 | | |
2099 | | #if defined(LIBDIVIDE_AVX512) |
2100 | | |
2101 | | static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( |
2102 | | __m512i numers, const struct libdivide_u16_t *denom); |
2103 | | static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( |
2104 | | __m512i numers, const struct libdivide_s16_t *denom); |
2105 | | static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( |
2106 | | __m512i numers, const struct libdivide_u32_t *denom); |
2107 | | static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( |
2108 | | __m512i numers, const struct libdivide_s32_t *denom); |
2109 | | static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( |
2110 | | __m512i numers, const struct libdivide_u64_t *denom); |
2111 | | static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( |
2112 | | __m512i numers, const struct libdivide_s64_t *denom); |
2113 | | |
2114 | | static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( |
2115 | | __m512i numers, const struct libdivide_u16_branchfree_t *denom); |
2116 | | static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( |
2117 | | __m512i numers, const struct libdivide_s16_branchfree_t *denom); |
2118 | | static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( |
2119 | | __m512i numers, const struct libdivide_u32_branchfree_t *denom); |
2120 | | static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( |
2121 | | __m512i numers, const struct libdivide_s32_branchfree_t *denom); |
2122 | | static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( |
2123 | | __m512i numers, const struct libdivide_u64_branchfree_t *denom); |
2124 | | static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( |
2125 | | __m512i numers, const struct libdivide_s64_branchfree_t *denom); |
2126 | | |
2127 | | //////// Internal Utility Functions |
2128 | | |
2129 | | static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { |
2130 | | ; |
2131 | | return _mm512_srai_epi64(v, 63); |
2132 | | } |
2133 | | |
2134 | | static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { |
2135 | | return _mm512_srai_epi64(v, amt); |
2136 | | } |
2137 | | |
2138 | | // Here, b is assumed to contain one 32-bit value repeated. |
2139 | | static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { |
2140 | | __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); |
2141 | | __m512i a1X3X = _mm512_srli_epi64(a, 32); |
2142 | | __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); |
2143 | | __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask); |
2144 | | return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); |
2145 | | } |
2146 | | |
2147 | | // b is one 32-bit value repeated. |
2148 | | static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { |
2149 | | __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); |
2150 | | __m512i a1X3X = _mm512_srli_epi64(a, 32); |
2151 | | __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); |
2152 | | __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask); |
2153 | | return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); |
2154 | | } |
2155 | | |
2156 | | // Here, y is assumed to contain one 64-bit value repeated. |
2157 | | static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { |
2158 | | // see m128i variant for comments. |
2159 | | __m512i x0y0 = _mm512_mul_epu32(x, y); |
2160 | | __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); |
2161 | | |
2162 | | __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); |
2163 | | __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1)); |
2164 | | |
2165 | | __m512i x0y1 = _mm512_mul_epu32(x, y1); |
2166 | | __m512i x1y0 = _mm512_mul_epu32(x1, y); |
2167 | | __m512i x1y1 = _mm512_mul_epu32(x1, y1); |
2168 | | |
2169 | | __m512i mask = _mm512_set1_epi64(0xFFFFFFFF); |
2170 | | __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi); |
2171 | | __m512i temp_lo = _mm512_and_si512(temp, mask); |
2172 | | __m512i temp_hi = _mm512_srli_epi64(temp, 32); |
2173 | | |
2174 | | temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32); |
2175 | | temp_hi = _mm512_add_epi64(x1y1, temp_hi); |
2176 | | return _mm512_add_epi64(temp_lo, temp_hi); |
2177 | | } |
2178 | | |
2179 | | // y is one 64-bit value repeated. |
2180 | | static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { |
2181 | | __m512i p = libdivide_mullhi_u64_vec512(x, y); |
2182 | | __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); |
2183 | | __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); |
2184 | | p = _mm512_sub_epi64(p, t1); |
2185 | | p = _mm512_sub_epi64(p, t2); |
2186 | | return p; |
2187 | | } |
2188 | | |
2189 | | ////////// UINT16 |
2190 | | |
2191 | | __m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){ |
2192 | | SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)} |
2193 | | |
2194 | | __m512i libdivide_u16_branchfree_do_vec512( |
2195 | | __m512i numers, const struct libdivide_u16_branchfree_t *denom){ |
2196 | | SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)} |
2197 | | |
2198 | | ////////// UINT32 |
2199 | | |
2200 | | __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { |
2201 | | uint8_t more = denom->more; |
2202 | | if (!denom->magic) { |
2203 | | return _mm512_srli_epi32(numers, more); |
2204 | | } else { |
2205 | | __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); |
2206 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2207 | | // uint32_t t = ((numer - q) >> 1) + q; |
2208 | | // return t >> denom->shift; |
2209 | | uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2210 | | __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); |
2211 | | return _mm512_srli_epi32(t, shift); |
2212 | | } else { |
2213 | | return _mm512_srli_epi32(q, more); |
2214 | | } |
2215 | | } |
2216 | | } |
2217 | | |
2218 | | __m512i libdivide_u32_branchfree_do_vec512( |
2219 | | __m512i numers, const struct libdivide_u32_branchfree_t *denom) { |
2220 | | __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic)); |
2221 | | __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); |
2222 | | return _mm512_srli_epi32(t, denom->more); |
2223 | | } |
2224 | | |
2225 | | ////////// UINT64 |
2226 | | |
2227 | | __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) { |
2228 | | uint8_t more = denom->more; |
2229 | | if (!denom->magic) { |
2230 | | return _mm512_srli_epi64(numers, more); |
2231 | | } else { |
2232 | | __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); |
2233 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2234 | | // uint32_t t = ((numer - q) >> 1) + q; |
2235 | | // return t >> denom->shift; |
2236 | | uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2237 | | __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); |
2238 | | return _mm512_srli_epi64(t, shift); |
2239 | | } else { |
2240 | | return _mm512_srli_epi64(q, more); |
2241 | | } |
2242 | | } |
2243 | | } |
2244 | | |
2245 | | __m512i libdivide_u64_branchfree_do_vec512( |
2246 | | __m512i numers, const struct libdivide_u64_branchfree_t *denom) { |
2247 | | __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic)); |
2248 | | __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); |
2249 | | return _mm512_srli_epi64(t, denom->more); |
2250 | | } |
2251 | | |
2252 | | ////////// SINT16 |
2253 | | |
2254 | | __m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){ |
2255 | | SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)} |
2256 | | |
2257 | | __m512i libdivide_s16_branchfree_do_vec512( |
2258 | | __m512i numers, const struct libdivide_s16_branchfree_t *denom){ |
2259 | | SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)} |
2260 | | |
2261 | | ////////// SINT32 |
2262 | | |
2263 | | __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { |
2264 | | uint8_t more = denom->more; |
2265 | | if (!denom->magic) { |
2266 | | uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2267 | | uint32_t mask = ((uint32_t)1 << shift) - 1; |
2268 | | __m512i roundToZeroTweak = _mm512_set1_epi32(mask); |
2269 | | // q = numer + ((numer >> 31) & roundToZeroTweak); |
2270 | | __m512i q = _mm512_add_epi32( |
2271 | | numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); |
2272 | | q = _mm512_srai_epi32(q, shift); |
2273 | | __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); |
2274 | | // q = (q ^ sign) - sign; |
2275 | | q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); |
2276 | | return q; |
2277 | | } else { |
2278 | | __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic)); |
2279 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2280 | | // must be arithmetic shift |
2281 | | __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); |
2282 | | // q += ((numer ^ sign) - sign); |
2283 | | q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); |
2284 | | } |
2285 | | // q >>= shift |
2286 | | q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); |
2287 | | q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) |
2288 | | return q; |
2289 | | } |
2290 | | } |
2291 | | |
2292 | | __m512i libdivide_s32_branchfree_do_vec512( |
2293 | | __m512i numers, const struct libdivide_s32_branchfree_t *denom) { |
2294 | | int32_t magic = denom->magic; |
2295 | | uint8_t more = denom->more; |
2296 | | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2297 | | // must be arithmetic shift |
2298 | | __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); |
2299 | | __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic)); |
2300 | | q = _mm512_add_epi32(q, numers); // q += numers |
2301 | | |
2302 | | // If q is non-negative, we have nothing to do |
2303 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2304 | | // a power of 2, or (2**shift) if it is not a power of 2 |
2305 | | uint32_t is_power_of_2 = (magic == 0); |
2306 | | __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 |
2307 | | __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); |
2308 | | q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) |
2309 | | q = _mm512_srai_epi32(q, shift); // q >>= shift |
2310 | | q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign |
2311 | | return q; |
2312 | | } |
2313 | | |
2314 | | ////////// SINT64 |
2315 | | |
2316 | | __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) { |
2317 | | uint8_t more = denom->more; |
2318 | | int64_t magic = denom->magic; |
2319 | | if (magic == 0) { // shift path |
2320 | | uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2321 | | uint64_t mask = ((uint64_t)1 << shift) - 1; |
2322 | | __m512i roundToZeroTweak = _mm512_set1_epi64(mask); |
2323 | | // q = numer + ((numer >> 63) & roundToZeroTweak); |
2324 | | __m512i q = _mm512_add_epi64( |
2325 | | numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak)); |
2326 | | q = libdivide_s64_shift_right_vec512(q, shift); |
2327 | | __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); |
2328 | | // q = (q ^ sign) - sign; |
2329 | | q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); |
2330 | | return q; |
2331 | | } else { |
2332 | | __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); |
2333 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2334 | | // must be arithmetic shift |
2335 | | __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); |
2336 | | // q += ((numer ^ sign) - sign); |
2337 | | q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); |
2338 | | } |
2339 | | // q >>= denom->mult_path.shift |
2340 | | q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK); |
2341 | | q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) |
2342 | | return q; |
2343 | | } |
2344 | | } |
2345 | | |
2346 | | __m512i libdivide_s64_branchfree_do_vec512( |
2347 | | __m512i numers, const struct libdivide_s64_branchfree_t *denom) { |
2348 | | int64_t magic = denom->magic; |
2349 | | uint8_t more = denom->more; |
2350 | | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2351 | | // must be arithmetic shift |
2352 | | __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); |
2353 | | |
2354 | | // libdivide_mullhi_s64(numers, magic); |
2355 | | __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic)); |
2356 | | q = _mm512_add_epi64(q, numers); // q += numers |
2357 | | |
2358 | | // If q is non-negative, we have nothing to do. |
2359 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2360 | | // a power of 2, or (2**shift) if it is not a power of 2. |
2361 | | uint32_t is_power_of_2 = (magic == 0); |
2362 | | __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 |
2363 | | __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); |
2364 | | q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) |
2365 | | q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift |
2366 | | q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign |
2367 | | return q; |
2368 | | } |
2369 | | |
2370 | | #endif |
2371 | | |
2372 | | #if defined(LIBDIVIDE_AVX2) |
2373 | | |
2374 | | static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( |
2375 | | __m256i numers, const struct libdivide_u16_t *denom); |
2376 | | static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( |
2377 | | __m256i numers, const struct libdivide_s16_t *denom); |
2378 | | static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( |
2379 | | __m256i numers, const struct libdivide_u32_t *denom); |
2380 | | static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( |
2381 | | __m256i numers, const struct libdivide_s32_t *denom); |
2382 | | static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( |
2383 | | __m256i numers, const struct libdivide_u64_t *denom); |
2384 | | static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( |
2385 | | __m256i numers, const struct libdivide_s64_t *denom); |
2386 | | |
2387 | | static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( |
2388 | | __m256i numers, const struct libdivide_u16_branchfree_t *denom); |
2389 | | static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( |
2390 | | __m256i numers, const struct libdivide_s16_branchfree_t *denom); |
2391 | | static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( |
2392 | | __m256i numers, const struct libdivide_u32_branchfree_t *denom); |
2393 | | static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( |
2394 | | __m256i numers, const struct libdivide_s32_branchfree_t *denom); |
2395 | | static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( |
2396 | | __m256i numers, const struct libdivide_u64_branchfree_t *denom); |
2397 | | static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( |
2398 | | __m256i numers, const struct libdivide_s64_branchfree_t *denom); |
2399 | | |
2400 | | //////// Internal Utility Functions |
2401 | | |
2402 | | // Implementation of _mm256_srai_epi64(v, 63) (from AVX512). |
2403 | | static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { |
2404 | | __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); |
2405 | | __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); |
2406 | | return signBits; |
2407 | | } |
2408 | | |
2409 | | // Implementation of _mm256_srai_epi64 (from AVX512). |
2410 | | static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { |
2411 | | const int b = 64 - amt; |
2412 | | __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); |
2413 | | __m256i x = _mm256_srli_epi64(v, amt); |
2414 | | __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); |
2415 | | return result; |
2416 | | } |
2417 | | |
2418 | | // Here, b is assumed to contain one 32-bit value repeated. |
2419 | | static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { |
2420 | | __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); |
2421 | | __m256i a1X3X = _mm256_srli_epi64(a, 32); |
2422 | | __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); |
2423 | | __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask); |
2424 | | return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); |
2425 | | } |
2426 | | |
2427 | | // b is one 32-bit value repeated. |
2428 | | static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { |
2429 | | __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); |
2430 | | __m256i a1X3X = _mm256_srli_epi64(a, 32); |
2431 | | __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); |
2432 | | __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask); |
2433 | | return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); |
2434 | | } |
2435 | | |
2436 | | // Here, y is assumed to contain one 64-bit value repeated. |
2437 | | static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { |
2438 | | // see m128i variant for comments. |
2439 | | __m256i x0y0 = _mm256_mul_epu32(x, y); |
2440 | | __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); |
2441 | | |
2442 | | __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); |
2443 | | __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); |
2444 | | |
2445 | | __m256i x0y1 = _mm256_mul_epu32(x, y1); |
2446 | | __m256i x1y0 = _mm256_mul_epu32(x1, y); |
2447 | | __m256i x1y1 = _mm256_mul_epu32(x1, y1); |
2448 | | |
2449 | | __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF); |
2450 | | __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi); |
2451 | | __m256i temp_lo = _mm256_and_si256(temp, mask); |
2452 | | __m256i temp_hi = _mm256_srli_epi64(temp, 32); |
2453 | | |
2454 | | temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32); |
2455 | | temp_hi = _mm256_add_epi64(x1y1, temp_hi); |
2456 | | return _mm256_add_epi64(temp_lo, temp_hi); |
2457 | | } |
2458 | | |
2459 | | // y is one 64-bit value repeated. |
2460 | | static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { |
2461 | | __m256i p = libdivide_mullhi_u64_vec256(x, y); |
2462 | | __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); |
2463 | | __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); |
2464 | | p = _mm256_sub_epi64(p, t1); |
2465 | | p = _mm256_sub_epi64(p, t2); |
2466 | | return p; |
2467 | | } |
2468 | | |
2469 | | ////////// UINT16 |
2470 | | |
2471 | | __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { |
2472 | | uint8_t more = denom->more; |
2473 | | if (!denom->magic) { |
2474 | | return _mm256_srli_epi16(numers, more); |
2475 | | } else { |
2476 | | __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); |
2477 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2478 | | __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); |
2479 | | return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); |
2480 | | } else { |
2481 | | return _mm256_srli_epi16(q, more); |
2482 | | } |
2483 | | } |
2484 | | } |
2485 | | |
2486 | | __m256i libdivide_u16_branchfree_do_vec256( |
2487 | | __m256i numers, const struct libdivide_u16_branchfree_t *denom) { |
2488 | | __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); |
2489 | | __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); |
2490 | | return _mm256_srli_epi16(t, denom->more); |
2491 | | } |
2492 | | |
2493 | | ////////// UINT32 |
2494 | | |
2495 | | __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { |
2496 | | uint8_t more = denom->more; |
2497 | | if (!denom->magic) { |
2498 | | return _mm256_srli_epi32(numers, more); |
2499 | | } else { |
2500 | | __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); |
2501 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2502 | | // uint32_t t = ((numer - q) >> 1) + q; |
2503 | | // return t >> denom->shift; |
2504 | | uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2505 | | __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); |
2506 | | return _mm256_srli_epi32(t, shift); |
2507 | | } else { |
2508 | | return _mm256_srli_epi32(q, more); |
2509 | | } |
2510 | | } |
2511 | | } |
2512 | | |
2513 | | __m256i libdivide_u32_branchfree_do_vec256( |
2514 | | __m256i numers, const struct libdivide_u32_branchfree_t *denom) { |
2515 | | __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic)); |
2516 | | __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); |
2517 | | return _mm256_srli_epi32(t, denom->more); |
2518 | | } |
2519 | | |
2520 | | ////////// UINT64 |
2521 | | |
2522 | | __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) { |
2523 | | uint8_t more = denom->more; |
2524 | | if (!denom->magic) { |
2525 | | return _mm256_srli_epi64(numers, more); |
2526 | | } else { |
2527 | | __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); |
2528 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2529 | | // uint32_t t = ((numer - q) >> 1) + q; |
2530 | | // return t >> denom->shift; |
2531 | | uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2532 | | __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); |
2533 | | return _mm256_srli_epi64(t, shift); |
2534 | | } else { |
2535 | | return _mm256_srli_epi64(q, more); |
2536 | | } |
2537 | | } |
2538 | | } |
2539 | | |
2540 | | __m256i libdivide_u64_branchfree_do_vec256( |
2541 | | __m256i numers, const struct libdivide_u64_branchfree_t *denom) { |
2542 | | __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic)); |
2543 | | __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); |
2544 | | return _mm256_srli_epi64(t, denom->more); |
2545 | | } |
2546 | | |
2547 | | ////////// SINT16 |
2548 | | |
2549 | | __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { |
2550 | | uint8_t more = denom->more; |
2551 | | if (!denom->magic) { |
2552 | | uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
2553 | | uint16_t mask = ((uint16_t)1 << shift) - 1; |
2554 | | __m256i roundToZeroTweak = _mm256_set1_epi16(mask); |
2555 | | // q = numer + ((numer >> 15) & roundToZeroTweak); |
2556 | | __m256i q = _mm256_add_epi16( |
2557 | | numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak)); |
2558 | | q = _mm256_srai_epi16(q, shift); |
2559 | | __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); |
2560 | | // q = (q ^ sign) - sign; |
2561 | | q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); |
2562 | | return q; |
2563 | | } else { |
2564 | | __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic)); |
2565 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2566 | | // must be arithmetic shift |
2567 | | __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); |
2568 | | // q += ((numer ^ sign) - sign); |
2569 | | q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign)); |
2570 | | } |
2571 | | // q >>= shift |
2572 | | q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); |
2573 | | q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0) |
2574 | | return q; |
2575 | | } |
2576 | | } |
2577 | | |
2578 | | __m256i libdivide_s16_branchfree_do_vec256( |
2579 | | __m256i numers, const struct libdivide_s16_branchfree_t *denom) { |
2580 | | int16_t magic = denom->magic; |
2581 | | uint8_t more = denom->more; |
2582 | | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
2583 | | // must be arithmetic shift |
2584 | | __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); |
2585 | | __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic)); |
2586 | | q = _mm256_add_epi16(q, numers); // q += numers |
2587 | | |
2588 | | // If q is non-negative, we have nothing to do |
2589 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2590 | | // a power of 2, or (2**shift) if it is not a power of 2 |
2591 | | uint16_t is_power_of_2 = (magic == 0); |
2592 | | __m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15 |
2593 | | __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); |
2594 | | q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) |
2595 | | q = _mm256_srai_epi16(q, shift); // q >>= shift |
2596 | | q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign |
2597 | | return q; |
2598 | | } |
2599 | | |
2600 | | ////////// SINT32 |
2601 | | |
2602 | | __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { |
2603 | | uint8_t more = denom->more; |
2604 | | if (!denom->magic) { |
2605 | | uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2606 | | uint32_t mask = ((uint32_t)1 << shift) - 1; |
2607 | | __m256i roundToZeroTweak = _mm256_set1_epi32(mask); |
2608 | | // q = numer + ((numer >> 31) & roundToZeroTweak); |
2609 | | __m256i q = _mm256_add_epi32( |
2610 | | numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); |
2611 | | q = _mm256_srai_epi32(q, shift); |
2612 | | __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); |
2613 | | // q = (q ^ sign) - sign; |
2614 | | q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); |
2615 | | return q; |
2616 | | } else { |
2617 | | __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic)); |
2618 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2619 | | // must be arithmetic shift |
2620 | | __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); |
2621 | | // q += ((numer ^ sign) - sign); |
2622 | | q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); |
2623 | | } |
2624 | | // q >>= shift |
2625 | | q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); |
2626 | | q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) |
2627 | | return q; |
2628 | | } |
2629 | | } |
2630 | | |
2631 | | __m256i libdivide_s32_branchfree_do_vec256( |
2632 | | __m256i numers, const struct libdivide_s32_branchfree_t *denom) { |
2633 | | int32_t magic = denom->magic; |
2634 | | uint8_t more = denom->more; |
2635 | | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2636 | | // must be arithmetic shift |
2637 | | __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); |
2638 | | __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic)); |
2639 | | q = _mm256_add_epi32(q, numers); // q += numers |
2640 | | |
2641 | | // If q is non-negative, we have nothing to do |
2642 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2643 | | // a power of 2, or (2**shift) if it is not a power of 2 |
2644 | | uint32_t is_power_of_2 = (magic == 0); |
2645 | | __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 |
2646 | | __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); |
2647 | | q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) |
2648 | | q = _mm256_srai_epi32(q, shift); // q >>= shift |
2649 | | q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign |
2650 | | return q; |
2651 | | } |
2652 | | |
2653 | | ////////// SINT64 |
2654 | | |
2655 | | __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) { |
2656 | | uint8_t more = denom->more; |
2657 | | int64_t magic = denom->magic; |
2658 | | if (magic == 0) { // shift path |
2659 | | uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2660 | | uint64_t mask = ((uint64_t)1 << shift) - 1; |
2661 | | __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); |
2662 | | // q = numer + ((numer >> 63) & roundToZeroTweak); |
2663 | | __m256i q = _mm256_add_epi64( |
2664 | | numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); |
2665 | | q = libdivide_s64_shift_right_vec256(q, shift); |
2666 | | __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); |
2667 | | // q = (q ^ sign) - sign; |
2668 | | q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); |
2669 | | return q; |
2670 | | } else { |
2671 | | __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); |
2672 | | if (more & LIBDIVIDE_ADD_MARKER) { |
2673 | | // must be arithmetic shift |
2674 | | __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); |
2675 | | // q += ((numer ^ sign) - sign); |
2676 | | q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); |
2677 | | } |
2678 | | // q >>= denom->mult_path.shift |
2679 | | q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK); |
2680 | | q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) |
2681 | | return q; |
2682 | | } |
2683 | | } |
2684 | | |
2685 | | __m256i libdivide_s64_branchfree_do_vec256( |
2686 | | __m256i numers, const struct libdivide_s64_branchfree_t *denom) { |
2687 | | int64_t magic = denom->magic; |
2688 | | uint8_t more = denom->more; |
2689 | | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2690 | | // must be arithmetic shift |
2691 | | __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); |
2692 | | |
2693 | | // libdivide_mullhi_s64(numers, magic); |
2694 | | __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic)); |
2695 | | q = _mm256_add_epi64(q, numers); // q += numers |
2696 | | |
2697 | | // If q is non-negative, we have nothing to do. |
2698 | | // If q is negative, we want to add either (2**shift)-1 if d is |
2699 | | // a power of 2, or (2**shift) if it is not a power of 2. |
2700 | | uint32_t is_power_of_2 = (magic == 0); |
2701 | | __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 |
2702 | | __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); |
2703 | | q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) |
2704 | | q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift |
2705 | | q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign |
2706 | | return q; |
2707 | | } |
2708 | | |
2709 | | #endif |
2710 | | |
2711 | | #if defined(LIBDIVIDE_SSE2) |
2712 | | |
2713 | | static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( |
2714 | | __m128i numers, const struct libdivide_u16_t *denom); |
2715 | | static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( |
2716 | | __m128i numers, const struct libdivide_s16_t *denom); |
2717 | | static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( |
2718 | | __m128i numers, const struct libdivide_u32_t *denom); |
2719 | | static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( |
2720 | | __m128i numers, const struct libdivide_s32_t *denom); |
2721 | | static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( |
2722 | | __m128i numers, const struct libdivide_u64_t *denom); |
2723 | | static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( |
2724 | | __m128i numers, const struct libdivide_s64_t *denom); |
2725 | | |
2726 | | static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( |
2727 | | __m128i numers, const struct libdivide_u16_branchfree_t *denom); |
2728 | | static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( |
2729 | | __m128i numers, const struct libdivide_s16_branchfree_t *denom); |
2730 | | static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( |
2731 | | __m128i numers, const struct libdivide_u32_branchfree_t *denom); |
2732 | | static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( |
2733 | | __m128i numers, const struct libdivide_s32_branchfree_t *denom); |
2734 | | static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( |
2735 | | __m128i numers, const struct libdivide_u64_branchfree_t *denom); |
2736 | | static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( |
2737 | | __m128i numers, const struct libdivide_s64_branchfree_t *denom); |
2738 | | |
2739 | | //////// Internal Utility Functions |
2740 | | |
2741 | | // Implementation of _mm_srai_epi64(v, 63) (from AVX512). |
2742 | 0 | static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { |
2743 | 0 | __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); |
2744 | 0 | __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); |
2745 | 0 | return signBits; |
2746 | 0 | } |
2747 | | |
2748 | | // Implementation of _mm_srai_epi64 (from AVX512). |
2749 | 0 | static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { |
2750 | 0 | const int b = 64 - amt; |
2751 | 0 | __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); |
2752 | 0 | __m128i x = _mm_srli_epi64(v, amt); |
2753 | 0 | __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); |
2754 | 0 | return result; |
2755 | 0 | } |
2756 | | |
2757 | | // Here, b is assumed to contain one 32-bit value repeated. |
2758 | 0 | static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { |
2759 | 0 | __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); |
2760 | 0 | __m128i a1X3X = _mm_srli_epi64(a, 32); |
2761 | 0 | __m128i mask = _mm_set_epi32(-1, 0, -1, 0); |
2762 | 0 | __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); |
2763 | 0 | return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); |
2764 | 0 | } |
2765 | | |
2766 | | // SSE2 does not have a signed multiplication instruction, but we can convert |
2767 | | // unsigned to signed pretty efficiently. Again, b is just a 32 bit value |
2768 | | // repeated four times. |
2769 | 0 | static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { |
2770 | 0 | __m128i p = libdivide_mullhi_u32_vec128(a, b); |
2771 | 0 | // t1 = (a >> 31) & y, arithmetic shift |
2772 | 0 | __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); |
2773 | 0 | __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); |
2774 | 0 | p = _mm_sub_epi32(p, t1); |
2775 | 0 | p = _mm_sub_epi32(p, t2); |
2776 | 0 | return p; |
2777 | 0 | } |
2778 | | |
2779 | | // Here, y is assumed to contain one 64-bit value repeated. |
2780 | 0 | static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { |
2781 | 0 | // full 128 bits product is: |
2782 | 0 | // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) |
2783 | 0 | // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. |
2784 | 0 |
|
2785 | 0 | // Compute x0*y0. |
2786 | 0 | // Note x1, y1 are ignored by mul_epu32. |
2787 | 0 | __m128i x0y0 = _mm_mul_epu32(x, y); |
2788 | 0 | __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32); |
2789 | 0 |
|
2790 | 0 | // Get x1, y1 in the low bits. |
2791 | 0 | // We could shuffle or right shift. Shuffles are preferred as they preserve |
2792 | 0 | // the source register for the next computation. |
2793 | 0 | __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1)); |
2794 | 0 | __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1)); |
2795 | 0 |
|
2796 | 0 | // No need to mask off top 32 bits for mul_epu32. |
2797 | 0 | __m128i x0y1 = _mm_mul_epu32(x, y1); |
2798 | 0 | __m128i x1y0 = _mm_mul_epu32(x1, y); |
2799 | 0 | __m128i x1y1 = _mm_mul_epu32(x1, y1); |
2800 | 0 |
|
2801 | 0 | // Mask here selects low bits only. |
2802 | 0 | __m128i mask = _mm_set1_epi64x(0xFFFFFFFF); |
2803 | 0 | __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); |
2804 | 0 | __m128i temp_lo = _mm_and_si128(temp, mask); |
2805 | 0 | __m128i temp_hi = _mm_srli_epi64(temp, 32); |
2806 | 0 |
|
2807 | 0 | temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); |
2808 | 0 | temp_hi = _mm_add_epi64(x1y1, temp_hi); |
2809 | 0 | return _mm_add_epi64(temp_lo, temp_hi); |
2810 | 0 | } |
2811 | | |
2812 | | // y is one 64-bit value repeated. |
2813 | 0 | static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { |
2814 | 0 | __m128i p = libdivide_mullhi_u64_vec128(x, y); |
2815 | 0 | __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); |
2816 | 0 | __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); |
2817 | 0 | p = _mm_sub_epi64(p, t1); |
2818 | 0 | p = _mm_sub_epi64(p, t2); |
2819 | 0 | return p; |
2820 | 0 | } |
2821 | | |
2822 | | ////////// UINT26 |
2823 | | |
2824 | 0 | __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { |
2825 | 0 | uint8_t more = denom->more; |
2826 | 0 | if (!denom->magic) { |
2827 | 0 | return _mm_srli_epi16(numers, more); |
2828 | 0 | } else { |
2829 | 0 | __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); |
2830 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
2831 | 0 | __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); |
2832 | 0 | return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); |
2833 | 0 | } else { |
2834 | 0 | return _mm_srli_epi16(q, more); |
2835 | 0 | } |
2836 | 0 | } |
2837 | 0 | } |
2838 | | |
2839 | | __m128i libdivide_u16_branchfree_do_vec128( |
2840 | 0 | __m128i numers, const struct libdivide_u16_branchfree_t *denom) { |
2841 | 0 | __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); |
2842 | 0 | __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); |
2843 | 0 | return _mm_srli_epi16(t, denom->more); |
2844 | 0 | } |
2845 | | |
2846 | | ////////// UINT32 |
2847 | | |
2848 | 0 | __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { |
2849 | 0 | uint8_t more = denom->more; |
2850 | 0 | if (!denom->magic) { |
2851 | 0 | return _mm_srli_epi32(numers, more); |
2852 | 0 | } else { |
2853 | 0 | __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); |
2854 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
2855 | | // uint32_t t = ((numer - q) >> 1) + q; |
2856 | | // return t >> denom->shift; |
2857 | 0 | uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2858 | 0 | __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); |
2859 | 0 | return _mm_srli_epi32(t, shift); |
2860 | 0 | } else { |
2861 | 0 | return _mm_srli_epi32(q, more); |
2862 | 0 | } |
2863 | 0 | } |
2864 | 0 | } |
2865 | | |
2866 | | __m128i libdivide_u32_branchfree_do_vec128( |
2867 | 0 | __m128i numers, const struct libdivide_u32_branchfree_t *denom) { |
2868 | 0 | __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic)); |
2869 | 0 | __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); |
2870 | 0 | return _mm_srli_epi32(t, denom->more); |
2871 | 0 | } |
2872 | | |
2873 | | ////////// UINT64 |
2874 | | |
2875 | 0 | __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) { |
2876 | 0 | uint8_t more = denom->more; |
2877 | 0 | if (!denom->magic) { |
2878 | 0 | return _mm_srli_epi64(numers, more); |
2879 | 0 | } else { |
2880 | 0 | __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); |
2881 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
2882 | 0 | // uint32_t t = ((numer - q) >> 1) + q; |
2883 | 0 | // return t >> denom->shift; |
2884 | 0 | uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
2885 | 0 | __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); |
2886 | 0 | return _mm_srli_epi64(t, shift); |
2887 | 0 | } else { |
2888 | 0 | return _mm_srli_epi64(q, more); |
2889 | 0 | } |
2890 | 0 | } |
2891 | 0 | } |
2892 | | |
2893 | | __m128i libdivide_u64_branchfree_do_vec128( |
2894 | 0 | __m128i numers, const struct libdivide_u64_branchfree_t *denom) { |
2895 | 0 | __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic)); |
2896 | 0 | __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); |
2897 | 0 | return _mm_srli_epi64(t, denom->more); |
2898 | 0 | } |
2899 | | |
2900 | | ////////// SINT16 |
2901 | | |
2902 | 0 | __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { |
2903 | 0 | uint8_t more = denom->more; |
2904 | 0 | if (!denom->magic) { |
2905 | 0 | uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
2906 | 0 | uint16_t mask = ((uint16_t)1 << shift) - 1; |
2907 | 0 | __m128i roundToZeroTweak = _mm_set1_epi16(mask); |
2908 | 0 | // q = numer + ((numer >> 15) & roundToZeroTweak); |
2909 | 0 | __m128i q = |
2910 | 0 | _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak)); |
2911 | 0 | q = _mm_srai_epi16(q, shift); |
2912 | 0 | __m128i sign = _mm_set1_epi16((int8_t)more >> 7); |
2913 | 0 | // q = (q ^ sign) - sign; |
2914 | 0 | q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); |
2915 | 0 | return q; |
2916 | 0 | } else { |
2917 | 0 | __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic)); |
2918 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
2919 | 0 | // must be arithmetic shift |
2920 | 0 | __m128i sign = _mm_set1_epi16((int8_t)more >> 7); |
2921 | 0 | // q += ((numer ^ sign) - sign); |
2922 | 0 | q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign)); |
2923 | 0 | } |
2924 | 0 | // q >>= shift |
2925 | 0 | q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); |
2926 | 0 | q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0) |
2927 | 0 | return q; |
2928 | 0 | } |
2929 | 0 | } |
2930 | | |
2931 | | __m128i libdivide_s16_branchfree_do_vec128( |
2932 | 0 | __m128i numers, const struct libdivide_s16_branchfree_t *denom) { |
2933 | 0 | int16_t magic = denom->magic; |
2934 | 0 | uint8_t more = denom->more; |
2935 | 0 | uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; |
2936 | 0 | // must be arithmetic shift |
2937 | 0 | __m128i sign = _mm_set1_epi16((int8_t)more >> 7); |
2938 | 0 | __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic)); |
2939 | 0 | q = _mm_add_epi16(q, numers); // q += numers |
2940 | 0 |
|
2941 | 0 | // If q is non-negative, we have nothing to do |
2942 | 0 | // If q is negative, we want to add either (2**shift)-1 if d is |
2943 | 0 | // a power of 2, or (2**shift) if it is not a power of 2 |
2944 | 0 | uint16_t is_power_of_2 = (magic == 0); |
2945 | 0 | __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15 |
2946 | 0 | __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); |
2947 | 0 | q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) |
2948 | 0 | q = _mm_srai_epi16(q, shift); // q >>= shift |
2949 | 0 | q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign |
2950 | 0 | return q; |
2951 | 0 | } |
2952 | | |
2953 | | ////////// SINT32 |
2954 | | |
2955 | 0 | __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { |
2956 | 0 | uint8_t more = denom->more; |
2957 | 0 | if (!denom->magic) { |
2958 | 0 | uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2959 | 0 | uint32_t mask = ((uint32_t)1 << shift) - 1; |
2960 | 0 | __m128i roundToZeroTweak = _mm_set1_epi32(mask); |
2961 | 0 | // q = numer + ((numer >> 31) & roundToZeroTweak); |
2962 | 0 | __m128i q = |
2963 | 0 | _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); |
2964 | 0 | q = _mm_srai_epi32(q, shift); |
2965 | 0 | __m128i sign = _mm_set1_epi32((int8_t)more >> 7); |
2966 | 0 | // q = (q ^ sign) - sign; |
2967 | 0 | q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); |
2968 | 0 | return q; |
2969 | 0 | } else { |
2970 | 0 | __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic)); |
2971 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
2972 | 0 | // must be arithmetic shift |
2973 | 0 | __m128i sign = _mm_set1_epi32((int8_t)more >> 7); |
2974 | 0 | // q += ((numer ^ sign) - sign); |
2975 | 0 | q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); |
2976 | 0 | } |
2977 | 0 | // q >>= shift |
2978 | 0 | q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); |
2979 | 0 | q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) |
2980 | 0 | return q; |
2981 | 0 | } |
2982 | 0 | } |
2983 | | |
2984 | | __m128i libdivide_s32_branchfree_do_vec128( |
2985 | 0 | __m128i numers, const struct libdivide_s32_branchfree_t *denom) { |
2986 | 0 | int32_t magic = denom->magic; |
2987 | 0 | uint8_t more = denom->more; |
2988 | 0 | uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; |
2989 | 0 | // must be arithmetic shift |
2990 | 0 | __m128i sign = _mm_set1_epi32((int8_t)more >> 7); |
2991 | 0 | __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic)); |
2992 | 0 | q = _mm_add_epi32(q, numers); // q += numers |
2993 | 0 |
|
2994 | 0 | // If q is non-negative, we have nothing to do |
2995 | 0 | // If q is negative, we want to add either (2**shift)-1 if d is |
2996 | 0 | // a power of 2, or (2**shift) if it is not a power of 2 |
2997 | 0 | uint32_t is_power_of_2 = (magic == 0); |
2998 | 0 | __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 |
2999 | 0 | __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); |
3000 | 0 | q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) |
3001 | 0 | q = _mm_srai_epi32(q, shift); // q >>= shift |
3002 | 0 | q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign |
3003 | 0 | return q; |
3004 | 0 | } |
3005 | | |
3006 | | ////////// SINT64 |
3007 | | |
3008 | 0 | __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) { |
3009 | 0 | uint8_t more = denom->more; |
3010 | 0 | int64_t magic = denom->magic; |
3011 | 0 | if (magic == 0) { // shift path |
3012 | 0 | uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
3013 | 0 | uint64_t mask = ((uint64_t)1 << shift) - 1; |
3014 | 0 | __m128i roundToZeroTweak = _mm_set1_epi64x(mask); |
3015 | 0 | // q = numer + ((numer >> 63) & roundToZeroTweak); |
3016 | 0 | __m128i q = _mm_add_epi64( |
3017 | 0 | numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); |
3018 | 0 | q = libdivide_s64_shift_right_vec128(q, shift); |
3019 | 0 | __m128i sign = _mm_set1_epi32((int8_t)more >> 7); |
3020 | 0 | // q = (q ^ sign) - sign; |
3021 | 0 | q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); |
3022 | 0 | return q; |
3023 | 0 | } else { |
3024 | 0 | __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); |
3025 | 0 | if (more & LIBDIVIDE_ADD_MARKER) { |
3026 | 0 | // must be arithmetic shift |
3027 | 0 | __m128i sign = _mm_set1_epi32((int8_t)more >> 7); |
3028 | 0 | // q += ((numer ^ sign) - sign); |
3029 | 0 | q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); |
3030 | 0 | } |
3031 | 0 | // q >>= denom->mult_path.shift |
3032 | 0 | q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK); |
3033 | 0 | q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) |
3034 | 0 | return q; |
3035 | 0 | } |
3036 | 0 | } |
3037 | | |
3038 | | __m128i libdivide_s64_branchfree_do_vec128( |
3039 | 0 | __m128i numers, const struct libdivide_s64_branchfree_t *denom) { |
3040 | 0 | int64_t magic = denom->magic; |
3041 | 0 | uint8_t more = denom->more; |
3042 | 0 | uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; |
3043 | 0 | // must be arithmetic shift |
3044 | 0 | __m128i sign = _mm_set1_epi32((int8_t)more >> 7); |
3045 | 0 |
|
3046 | 0 | // libdivide_mullhi_s64(numers, magic); |
3047 | 0 | __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic)); |
3048 | 0 | q = _mm_add_epi64(q, numers); // q += numers |
3049 | 0 |
|
3050 | 0 | // If q is non-negative, we have nothing to do. |
3051 | 0 | // If q is negative, we want to add either (2**shift)-1 if d is |
3052 | 0 | // a power of 2, or (2**shift) if it is not a power of 2. |
3053 | 0 | uint32_t is_power_of_2 = (magic == 0); |
3054 | 0 | __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 |
3055 | 0 | __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); |
3056 | 0 | q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) |
3057 | 0 | q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift |
3058 | 0 | q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign |
3059 | 0 | return q; |
3060 | 0 | } |
3061 | | |
3062 | | #endif |
3063 | | |
3064 | | ////////// C++ stuff |
3065 | | |
3066 | | #ifdef __cplusplus |
3067 | | |
3068 | | enum Branching { |
3069 | | BRANCHFULL, // use branching algorithms |
3070 | | BRANCHFREE // use branchfree algorithms |
3071 | | }; |
3072 | | |
3073 | | namespace detail { |
3074 | | enum Signedness { |
3075 | | SIGNED, |
3076 | | UNSIGNED, |
3077 | | }; |
3078 | | |
3079 | | #if defined(LIBDIVIDE_NEON) |
3080 | | // Helper to deduce NEON vector type for integral type. |
3081 | | template <int _WIDTH, Signedness _SIGN> |
3082 | | struct NeonVec {}; |
3083 | | |
3084 | | template <> |
3085 | | struct NeonVec<16, UNSIGNED> { |
3086 | | typedef uint16x8_t type; |
3087 | | }; |
3088 | | |
3089 | | template <> |
3090 | | struct NeonVec<16, SIGNED> { |
3091 | | typedef int16x8_t type; |
3092 | | }; |
3093 | | |
3094 | | template <> |
3095 | | struct NeonVec<32, UNSIGNED> { |
3096 | | typedef uint32x4_t type; |
3097 | | }; |
3098 | | |
3099 | | template <> |
3100 | | struct NeonVec<32, SIGNED> { |
3101 | | typedef int32x4_t type; |
3102 | | }; |
3103 | | |
3104 | | template <> |
3105 | | struct NeonVec<64, UNSIGNED> { |
3106 | | typedef uint64x2_t type; |
3107 | | }; |
3108 | | |
3109 | | template <> |
3110 | | struct NeonVec<64, SIGNED> { |
3111 | | typedef int64x2_t type; |
3112 | | }; |
3113 | | |
3114 | | template <typename T> |
3115 | | struct NeonVecFor { |
3116 | | // See 'class divider' for an explanation of these template parameters. |
3117 | | typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type; |
3118 | | }; |
3119 | | |
3120 | | #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ |
3121 | | LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \ |
3122 | | typename NeonVecFor<INT_TYPE>::type n) const { \ |
3123 | | return libdivide_##ALGO##_do_vec128(n, &denom); \ |
3124 | | } |
3125 | | #else |
3126 | | #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) |
3127 | | #endif |
3128 | | |
3129 | | #if defined(LIBDIVIDE_SSE2) |
3130 | | #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ |
3131 | 0 | LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ |
3132 | 0 | return libdivide_##ALGO##_do_vec128(n, &denom); \ |
3133 | 0 | } Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::divide(long long __vector(2)) const |
3134 | | #else |
3135 | | #define LIBDIVIDE_DIVIDE_SSE2(ALGO) |
3136 | | #endif |
3137 | | |
3138 | | #if defined(LIBDIVIDE_AVX2) |
3139 | | #define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ |
3140 | | LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ |
3141 | | return libdivide_##ALGO##_do_vec256(n, &denom); \ |
3142 | | } |
3143 | | #else |
3144 | | #define LIBDIVIDE_DIVIDE_AVX2(ALGO) |
3145 | | #endif |
3146 | | |
3147 | | #if defined(LIBDIVIDE_AVX512) |
3148 | | #define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ |
3149 | | LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ |
3150 | | return libdivide_##ALGO##_do_vec512(n, &denom); \ |
3151 | | } |
3152 | | #else |
3153 | | #define LIBDIVIDE_DIVIDE_AVX512(ALGO) |
3154 | | #endif |
3155 | | |
3156 | | // The DISPATCHER_GEN() macro generates C++ methods (for the given integer |
3157 | | // and algorithm types) that redirect to libdivide's C API. |
3158 | | #define DISPATCHER_GEN(T, ALGO) \ |
3159 | | libdivide_##ALGO##_t denom; \ |
3160 | 0 | LIBDIVIDE_INLINE dispatcher() {} \ Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher() Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher() |
3161 | 0 | explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \ Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher(decltype(nullptr)) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher(decltype(nullptr)) |
3162 | 0 | LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher(unsigned short) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher(unsigned int) Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher(short) Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher(short) Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher(unsigned short) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher(int) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher(int) Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher(unsigned int) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::dispatcher(long) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::dispatcher(long) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::dispatcher(unsigned long) Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::dispatcher(unsigned long) |
3163 | 0 | LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::divide(short) const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::divide(short) const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::divide(unsigned short) const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::divide(unsigned short) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::divide(int) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::divide(int) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::divide(unsigned int) const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::divide(unsigned int) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::divide(long) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::divide(long) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::divide(unsigned long) const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::divide(unsigned long) const |
3164 | 0 | LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<16, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<32, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)0>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)0, (libdivide::Branching)1>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)0>::recover() const Unexecuted instantiation: libdivide::detail::dispatcher<64, (libdivide::detail::Signedness)1, (libdivide::Branching)1>::recover() const |
3165 | | LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ |
3166 | | LIBDIVIDE_DIVIDE_SSE2(ALGO) \ |
3167 | | LIBDIVIDE_DIVIDE_AVX2(ALGO) \ |
3168 | | LIBDIVIDE_DIVIDE_AVX512(ALGO) |
3169 | | |
3170 | | // The dispatcher selects a specific division algorithm for a given |
3171 | | // width, signedness, and ALGO using partial template specialization. |
3172 | | template <int _WIDTH, Signedness _SIGN, Branching _ALGO> |
3173 | | struct dispatcher {}; |
3174 | | |
3175 | | template <> |
3176 | | struct dispatcher<16, SIGNED, BRANCHFULL> { |
3177 | | DISPATCHER_GEN(int16_t, s16) |
3178 | | }; |
3179 | | template <> |
3180 | | struct dispatcher<16, SIGNED, BRANCHFREE> { |
3181 | | DISPATCHER_GEN(int16_t, s16_branchfree) |
3182 | | }; |
3183 | | template <> |
3184 | | struct dispatcher<16, UNSIGNED, BRANCHFULL> { |
3185 | | DISPATCHER_GEN(uint16_t, u16) |
3186 | | }; |
3187 | | template <> |
3188 | | struct dispatcher<16, UNSIGNED, BRANCHFREE> { |
3189 | | DISPATCHER_GEN(uint16_t, u16_branchfree) |
3190 | | }; |
3191 | | template <> |
3192 | | struct dispatcher<32, SIGNED, BRANCHFULL> { |
3193 | | DISPATCHER_GEN(int32_t, s32) |
3194 | | }; |
3195 | | template <> |
3196 | | struct dispatcher<32, SIGNED, BRANCHFREE> { |
3197 | | DISPATCHER_GEN(int32_t, s32_branchfree) |
3198 | | }; |
3199 | | template <> |
3200 | | struct dispatcher<32, UNSIGNED, BRANCHFULL> { |
3201 | | DISPATCHER_GEN(uint32_t, u32) |
3202 | | }; |
3203 | | template <> |
3204 | | struct dispatcher<32, UNSIGNED, BRANCHFREE> { |
3205 | | DISPATCHER_GEN(uint32_t, u32_branchfree) |
3206 | | }; |
3207 | | template <> |
3208 | | struct dispatcher<64, SIGNED, BRANCHFULL> { |
3209 | | DISPATCHER_GEN(int64_t, s64) |
3210 | | }; |
3211 | | template <> |
3212 | | struct dispatcher<64, SIGNED, BRANCHFREE> { |
3213 | | DISPATCHER_GEN(int64_t, s64_branchfree) |
3214 | | }; |
3215 | | template <> |
3216 | | struct dispatcher<64, UNSIGNED, BRANCHFULL> { |
3217 | | DISPATCHER_GEN(uint64_t, u64) |
3218 | | }; |
3219 | | template <> |
3220 | | struct dispatcher<64, UNSIGNED, BRANCHFREE> { |
3221 | | DISPATCHER_GEN(uint64_t, u64_branchfree) |
3222 | | }; |
3223 | | } // namespace detail |
3224 | | |
3225 | | #if defined(LIBDIVIDE_NEON) |
3226 | | // Allow NeonVecFor outside of detail namespace. |
3227 | | template <typename T> |
3228 | | struct NeonVecFor { |
3229 | | typedef typename detail::NeonVecFor<T>::type type; |
3230 | | }; |
3231 | | #endif |
3232 | | |
3233 | | // This is the main divider class for use by the user (C++ API). |
3234 | | // The actual division algorithm is selected using the dispatcher struct |
3235 | | // based on the integer width and algorithm template parameters. |
3236 | | template <typename T, Branching ALGO = BRANCHFULL> |
3237 | | class divider { |
3238 | | private: |
3239 | | // Dispatch based on the size and signedness. |
3240 | | // We avoid using type_traits as it's not available in AVR. |
3241 | | // Detect signedness by checking if T(-1) is less than T(0). |
3242 | | // Also throw in a shift by 0, which prevents floating point types from being passed. |
3243 | | typedef detail::dispatcher<sizeof(T) * 8, |
3244 | | (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO> |
3245 | | dispatcher_t; |
3246 | | |
3247 | | public: |
3248 | | // We leave the default constructor empty so that creating |
3249 | | // an array of dividers and then initializing them |
3250 | | // later doesn't slow us down. |
3251 | | divider() {} |
3252 | | |
3253 | | // constexpr zero-initialization to allow for use w/ static constinit |
3254 | | explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {} |
3255 | | |
3256 | | // Constructor that takes the divisor as a parameter |
3257 | 0 | LIBDIVIDE_INLINE divider(T d) : div(d) {} Unexecuted instantiation: libdivide::divider<unsigned short, (libdivide::Branching)0>::divider(unsigned short) Unexecuted instantiation: libdivide::divider<unsigned int, (libdivide::Branching)0>::divider(unsigned int) |
3258 | | |
3259 | | // Divides n by the divisor |
3260 | | LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } |
3261 | | |
3262 | | // Recovers the divisor, returns the value that was |
3263 | | // used to initialize this divider object. |
3264 | | T recover() const { return div.recover(); } |
3265 | | |
3266 | | bool operator==(const divider<T, ALGO> &other) const { |
3267 | | return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more; |
3268 | | } |
3269 | | |
3270 | | bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); } |
3271 | | |
3272 | | // Vector variants treat the input as packed integer values with the same type as the divider |
3273 | | // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed |
3274 | | // quotients. |
3275 | | #if defined(LIBDIVIDE_SSE2) |
3276 | 0 | LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } Unexecuted instantiation: libdivide::divider<unsigned short, (libdivide::Branching)0>::divide(long long __vector(2)) const Unexecuted instantiation: libdivide::divider<unsigned int, (libdivide::Branching)0>::divide(long long __vector(2)) const |
3277 | | #endif |
3278 | | #if defined(LIBDIVIDE_AVX2) |
3279 | | LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } |
3280 | | #endif |
3281 | | #if defined(LIBDIVIDE_AVX512) |
3282 | | LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } |
3283 | | #endif |
3284 | | #if defined(LIBDIVIDE_NEON) |
3285 | | LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { |
3286 | | return div.divide(n); |
3287 | | } |
3288 | | #endif |
3289 | | |
3290 | | private: |
3291 | | // Storage for the actual divisor |
3292 | | dispatcher_t div; |
3293 | | }; |
3294 | | |
3295 | | // Overload of operator / for scalar division |
3296 | | template <typename T, Branching ALGO> |
3297 | | LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) { |
3298 | | return div.divide(n); |
3299 | | } |
3300 | | |
3301 | | // Overload of operator /= for scalar division |
3302 | | template <typename T, Branching ALGO> |
3303 | | LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) { |
3304 | | n = div.divide(n); |
3305 | | return n; |
3306 | | } |
3307 | | |
3308 | | // Overloads for vector types. |
3309 | | #if defined(LIBDIVIDE_SSE2) |
3310 | | template <typename T, Branching ALGO> |
3311 | | LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) { |
3312 | | return div.divide(n); |
3313 | | } |
3314 | | |
3315 | | template <typename T, Branching ALGO> |
3316 | 0 | LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { |
3317 | 0 | n = div.divide(n); |
3318 | 0 | return n; |
3319 | 0 | } Unexecuted instantiation: long long __vector(2) libdivide::operator/=<unsigned short, (libdivide::Branching)0>(long long __vector(2)&, libdivide::divider<unsigned short, (libdivide::Branching)0> const&) Unexecuted instantiation: long long __vector(2) libdivide::operator/=<unsigned int, (libdivide::Branching)0>(long long __vector(2)&, libdivide::divider<unsigned int, (libdivide::Branching)0> const&) |
3320 | | #endif |
3321 | | #if defined(LIBDIVIDE_AVX2) |
3322 | | template <typename T, Branching ALGO> |
3323 | | LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) { |
3324 | | return div.divide(n); |
3325 | | } |
3326 | | |
3327 | | template <typename T, Branching ALGO> |
3328 | | LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { |
3329 | | n = div.divide(n); |
3330 | | return n; |
3331 | | } |
3332 | | #endif |
3333 | | #if defined(LIBDIVIDE_AVX512) |
3334 | | template <typename T, Branching ALGO> |
3335 | | LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) { |
3336 | | return div.divide(n); |
3337 | | } |
3338 | | |
3339 | | template <typename T, Branching ALGO> |
3340 | | LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { |
3341 | | n = div.divide(n); |
3342 | | return n; |
3343 | | } |
3344 | | #endif |
3345 | | |
3346 | | #if defined(LIBDIVIDE_NEON) |
3347 | | template <typename T, Branching ALGO> |
3348 | | LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/( |
3349 | | typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) { |
3350 | | return div.divide(n); |
3351 | | } |
3352 | | |
3353 | | template <typename T, Branching ALGO> |
3354 | | LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=( |
3355 | | typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) { |
3356 | | n = div.divide(n); |
3357 | | return n; |
3358 | | } |
3359 | | #endif |
3360 | | |
3361 | | #if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) |
3362 | | // libdivide::branchfree_divider<T> |
3363 | | template <typename T> |
3364 | | using branchfree_divider = divider<T, BRANCHFREE>; |
3365 | | #endif |
3366 | | |
3367 | | } // namespace libdivide |
3368 | | |
3369 | | #endif // __cplusplus |
3370 | | |
3371 | | #if defined(_MSC_VER) && !defined(__clang__) |
3372 | | #pragma warning(pop) |
3373 | | #endif |
3374 | | |
3375 | | #endif // LIBDIVIDE_H |