/src/postgis/deps/ryu/d2s_intrinsics.h
Line | Count | Source |
1 | | // Copyright 2018 Ulf Adams |
2 | | // |
3 | | // The contents of this file may be used under the terms of the Apache License, |
4 | | // Version 2.0. |
5 | | // |
6 | | // (See accompanying file LICENSE-Apache or copy at |
7 | | // http://www.apache.org/licenses/LICENSE-2.0) |
8 | | // |
9 | | // Alternatively, the contents of this file may be used under the terms of |
10 | | // the Boost Software License, Version 1.0. |
11 | | // (See accompanying file LICENSE-Boost or copy at |
12 | | // https://www.boost.org/LICENSE_1_0.txt) |
13 | | // |
14 | | // Unless required by applicable law or agreed to in writing, this software |
15 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
16 | | // KIND, either express or implied. |
17 | | #ifndef RYU_D2S_INTRINSICS_H |
18 | | #define RYU_D2S_INTRINSICS_H |
19 | | |
20 | | #include <assert.h> |
21 | | #include <stdint.h> |
22 | | |
23 | | // Defines RYU_32_BIT_PLATFORM if applicable. |
24 | | #include "ryu/common.h" |
25 | | |
26 | | // ABSL avoids uint128_t on Win32 even if __SIZEOF_INT128__ is defined. |
27 | | // Let's do the same for now. |
28 | | #if defined(__SIZEOF_INT128__) && !defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) |
29 | | #define HAS_UINT128 |
30 | | #elif defined(_MSC_VER) && !defined(RYU_ONLY_64_BIT_OPS) && defined(_M_X64) |
31 | | #define HAS_64_BIT_INTRINSICS |
32 | | #endif |
33 | | |
34 | | #if defined(HAS_UINT128) |
35 | | typedef __uint128_t uint128_t; |
36 | | #endif |
37 | | |
38 | | #if defined(HAS_64_BIT_INTRINSICS) |
39 | | |
40 | | #include <intrin.h> |
41 | | |
42 | | static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { |
43 | | return _umul128(a, b, productHi); |
44 | | } |
45 | | |
46 | | static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { |
47 | | // For the __shiftright128 intrinsic, the shift value is always |
48 | | // modulo 64. |
49 | | // In the current implementation of the double-precision version |
50 | | // of Ryu, the shift value is always < 64. (In the case |
51 | | // RYU_OPTIMIZE_SIZE == 0, the shift value is in the range [49, 58]. |
52 | | // Otherwise in the range [2, 59].) |
53 | | // Check this here in case a future change requires larger shift |
54 | | // values. In this case this function needs to be adjusted. |
55 | | assert(dist < 64); |
56 | | return __shiftright128(lo, hi, (unsigned char) dist); |
57 | | } |
58 | | |
59 | | #else // defined(HAS_64_BIT_INTRINSICS) |
60 | | |
61 | 0 | static inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { |
62 | 0 | // The casts here help MSVC to avoid calls to the __allmul library function. |
63 | 0 | const uint32_t aLo = (uint32_t)a; |
64 | 0 | const uint32_t aHi = (uint32_t)(a >> 32); |
65 | 0 | const uint32_t bLo = (uint32_t)b; |
66 | 0 | const uint32_t bHi = (uint32_t)(b >> 32); |
67 | 0 |
|
68 | 0 | const uint64_t b00 = (uint64_t)aLo * bLo; |
69 | 0 | const uint64_t b01 = (uint64_t)aLo * bHi; |
70 | 0 | const uint64_t b10 = (uint64_t)aHi * bLo; |
71 | 0 | const uint64_t b11 = (uint64_t)aHi * bHi; |
72 | 0 |
|
73 | 0 | const uint32_t b00Lo = (uint32_t)b00; |
74 | 0 | const uint32_t b00Hi = (uint32_t)(b00 >> 32); |
75 | 0 |
|
76 | 0 | const uint64_t mid1 = b10 + b00Hi; |
77 | 0 | const uint32_t mid1Lo = (uint32_t)(mid1); |
78 | 0 | const uint32_t mid1Hi = (uint32_t)(mid1 >> 32); |
79 | 0 |
|
80 | 0 | const uint64_t mid2 = b01 + mid1Lo; |
81 | 0 | const uint32_t mid2Lo = (uint32_t)(mid2); |
82 | 0 | const uint32_t mid2Hi = (uint32_t)(mid2 >> 32); |
83 | 0 |
|
84 | 0 | const uint64_t pHi = b11 + mid1Hi + mid2Hi; |
85 | 0 | const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo; |
86 | 0 |
|
87 | 0 | *productHi = pHi; |
88 | 0 | return pLo; |
89 | 0 | } |
90 | | |
91 | 0 | static inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { |
92 | 0 | // We don't need to handle the case dist >= 64 here (see above). |
93 | 0 | assert(dist < 64); |
94 | 0 | #if defined(RYU_OPTIMIZE_SIZE) || !defined(RYU_32_BIT_PLATFORM) |
95 | 0 | assert(dist > 0); |
96 | 0 | return (hi << (64 - dist)) | (lo >> dist); |
97 | 0 | #else |
98 | 0 | // Avoid a 64-bit shift by taking advantage of the range of shift values. |
99 | 0 | assert(dist >= 32); |
100 | 0 | return (hi << (64 - dist)) | ((uint32_t)(lo >> 32) >> (dist - 32)); |
101 | 0 | #endif |
102 | 0 | } |
103 | | |
104 | | #endif // defined(HAS_64_BIT_INTRINSICS) |
105 | | |
106 | | #if defined(RYU_32_BIT_PLATFORM) |
107 | | |
108 | | // Returns the high 64 bits of the 128-bit product of a and b. |
109 | | static inline uint64_t umulh(const uint64_t a, const uint64_t b) { |
110 | | // Reuse the umul128 implementation. |
111 | | // Optimizers will likely eliminate the instructions used to compute the |
112 | | // low part of the product. |
113 | | uint64_t hi; |
114 | | umul128(a, b, &hi); |
115 | | return hi; |
116 | | } |
117 | | |
118 | | // On 32-bit platforms, compilers typically generate calls to library |
119 | | // functions for 64-bit divisions, even if the divisor is a constant. |
120 | | // |
121 | | // E.g.: |
122 | | // https://bugs.llvm.org/show_bug.cgi?id=37932 |
123 | | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=17958 |
124 | | // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=37443 |
125 | | // |
126 | | // The functions here perform division-by-constant using multiplications |
127 | | // in the same way as 64-bit compilers would do. |
128 | | // |
129 | | // NB: |
130 | | // The multipliers and shift values are the ones generated by clang x64 |
131 | | // for expressions like x/5, x/10, etc. |
132 | | |
133 | | static inline uint64_t div5(const uint64_t x) { |
134 | | return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 2; |
135 | | } |
136 | | |
137 | | static inline uint64_t div10(const uint64_t x) { |
138 | | return umulh(x, 0xCCCCCCCCCCCCCCCDu) >> 3; |
139 | | } |
140 | | |
141 | | static inline uint64_t div100(const uint64_t x) { |
142 | | return umulh(x >> 2, 0x28F5C28F5C28F5C3u) >> 2; |
143 | | } |
144 | | |
145 | | static inline uint64_t div1e8(const uint64_t x) { |
146 | | return umulh(x, 0xABCC77118461CEFDu) >> 26; |
147 | | } |
148 | | |
149 | | static inline uint64_t div1e9(const uint64_t x) { |
150 | | return umulh(x >> 9, 0x44B82FA09B5A53u) >> 11; |
151 | | } |
152 | | |
153 | | static inline uint32_t mod1e9(const uint64_t x) { |
154 | | // Avoid 64-bit math as much as possible. |
155 | | // Returning (uint32_t) (x - 1000000000 * div1e9(x)) would |
156 | | // perform 32x64-bit multiplication and 64-bit subtraction. |
157 | | // x and 1000000000 * div1e9(x) are guaranteed to differ by |
158 | | // less than 10^9, so their highest 32 bits must be identical, |
159 | | // so we can truncate both sides to uint32_t before subtracting. |
160 | | // We can also simplify (uint32_t) (1000000000 * div1e9(x)). |
161 | | // We can truncate before multiplying instead of after, as multiplying |
162 | | // the highest 32 bits of div1e9(x) can't affect the lowest 32 bits. |
163 | | return ((uint32_t) x) - 1000000000 * ((uint32_t) div1e9(x)); |
164 | | } |
165 | | |
166 | | #else // defined(RYU_32_BIT_PLATFORM) |
167 | | |
168 | 0 | static inline uint64_t div5(const uint64_t x) { |
169 | 0 | return x / 5; |
170 | 0 | } |
171 | | |
172 | 0 | static inline uint64_t div10(const uint64_t x) { |
173 | 0 | return x / 10; |
174 | 0 | } |
175 | | |
176 | 0 | static inline uint64_t div100(const uint64_t x) { |
177 | 0 | return x / 100; |
178 | 0 | } |
179 | | |
180 | 0 | static inline uint64_t div1e8(const uint64_t x) { |
181 | 0 | return x / 100000000; |
182 | 0 | } |
183 | | |
184 | 0 | static inline uint64_t div1e9(const uint64_t x) { |
185 | 0 | return x / 1000000000; |
186 | 0 | } |
187 | | |
188 | 0 | static inline uint32_t mod1e9(const uint64_t x) { |
189 | 0 | return (uint32_t) (x - 1000000000 * div1e9(x)); |
190 | 0 | } |
191 | | |
192 | | #endif // defined(RYU_32_BIT_PLATFORM) |
193 | | |
194 | 0 | static inline uint32_t pow5Factor(uint64_t value) { |
195 | 0 | uint32_t count = 0; |
196 | 0 | for (;;) { |
197 | 0 | assert(value != 0); |
198 | 0 | const uint64_t q = div5(value); |
199 | 0 | const uint32_t r = ((uint32_t) value) - 5 * ((uint32_t) q); |
200 | 0 | if (r != 0) { |
201 | 0 | break; |
202 | 0 | } |
203 | 0 | value = q; |
204 | 0 | ++count; |
205 | 0 | } |
206 | 0 | return count; |
207 | 0 | } |
208 | | |
209 | | // Returns true if value is divisible by 5^p. |
210 | 0 | static inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) { |
211 | | // I tried a case distinction on p, but there was no performance difference. |
212 | 0 | return pow5Factor(value) >= p; |
213 | 0 | } |
214 | | |
215 | | // Returns true if value is divisible by 2^p. |
216 | 0 | static inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) { |
217 | 0 | assert(value != 0); |
218 | | // return __builtin_ctzll(value) >= p; |
219 | 0 | return (value & ((1ull << p) - 1)) == 0; |
220 | 0 | } |
221 | | |
222 | | #endif // RYU_D2S_INTRINSICS_H |