/src/node/deps/nbytes/include/nbytes.h
Line | Count | Source |
1 | | #ifndef NBYTES_H |
2 | | #define NBYTES_H |
3 | | #include <algorithm> |
4 | | #include <cmath> |
5 | | #include <cstddef> |
6 | | #include <cstdint> |
7 | | #include <cstring> |
8 | | #include <string> |
9 | | |
10 | | namespace nbytes { |
11 | | |
12 | | #if NBYTES_DEVELOPMENT_CHECKS |
13 | | #define NBYTES_STR(x) #x |
14 | | #define NBYTES_REQUIRE(EXPR) \ |
15 | | { \ |
16 | | if (!(EXPR) { abort(); }) } |
17 | | |
18 | | #define NBYTES_FAIL(MESSAGE) \ |
19 | | do { \ |
20 | | std::cerr << "FAIL: " << (MESSAGE) << std::endl; \ |
21 | | abort(); \ |
22 | | } while (0); |
23 | | #define NBYTES_ASSERT_EQUAL(LHS, RHS, MESSAGE) \ |
24 | | do { \ |
25 | | if (LHS != RHS) { \ |
26 | | std::cerr << "Mismatch: '" << LHS << "' - '" << RHS << "'" << std::endl; \ |
27 | | NBYTES_FAIL(MESSAGE); \ |
28 | | } \ |
29 | | } while (0); |
30 | | #define NBYTES_ASSERT_TRUE(COND) \ |
31 | | do { \ |
32 | | if (!(COND)) { \ |
33 | | std::cerr << "Assert at line " << __LINE__ << " of file " << __FILE__ \ |
34 | | << std::endl; \ |
35 | | NBYTES_FAIL(NBYTES_STR(COND)); \ |
36 | | } \ |
37 | | } while (0); |
38 | | #else |
39 | | #define NBYTES_FAIL(MESSAGE) |
40 | | #define NBYTES_ASSERT_EQUAL(LHS, RHS, MESSAGE) |
41 | | #define NBYTES_ASSERT_TRUE(COND) |
42 | | #endif |
43 | | |
44 | 0 | [[noreturn]] inline void unreachable() { |
45 | 0 | #ifdef __GNUC__ |
46 | 0 | __builtin_unreachable(); |
47 | | #elif defined(_MSC_VER) |
48 | | __assume(false); |
49 | | #else |
50 | | #endif |
51 | 0 | } |
52 | | |
53 | | // The nbytes (short for "node bytes") is a set of utility helpers for |
54 | | // working with bytes that are extracted from Node.js' internals. The |
55 | | // motivation for extracting these into a separate library is to make it |
56 | | // easier for other projects to implement functionality that is compatible |
57 | | // with Node.js' implementation of various byte manipulation functions. |
58 | | |
59 | | // Round up a to the next highest multiple of b. |
60 | | template <typename T> |
61 | 0 | constexpr T RoundUp(T a, T b) { |
62 | 0 | return a % b != 0 ? a + b - (a % b) : a; |
63 | 0 | } |
64 | | |
65 | | // Align ptr to an `alignment`-bytes boundary. |
66 | | template <typename T, typename U> |
67 | 0 | constexpr T *AlignUp(T *ptr, U alignment) { |
68 | 0 | return reinterpret_cast<T *>( |
69 | 0 | RoundUp(reinterpret_cast<uintptr_t>(ptr), alignment)); |
70 | 0 | } Unexecuted instantiation: char* nbytes::AlignUp<char, unsigned long>(char*, unsigned long) Unexecuted instantiation: unsigned short* nbytes::AlignUp<unsigned short, unsigned long>(unsigned short*, unsigned long) |
71 | | |
72 | | template <typename T, typename U> |
73 | 0 | inline T AlignDown(T value, U alignment) { |
74 | 0 | return reinterpret_cast<T>( |
75 | 0 | (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
76 | 0 | } |
77 | | |
78 | | template <typename T> |
79 | | inline T MultiplyWithOverflowCheck(T a, T b) { |
80 | | auto ret = a * b; |
81 | | if (a != 0) { |
82 | | NBYTES_ASSERT_TRUE(b == ret / a); |
83 | | } |
84 | | |
85 | | return ret; |
86 | | } |
87 | | |
88 | | void ForceAsciiSlow(const char *src, char *dst, size_t len); |
89 | | void ForceAscii(const char *src, char *dst, size_t len); |
90 | | |
91 | | // ============================================================================ |
92 | | // Byte Swapping |
93 | | |
94 | | // Swaps bytes in place. nbytes is the number of bytes to swap and must be a |
95 | | // multiple of the word size (checked by function). |
96 | | bool SwapBytes16(char *data, size_t nbytes); |
97 | | bool SwapBytes32(char *data, size_t nbytes); |
98 | | bool SwapBytes64(char *data, size_t nbytes); |
99 | | |
100 | | // ============================================================================ |
101 | | // Base64 (legacy) |
102 | | |
103 | | #ifdef _MSC_VER |
104 | | #pragma warning(push) |
105 | | // MSVC C4003: not enough actual parameters for macro 'identifier' |
106 | | #pragma warning(disable : 4003) |
107 | | #endif |
108 | | |
109 | | extern const int8_t unbase64_table[256]; |
110 | | |
111 | | template <typename TypeName> |
112 | | bool Base64DecodeGroupSlow(char *const dst, const size_t dstlen, |
113 | | const TypeName *const src, const size_t srclen, |
114 | 0 | size_t *const i, size_t *const k) { |
115 | 0 | uint8_t hi; |
116 | 0 | uint8_t lo; |
117 | 0 | #define V(expr) \ |
118 | 0 | for (;;) { \ |
119 | 0 | const uint8_t c = static_cast<uint8_t>(src[*i]); \ |
120 | 0 | lo = unbase64_table[c]; \ |
121 | 0 | *i += 1; \ |
122 | 0 | if (lo < 64) break; /* Legal character. */ \ |
123 | 0 | if (c == '=' || *i >= srclen) return false; /* Stop decoding. */ \ |
124 | 0 | } \ |
125 | 0 | expr; \ |
126 | 0 | if (*i >= srclen) return false; \ |
127 | 0 | if (*k >= dstlen) return false; \ |
128 | 0 | hi = lo; |
129 | 0 | V(/* Nothing. */); |
130 | 0 | V(dst[(*k)++] = ((hi & 0x3F) << 2) | ((lo & 0x30) >> 4)); |
131 | 0 | V(dst[(*k)++] = ((hi & 0x0F) << 4) | ((lo & 0x3C) >> 2)); |
132 | 0 | V(dst[(*k)++] = ((hi & 0x03) << 6) | ((lo & 0x3F) >> 0)); |
133 | 0 | #undef V |
134 | 0 | return true; // Continue decoding. |
135 | 0 | } Unexecuted instantiation: bool nbytes::Base64DecodeGroupSlow<char>(char*, unsigned long, char const*, unsigned long, unsigned long*, unsigned long*) Unexecuted instantiation: bool nbytes::Base64DecodeGroupSlow<unsigned short>(char*, unsigned long, unsigned short const*, unsigned long, unsigned long*, unsigned long*) |
136 | | |
137 | | enum class Base64Mode { NORMAL, URL }; |
138 | | |
139 | | inline constexpr size_t Base64EncodedSize( |
140 | 0 | size_t size, Base64Mode mode = Base64Mode::NORMAL) { |
141 | 0 | return mode == Base64Mode::NORMAL ? ((size + 2) / 3 * 4) |
142 | 0 | : static_cast<size_t>(std::ceil( |
143 | 0 | static_cast<double>(size * 4) / 3)); |
144 | 0 | } |
145 | | |
146 | | // Doesn't check for padding at the end. Can be 1-2 bytes over. |
147 | 0 | inline constexpr size_t Base64DecodedSizeFast(size_t size) { |
148 | | // 1-byte input cannot be decoded |
149 | 0 | return size > 1 ? (size / 4) * 3 + (size % 4 + 1) / 2 : 0; |
150 | 0 | } |
151 | | |
152 | 0 | inline uint32_t ReadUint32BE(const unsigned char *p) { |
153 | 0 | return static_cast<uint32_t>(p[0] << 24U) | |
154 | 0 | static_cast<uint32_t>(p[1] << 16U) | |
155 | 0 | static_cast<uint32_t>(p[2] << 8U) | static_cast<uint32_t>(p[3]); |
156 | 0 | } |
157 | | |
158 | | template <typename TypeName> |
159 | 0 | size_t Base64DecodedSize(const TypeName *src, size_t size) { |
160 | | // 1-byte input cannot be decoded |
161 | 0 | if (size < 2) return 0; |
162 | | |
163 | 0 | if (src[size - 1] == '=') { |
164 | 0 | size--; |
165 | 0 | if (src[size - 1] == '=') size--; |
166 | 0 | } |
167 | 0 | return Base64DecodedSizeFast(size); |
168 | 0 | } Unexecuted instantiation: unsigned long nbytes::Base64DecodedSize<char>(char const*, unsigned long) Unexecuted instantiation: unsigned long nbytes::Base64DecodedSize<unsigned short>(unsigned short const*, unsigned long) |
169 | | |
170 | | template <typename TypeName> |
171 | | size_t Base64DecodeFast(char *const dst, const size_t dstlen, |
172 | | const TypeName *const src, const size_t srclen, |
173 | 0 | const size_t decoded_size) { |
174 | 0 | const size_t available = dstlen < decoded_size ? dstlen : decoded_size; |
175 | 0 | const size_t max_k = available / 3 * 3; |
176 | 0 | size_t max_i = srclen / 4 * 4; |
177 | 0 | size_t i = 0; |
178 | 0 | size_t k = 0; |
179 | 0 | while (i < max_i && k < max_k) { |
180 | 0 | const unsigned char txt[] = { |
181 | 0 | static_cast<unsigned char>( |
182 | 0 | unbase64_table[static_cast<uint8_t>(src[i + 0])]), |
183 | 0 | static_cast<unsigned char>( |
184 | 0 | unbase64_table[static_cast<uint8_t>(src[i + 1])]), |
185 | 0 | static_cast<unsigned char>( |
186 | 0 | unbase64_table[static_cast<uint8_t>(src[i + 2])]), |
187 | 0 | static_cast<unsigned char>( |
188 | 0 | unbase64_table[static_cast<uint8_t>(src[i + 3])]), |
189 | 0 | }; |
190 | |
|
191 | 0 | const uint32_t v = ReadUint32BE(txt); |
192 | | // If MSB is set, input contains whitespace or is not valid base64. |
193 | 0 | if (v & 0x80808080) { |
194 | 0 | if (!Base64DecodeGroupSlow(dst, dstlen, src, srclen, &i, &k)) return k; |
195 | 0 | max_i = i + (srclen - i) / 4 * 4; // Align max_i again. |
196 | 0 | } else { |
197 | 0 | dst[k + 0] = ((v >> 22) & 0xFC) | ((v >> 20) & 0x03); |
198 | 0 | dst[k + 1] = ((v >> 12) & 0xF0) | ((v >> 10) & 0x0F); |
199 | 0 | dst[k + 2] = ((v >> 2) & 0xC0) | ((v >> 0) & 0x3F); |
200 | 0 | i += 4; |
201 | 0 | k += 3; |
202 | 0 | } |
203 | 0 | } |
204 | 0 | if (i < srclen && k < dstlen) { |
205 | 0 | Base64DecodeGroupSlow(dst, dstlen, src, srclen, &i, &k); |
206 | 0 | } |
207 | 0 | return k; |
208 | 0 | } Unexecuted instantiation: unsigned long nbytes::Base64DecodeFast<char>(char*, unsigned long, char const*, unsigned long, unsigned long) Unexecuted instantiation: unsigned long nbytes::Base64DecodeFast<unsigned short>(char*, unsigned long, unsigned short const*, unsigned long, unsigned long) |
209 | | |
210 | | template <typename TypeName> |
211 | | size_t Base64Decode(char *const dst, const size_t dstlen, |
212 | 0 | const TypeName *const src, const size_t srclen) { |
213 | 0 | const size_t decoded_size = Base64DecodedSize(src, srclen); |
214 | 0 | return Base64DecodeFast(dst, dstlen, src, srclen, decoded_size); |
215 | 0 | } Unexecuted instantiation: unsigned long nbytes::Base64Decode<char>(char*, unsigned long, char const*, unsigned long) Unexecuted instantiation: unsigned long nbytes::Base64Decode<unsigned short>(char*, unsigned long, unsigned short const*, unsigned long) |
216 | | |
217 | | #ifdef _MSC_VER |
218 | | #pragma warning(pop) |
219 | | #endif |
220 | | |
221 | | // ============================================================================ |
222 | | // Hex (legacy) |
223 | | |
224 | | extern const int8_t unhex_table[256]; |
225 | | |
226 | | template <typename TypeName> |
227 | | static size_t HexDecode(char *buf, size_t len, const TypeName *src, |
228 | 0 | const size_t srcLen) { |
229 | 0 | size_t i; |
230 | 0 | for (i = 0; i < len && i * 2 + 1 < srcLen; ++i) { |
231 | 0 | unsigned a = unhex_table[static_cast<uint8_t>(src[i * 2 + 0])]; |
232 | 0 | unsigned b = unhex_table[static_cast<uint8_t>(src[i * 2 + 1])]; |
233 | 0 | if (!~a || !~b) return i; |
234 | 0 | buf[i] = (a << 4) | b; |
235 | 0 | } |
236 | | |
237 | 0 | return i; |
238 | 0 | } Unexecuted instantiation: string_bytes.cc:unsigned long nbytes::HexDecode<char>(char*, unsigned long, char const*, unsigned long) Unexecuted instantiation: string_bytes.cc:unsigned long nbytes::HexDecode<unsigned short>(char*, unsigned long, unsigned short const*, unsigned long) |
239 | | |
240 | | size_t HexEncode(const char *src, size_t slen, char *dst, size_t dlen); |
241 | | |
242 | | std::string HexEncode(const char *src, size_t slen); |
243 | | |
244 | | // ============================================================================ |
245 | | // StringSearch |
246 | | |
247 | | namespace stringsearch { |
248 | | |
249 | | template <typename T> |
250 | | class Vector { |
251 | | public: |
252 | | Vector(T *data, size_t length, bool isForward) |
253 | 0 | : start_(data), length_(length), is_forward_(isForward) { |
254 | 0 | CHECK(length > 0 && data != nullptr); |
255 | 0 | } Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned short const>::Vector(unsigned short const*, unsigned long, bool) Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned char const>::Vector(unsigned char const*, unsigned long, bool) |
256 | | |
257 | | // Returns the start of the memory range. |
258 | | // For vector v this is NOT necessarily &v[0], see forward(). |
259 | 0 | const T *start() const { return start_; }Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned short const>::start() const Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned char const>::start() const |
260 | | |
261 | | // Returns the length of the vector, in characters. |
262 | 0 | size_t length() const { return length_; }Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned short const>::length() const Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned char const>::length() const |
263 | | |
264 | | // Returns true if the Vector is front-to-back, false if back-to-front. |
265 | | // In the latter case, v[0] corresponds to the *end* of the memory range. |
266 | 0 | bool forward() const { return is_forward_; }Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned short const>::forward() const Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned char const>::forward() const |
267 | | |
268 | | // Access individual vector elements - checks bounds in debug mode. |
269 | 0 | T &operator[](size_t index) const { |
270 | 0 | NBYTES_ASSERT_TRUE(index < length_); |
271 | 0 | return start_[is_forward_ ? index : (length_ - index - 1)]; |
272 | 0 | } Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned short const>::operator[](unsigned long) const Unexecuted instantiation: nbytes::stringsearch::Vector<unsigned char const>::operator[](unsigned long) const |
273 | | |
274 | | private: |
275 | | T *start_; |
276 | | size_t length_; |
277 | | bool is_forward_; |
278 | | }; |
279 | | |
280 | | //--------------------------------------------------------------------- |
281 | | // String Search object. |
282 | | //--------------------------------------------------------------------- |
283 | | |
284 | | // Class holding constants and methods that apply to all string search variants, |
285 | | // independently of subject and pattern char size. |
286 | | class StringSearchBase { |
287 | | protected: |
288 | | // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
289 | | // limit, we can fix the size of tables. For a needle longer than this limit, |
290 | | // search will not be optimal, since we only build tables for a suffix |
291 | | // of the string, but it is a safe approximation. |
292 | | static const int kBMMaxShift = 250; |
293 | | |
294 | | // Reduce alphabet to this size. |
295 | | // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
296 | | // proportional to the input alphabet. We reduce the alphabet size by |
297 | | // equating input characters modulo a smaller alphabet size. This gives |
298 | | // a potentially less efficient searching, but is a safe approximation. |
299 | | // For needles using only characters in the same Unicode 256-code point page, |
300 | | // there is no search speed degradation. |
301 | | static const int kLatin1AlphabetSize = 256; |
302 | | static const int kUC16AlphabetSize = 256; |
303 | | |
304 | | // Bad-char shift table stored in the state. It's length is the alphabet size. |
305 | | // For patterns below this length, the skip length of Boyer-Moore is too short |
306 | | // to compensate for the algorithmic overhead compared to simple brute force. |
307 | | static const int kBMMinPatternLength = 8; |
308 | | |
309 | | // Store for the BoyerMoore(Horspool) bad char shift table. |
310 | | int bad_char_shift_table_[kUC16AlphabetSize]; |
311 | | // Store for the BoyerMoore good suffix shift table. |
312 | | int good_suffix_shift_table_[kBMMaxShift + 1]; |
313 | | // Table used temporarily while building the BoyerMoore good suffix |
314 | | // shift table. |
315 | | int suffix_table_[kBMMaxShift + 1]; |
316 | | }; |
317 | | |
318 | | template <typename Char> |
319 | | class StringSearch : private StringSearchBase { |
320 | | public: |
321 | | typedef stringsearch::Vector<const Char> Vector; |
322 | | |
323 | 0 | explicit StringSearch(Vector pattern) : pattern_(pattern), start_(0) { |
324 | 0 | if (pattern.length() >= kBMMaxShift) { |
325 | 0 | start_ = pattern.length() - kBMMaxShift; |
326 | 0 | } |
327 | |
|
328 | 0 | size_t pattern_length = pattern_.length(); |
329 | 0 | NBYTES_ASSERT_TRUE(pattern_length > 0); |
330 | 0 | if (pattern_length < kBMMinPatternLength) { |
331 | 0 | if (pattern_length == 1) { |
332 | 0 | strategy_ = SearchStrategy::kSingleChar; |
333 | 0 | return; |
334 | 0 | } |
335 | 0 | strategy_ = SearchStrategy::kLinear; |
336 | 0 | return; |
337 | 0 | } |
338 | 0 | strategy_ = SearchStrategy::kInitial; |
339 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::StringSearch(nbytes::stringsearch::Vector<unsigned short const>) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::StringSearch(nbytes::stringsearch::Vector<unsigned char const>) |
340 | | |
341 | 0 | size_t Search(Vector subject, size_t index) { |
342 | 0 | switch (strategy_) { |
343 | 0 | case kBoyerMooreHorspool: |
344 | 0 | return BoyerMooreHorspoolSearch(subject, index); |
345 | 0 | case kBoyerMoore: |
346 | 0 | return BoyerMooreSearch(subject, index); |
347 | 0 | case kInitial: |
348 | 0 | return InitialSearch(subject, index); |
349 | 0 | case kLinear: |
350 | 0 | return LinearSearch(subject, index); |
351 | 0 | case kSingleChar: |
352 | 0 | return SingleCharSearch(subject, index); |
353 | 0 | } |
354 | 0 | unreachable(); |
355 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::Search(nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::Search(nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
356 | | |
357 | 0 | static inline int AlphabetSize() { |
358 | 0 | if (sizeof(Char) == 1) { |
359 | | // Latin1 needle. |
360 | 0 | return kLatin1AlphabetSize; |
361 | 0 | } else { |
362 | | // UC16 needle. |
363 | 0 | return kUC16AlphabetSize; |
364 | 0 | } |
365 | | |
366 | 0 | static_assert( |
367 | 0 | sizeof(Char) == sizeof(uint8_t) || sizeof(Char) == sizeof(uint16_t), |
368 | 0 | "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); |
369 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::AlphabetSize() Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::AlphabetSize() |
370 | | |
371 | | private: |
372 | | typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); |
373 | | size_t SingleCharSearch(Vector subject, size_t start_index); |
374 | | size_t LinearSearch(Vector subject, size_t start_index); |
375 | | size_t InitialSearch(Vector subject, size_t start_index); |
376 | | size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); |
377 | | size_t BoyerMooreSearch(Vector subject, size_t start_index); |
378 | | |
379 | | void PopulateBoyerMooreHorspoolTable(); |
380 | | |
381 | | void PopulateBoyerMooreTable(); |
382 | | |
383 | 0 | static inline int CharOccurrence(int *bad_char_occurrence, Char char_code) { |
384 | 0 | if (sizeof(Char) == 1) { |
385 | 0 | return bad_char_occurrence[static_cast<int>(char_code)]; |
386 | 0 | } |
387 | | // Both pattern and subject are UC16. Reduce character to equivalence class. |
388 | 0 | int equiv_class = char_code % kUC16AlphabetSize; |
389 | 0 | return bad_char_occurrence[equiv_class]; |
390 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::CharOccurrence(int*, unsigned short) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::CharOccurrence(int*, unsigned char) |
391 | | |
392 | | enum SearchStrategy { |
393 | | kBoyerMooreHorspool, |
394 | | kBoyerMoore, |
395 | | kInitial, |
396 | | kLinear, |
397 | | kSingleChar, |
398 | | }; |
399 | | |
400 | | // The pattern to search for. |
401 | | Vector pattern_; |
402 | | SearchStrategy strategy_; |
403 | | // Cache value of Max(0, pattern_length() - kBMMaxShift) |
404 | | size_t start_; |
405 | | }; |
406 | | |
407 | 0 | inline uint8_t GetHighestValueByte(uint16_t character) { |
408 | 0 | return std::max(static_cast<uint8_t>(character & 0xFF), |
409 | 0 | static_cast<uint8_t>(character >> 8)); |
410 | 0 | } |
411 | | |
412 | 0 | inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
413 | | |
414 | | // Searches for a byte value in a memory buffer, back to front. |
415 | | // Uses memrchr(3) on systems which support it, for speed. |
416 | | // Falls back to a vanilla for loop on non-GNU systems such as Windows. |
417 | | inline const void *MemrchrFill(const void *haystack, uint8_t needle, |
418 | 0 | size_t haystack_len) { |
419 | 0 | #ifdef _GNU_SOURCE |
420 | 0 | return memrchr(haystack, needle, haystack_len); |
421 | | #else |
422 | | const uint8_t *haystack8 = static_cast<const uint8_t *>(haystack); |
423 | | for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { |
424 | | if (haystack8[i] == needle) { |
425 | | return haystack8 + i; |
426 | | } |
427 | | } |
428 | | return nullptr; |
429 | | #endif |
430 | 0 | } |
431 | | |
432 | | // Finds the first occurrence of *two-byte* character pattern[0] in the string |
433 | | // `subject`. Does not check that the whole pattern matches. |
434 | | template <typename Char> |
435 | | inline size_t FindFirstCharacter(Vector<const Char> pattern, |
436 | 0 | Vector<const Char> subject, size_t index) { |
437 | 0 | const Char pattern_first_char = pattern[0]; |
438 | 0 | const size_t max_n = (subject.length() - pattern.length() + 1); |
439 | | |
440 | | // For speed, search for the more `rare` of the two bytes in pattern[0] |
441 | | // using memchr / memrchr (which are much faster than a simple for loop). |
442 | 0 | const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
443 | 0 | size_t pos = index; |
444 | 0 | do { |
445 | 0 | const size_t bytes_to_search = (max_n - pos) * sizeof(Char); |
446 | 0 | const void *void_pos; |
447 | 0 | if (subject.forward()) { |
448 | | // Assert that bytes_to_search won't overflow |
449 | 0 | NBYTES_ASSERT_TRUE(pos <= max_n); |
450 | 0 | NBYTES_ASSERT_TRUE(max_n - pos <= SIZE_MAX / sizeof(Char)); |
451 | 0 | void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); |
452 | 0 | } else { |
453 | 0 | NBYTES_ASSERT_TRUE(pos <= subject.length()); |
454 | 0 | NBYTES_ASSERT_TRUE(subject.length() - pos <= SIZE_MAX / sizeof(Char)); |
455 | 0 | void_pos = MemrchrFill(subject.start() + pattern.length() - 1, |
456 | 0 | search_byte, bytes_to_search); |
457 | 0 | } |
458 | 0 | const Char *char_pos = static_cast<const Char *>(void_pos); |
459 | 0 | if (char_pos == nullptr) return subject.length(); |
460 | | |
461 | | // Then, for each match, verify that the full two bytes match pattern[0]. |
462 | 0 | char_pos = AlignDown(char_pos, sizeof(Char)); |
463 | 0 | size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
464 | 0 | pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); |
465 | 0 | if (subject[pos] == pattern_first_char) { |
466 | | // Match found, hooray. |
467 | 0 | return pos; |
468 | 0 | } |
469 | | // Search byte matched, but the other byte of pattern[0] didn't. Keep going. |
470 | 0 | } while (++pos < max_n); |
471 | | |
472 | 0 | return subject.length(); |
473 | 0 | } |
474 | | |
475 | | // Finds the first occurrence of the byte pattern[0] in string `subject`. |
476 | | // Does not verify that the whole pattern matches. |
477 | | template <> |
478 | | inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, |
479 | 0 | Vector<const uint8_t> subject, size_t index) { |
480 | 0 | const uint8_t pattern_first_char = pattern[0]; |
481 | 0 | const size_t subj_len = subject.length(); |
482 | 0 | const size_t max_n = (subject.length() - pattern.length() + 1); |
483 | |
|
484 | 0 | const void *pos; |
485 | 0 | if (subject.forward()) { |
486 | 0 | pos = memchr(subject.start() + index, pattern_first_char, max_n - index); |
487 | 0 | } else { |
488 | 0 | pos = MemrchrFill(subject.start() + pattern.length() - 1, |
489 | 0 | pattern_first_char, max_n - index); |
490 | 0 | } |
491 | 0 | const uint8_t *char_pos = static_cast<const uint8_t *>(pos); |
492 | 0 | if (char_pos == nullptr) { |
493 | 0 | return subj_len; |
494 | 0 | } |
495 | | |
496 | 0 | size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
497 | 0 | return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); |
498 | 0 | } |
499 | | |
500 | | //--------------------------------------------------------------------- |
501 | | // Single Character Pattern Search Strategy |
502 | | //--------------------------------------------------------------------- |
503 | | |
504 | | template <typename Char> |
505 | 0 | size_t StringSearch<Char>::SingleCharSearch(Vector subject, size_t index) { |
506 | 0 | NBYTES_ASSERT_TRUE(1 == pattern_.length()); |
507 | 0 | return FindFirstCharacter(pattern_, subject, index); |
508 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::SingleCharSearch(nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::SingleCharSearch(nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
509 | | |
510 | | //--------------------------------------------------------------------- |
511 | | // Linear Search Strategy |
512 | | //--------------------------------------------------------------------- |
513 | | |
514 | | // Simple linear search for short patterns. Never bails out. |
515 | | template <typename Char> |
516 | 0 | size_t StringSearch<Char>::LinearSearch(Vector subject, size_t index) { |
517 | 0 | NBYTES_ASSERT_TRUE(pattern_.length() > 1); |
518 | 0 | const size_t n = subject.length() - pattern_.length(); |
519 | 0 | for (size_t i = index; i <= n; i++) { |
520 | 0 | i = FindFirstCharacter(pattern_, subject, i); |
521 | 0 | if (i == subject.length()) return subject.length(); |
522 | 0 | NBYTES_ASSERT_TRUE(i <= n); |
523 | |
|
524 | 0 | bool matches = true; |
525 | 0 | for (size_t j = 1; j < pattern_.length(); j++) { |
526 | 0 | if (pattern_[j] != subject[i + j]) { |
527 | 0 | matches = false; |
528 | 0 | break; |
529 | 0 | } |
530 | 0 | } |
531 | 0 | if (matches) { |
532 | 0 | return i; |
533 | 0 | } |
534 | 0 | } |
535 | 0 | return subject.length(); |
536 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::LinearSearch(nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::LinearSearch(nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
537 | | |
538 | | //--------------------------------------------------------------------- |
539 | | // Boyer-Moore string search |
540 | | //--------------------------------------------------------------------- |
541 | | |
542 | | template <typename Char> |
543 | | size_t StringSearch<Char>::BoyerMooreSearch(Vector subject, |
544 | 0 | size_t start_index) { |
545 | 0 | const size_t subject_length = subject.length(); |
546 | 0 | const size_t pattern_length = pattern_.length(); |
547 | | // Only preprocess at most kBMMaxShift last characters of pattern. |
548 | 0 | size_t start = start_; |
549 | |
|
550 | 0 | int *bad_char_occurrence = bad_char_shift_table_; |
551 | 0 | int *good_suffix_shift = good_suffix_shift_table_ - start_; |
552 | |
|
553 | 0 | Char last_char = pattern_[pattern_length - 1]; |
554 | 0 | size_t index = start_index; |
555 | | // Continue search from i. |
556 | 0 | while (index <= subject_length - pattern_length) { |
557 | 0 | size_t j = pattern_length - 1; |
558 | 0 | int c; |
559 | 0 | while (last_char != (c = subject[index + j])) { |
560 | 0 | int shift = j - CharOccurrence(bad_char_occurrence, c); |
561 | 0 | index += shift; |
562 | 0 | if (index > subject_length - pattern_length) { |
563 | 0 | return subject.length(); |
564 | 0 | } |
565 | 0 | } |
566 | 0 | while (pattern_[j] == (c = subject[index + j])) { |
567 | 0 | if (j == 0) { |
568 | 0 | return index; |
569 | 0 | } |
570 | 0 | j--; |
571 | 0 | } |
572 | 0 | if (j < start) { |
573 | | // we have matched more than our tables allow us to be smart about. |
574 | | // Fall back on BMH shift. |
575 | 0 | index += |
576 | 0 | pattern_length - 1 - CharOccurrence(bad_char_occurrence, last_char); |
577 | 0 | } else { |
578 | 0 | int gs_shift = good_suffix_shift[j + 1]; |
579 | 0 | int bc_occ = CharOccurrence(bad_char_occurrence, c); |
580 | 0 | int shift = j - bc_occ; |
581 | 0 | if (gs_shift > shift) { |
582 | 0 | shift = gs_shift; |
583 | 0 | } |
584 | 0 | index += shift; |
585 | 0 | } |
586 | 0 | } |
587 | | |
588 | 0 | return subject.length(); |
589 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::BoyerMooreSearch(nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::BoyerMooreSearch(nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
590 | | |
591 | | template <typename Char> |
592 | 0 | void StringSearch<Char>::PopulateBoyerMooreTable() { |
593 | 0 | const size_t pattern_length = pattern_.length(); |
594 | | // Only look at the last kBMMaxShift characters of pattern (from start_ |
595 | | // to pattern_length). |
596 | 0 | const size_t start = start_; |
597 | 0 | const size_t length = pattern_length - start; |
598 | | |
599 | | // Biased tables so that we can use pattern indices as table indices, |
600 | | // even if we only cover the part of the pattern from offset start. |
601 | 0 | int *shift_table = good_suffix_shift_table_ - start_; |
602 | 0 | int *suffix_table = suffix_table_ - start_; |
603 | | |
604 | | // Initialize table. |
605 | 0 | for (size_t i = start; i < pattern_length; i++) { |
606 | 0 | shift_table[i] = length; |
607 | 0 | } |
608 | 0 | shift_table[pattern_length] = 1; |
609 | 0 | suffix_table[pattern_length] = pattern_length + 1; |
610 | |
|
611 | 0 | if (pattern_length <= start) { |
612 | 0 | return; |
613 | 0 | } |
614 | | |
615 | | // Find suffixes. |
616 | 0 | Char last_char = pattern_[pattern_length - 1]; |
617 | 0 | size_t suffix = pattern_length + 1; |
618 | 0 | { |
619 | 0 | size_t i = pattern_length; |
620 | 0 | while (i > start) { |
621 | 0 | Char c = pattern_[i - 1]; |
622 | 0 | while (suffix <= pattern_length && c != pattern_[suffix - 1]) { |
623 | 0 | if (static_cast<size_t>(shift_table[suffix]) == length) { |
624 | 0 | shift_table[suffix] = suffix - i; |
625 | 0 | } |
626 | 0 | suffix = suffix_table[suffix]; |
627 | 0 | } |
628 | 0 | suffix_table[--i] = --suffix; |
629 | 0 | if (suffix == pattern_length) { |
630 | | // No suffix to extend, so we check against last_char only. |
631 | 0 | while ((i > start) && (pattern_[i - 1] != last_char)) { |
632 | 0 | if (static_cast<size_t>(shift_table[pattern_length]) == length) { |
633 | 0 | shift_table[pattern_length] = pattern_length - i; |
634 | 0 | } |
635 | 0 | suffix_table[--i] = pattern_length; |
636 | 0 | } |
637 | 0 | if (i > start) { |
638 | 0 | suffix_table[--i] = --suffix; |
639 | 0 | } |
640 | 0 | } |
641 | 0 | } |
642 | 0 | } |
643 | | // Build shift table using suffixes. |
644 | 0 | if (suffix < pattern_length) { |
645 | 0 | for (size_t i = start; i <= pattern_length; i++) { |
646 | 0 | if (static_cast<size_t>(shift_table[i]) == length) { |
647 | 0 | shift_table[i] = suffix - start; |
648 | 0 | } |
649 | 0 | if (i == suffix) { |
650 | 0 | suffix = suffix_table[suffix]; |
651 | 0 | } |
652 | 0 | } |
653 | 0 | } |
654 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::PopulateBoyerMooreTable() Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::PopulateBoyerMooreTable() |
655 | | |
656 | | //--------------------------------------------------------------------- |
657 | | // Boyer-Moore-Horspool string search. |
658 | | //--------------------------------------------------------------------- |
659 | | |
660 | | template <typename Char> |
661 | | size_t StringSearch<Char>::BoyerMooreHorspoolSearch(Vector subject, |
662 | 0 | size_t start_index) { |
663 | 0 | const size_t subject_length = subject.length(); |
664 | 0 | const size_t pattern_length = pattern_.length(); |
665 | 0 | int *char_occurrences = bad_char_shift_table_; |
666 | 0 | int64_t badness = -static_cast<int64_t>(pattern_length); |
667 | | |
668 | | // How bad we are doing without a good-suffix table. |
669 | 0 | Char last_char = pattern_[pattern_length - 1]; |
670 | 0 | int last_char_shift = |
671 | 0 | pattern_length - 1 - CharOccurrence(char_occurrences, last_char); |
672 | | |
673 | | // Perform search |
674 | 0 | size_t index = start_index; // No matches found prior to this index. |
675 | 0 | while (index <= subject_length - pattern_length) { |
676 | 0 | size_t j = pattern_length - 1; |
677 | 0 | int subject_char; |
678 | 0 | while (last_char != (subject_char = subject[index + j])) { |
679 | 0 | int bc_occ = CharOccurrence(char_occurrences, subject_char); |
680 | 0 | int shift = j - bc_occ; |
681 | 0 | index += shift; |
682 | 0 | badness += 1 - shift; // at most zero, so badness cannot increase. |
683 | 0 | if (index > subject_length - pattern_length) { |
684 | 0 | return subject_length; |
685 | 0 | } |
686 | 0 | } |
687 | 0 | j--; |
688 | 0 | while (pattern_[j] == (subject[index + j])) { |
689 | 0 | if (j == 0) { |
690 | 0 | return index; |
691 | 0 | } |
692 | 0 | j--; |
693 | 0 | } |
694 | 0 | index += last_char_shift; |
695 | | // Badness increases by the number of characters we have |
696 | | // checked, and decreases by the number of characters we |
697 | | // can skip by shifting. It's a measure of how we are doing |
698 | | // compared to reading each character exactly once. |
699 | 0 | badness += (pattern_length - j) - last_char_shift; |
700 | 0 | if (badness > 0) { |
701 | 0 | PopulateBoyerMooreTable(); |
702 | 0 | strategy_ = SearchStrategy::kBoyerMoore; |
703 | 0 | return BoyerMooreSearch(subject, index); |
704 | 0 | } |
705 | 0 | } |
706 | 0 | return subject.length(); |
707 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::BoyerMooreHorspoolSearch(nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::BoyerMooreHorspoolSearch(nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
708 | | |
709 | | template <typename Char> |
710 | 0 | void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { |
711 | 0 | const size_t pattern_length = pattern_.length(); |
712 | |
|
713 | 0 | int *bad_char_occurrence = bad_char_shift_table_; |
714 | | |
715 | | // Only preprocess at most kBMMaxShift last characters of pattern. |
716 | 0 | const size_t start = start_; |
717 | | // Run forwards to populate bad_char_table, so that *last* instance |
718 | | // of character equivalence class is the one registered. |
719 | | // Notice: Doesn't include the last character. |
720 | 0 | const size_t table_size = AlphabetSize(); |
721 | 0 | if (start == 0) { |
722 | | // All patterns less than kBMMaxShift in length. |
723 | 0 | memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); |
724 | 0 | } else { |
725 | 0 | for (size_t i = 0; i < table_size; i++) { |
726 | 0 | bad_char_occurrence[i] = start - 1; |
727 | 0 | } |
728 | 0 | } |
729 | 0 | for (size_t i = start; i < pattern_length - 1; i++) { |
730 | 0 | Char c = pattern_[i]; |
731 | 0 | int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); |
732 | 0 | bad_char_occurrence[bucket] = i; |
733 | 0 | } |
734 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::PopulateBoyerMooreHorspoolTable() Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::PopulateBoyerMooreHorspoolTable() |
735 | | |
736 | | //--------------------------------------------------------------------- |
737 | | // Linear string search with bailout to BMH. |
738 | | //--------------------------------------------------------------------- |
739 | | |
740 | | // Simple linear search for short patterns, which bails out if the string |
741 | | // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
742 | | template <typename Char> |
743 | 0 | size_t StringSearch<Char>::InitialSearch(Vector subject, size_t index) { |
744 | 0 | const size_t pattern_length = pattern_.length(); |
745 | | // Badness is a count of how much work we have done. When we have |
746 | | // done enough work we decide it's probably worth switching to a better |
747 | | // algorithm. |
748 | 0 | int64_t badness = -10 - (pattern_length << 2); |
749 | | |
750 | | // We know our pattern is at least 2 characters, we cache the first so |
751 | | // the common case of the first character not matching is faster. |
752 | 0 | for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { |
753 | 0 | badness++; |
754 | 0 | if (badness <= 0) { |
755 | 0 | i = FindFirstCharacter(pattern_, subject, i); |
756 | 0 | if (i == subject.length()) return subject.length(); |
757 | 0 | NBYTES_ASSERT_TRUE(i <= n); |
758 | 0 | size_t j = 1; |
759 | 0 | do { |
760 | 0 | if (pattern_[j] != subject[i + j]) { |
761 | 0 | break; |
762 | 0 | } |
763 | 0 | j++; |
764 | 0 | } while (j < pattern_length); |
765 | 0 | if (j == pattern_length) { |
766 | 0 | return i; |
767 | 0 | } |
768 | 0 | badness += j; |
769 | 0 | } else { |
770 | 0 | PopulateBoyerMooreHorspoolTable(); |
771 | 0 | strategy_ = SearchStrategy::kBoyerMooreHorspool; |
772 | 0 | return BoyerMooreHorspoolSearch(subject, i); |
773 | 0 | } |
774 | 0 | } |
775 | 0 | return subject.length(); |
776 | 0 | } Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned short>::InitialSearch(nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: nbytes::stringsearch::StringSearch<unsigned char>::InitialSearch(nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
777 | | |
778 | | // Perform a single stand-alone search. |
779 | | // If searching multiple times for the same pattern, a search |
780 | | // object should be constructed once and the Search function then called |
781 | | // for each search. |
782 | | template <typename Char> |
783 | | size_t SearchString(Vector<const Char> subject, Vector<const Char> pattern, |
784 | 0 | size_t start_index) { |
785 | 0 | StringSearch<Char> search(pattern); |
786 | 0 | return search.Search(subject, start_index); |
787 | 0 | } Unexecuted instantiation: unsigned long nbytes::stringsearch::SearchString<unsigned short>(nbytes::stringsearch::Vector<unsigned short const>, nbytes::stringsearch::Vector<unsigned short const>, unsigned long) Unexecuted instantiation: unsigned long nbytes::stringsearch::SearchString<unsigned char>(nbytes::stringsearch::Vector<unsigned char const>, nbytes::stringsearch::Vector<unsigned char const>, unsigned long) |
788 | | } // namespace stringsearch |
789 | | |
790 | | template <typename Char> |
791 | | size_t SearchString(const Char *haystack, size_t haystack_length, |
792 | | const Char *needle, size_t needle_length, |
793 | 0 | size_t start_index, bool is_forward) { |
794 | 0 | if (haystack_length < needle_length) return haystack_length; |
795 | | // To do a reverse search (lastIndexOf instead of indexOf) without redundant |
796 | | // code, create two vectors that are reversed views into the input strings. |
797 | | // For example, v_needle[0] would return the *last* character of the needle. |
798 | | // So we're searching for the first instance of rev(needle) in rev(haystack) |
799 | 0 | stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); |
800 | 0 | stringsearch::Vector<const Char> v_haystack(haystack, haystack_length, |
801 | 0 | is_forward); |
802 | 0 | size_t diff = haystack_length - needle_length; |
803 | 0 | size_t relative_start_index; |
804 | 0 | if (is_forward) { |
805 | 0 | relative_start_index = start_index; |
806 | 0 | } else if (diff < start_index) { |
807 | 0 | relative_start_index = 0; |
808 | 0 | } else { |
809 | 0 | relative_start_index = diff - start_index; |
810 | 0 | } |
811 | 0 | size_t pos = |
812 | 0 | stringsearch::SearchString(v_haystack, v_needle, relative_start_index); |
813 | 0 | if (pos == haystack_length) { |
814 | | // not found |
815 | 0 | return pos; |
816 | 0 | } |
817 | 0 | return is_forward ? pos : (haystack_length - needle_length - pos); |
818 | 0 | } Unexecuted instantiation: unsigned long nbytes::SearchString<unsigned short>(unsigned short const*, unsigned long, unsigned short const*, unsigned long, unsigned long, bool) Unexecuted instantiation: unsigned long nbytes::SearchString<unsigned char>(unsigned char const*, unsigned long, unsigned char const*, unsigned long, unsigned long, bool) |
819 | | |
820 | | template <size_t N> |
821 | | size_t SearchString(const char *haystack, size_t haystack_length, |
822 | | const char (&needle)[N]) { |
823 | | return SearchString( |
824 | | reinterpret_cast<const uint8_t *>(haystack), haystack_length, |
825 | | reinterpret_cast<const uint8_t *>(needle), N - 1, 0, true); |
826 | | } |
827 | | |
828 | | // ============================================================================ |
829 | | // Version metadata |
830 | 72 | #define NBYTES_VERSION "0.1.1" |
831 | | |
832 | | enum { |
833 | | NBYTES_VERSION_MAJOR = 0, |
834 | | NBYTES_VERSION_MINOR = 1, |
835 | | NBYTES_VERSION_REVISION = 1, |
836 | | }; |
837 | | |
838 | | } // namespace nbytes |
839 | | |
840 | | #endif // NBYTES_H |