Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_STRING_SEARCH_H_
6 : #define V8_STRING_SEARCH_H_
7 :
8 : #include "src/isolate.h"
9 : #include "src/vector.h"
10 :
11 : namespace v8 {
12 : namespace internal {
13 :
14 :
15 : //---------------------------------------------------------------------
16 : // String Search object.
17 : //---------------------------------------------------------------------
18 :
19 : // Class holding constants and methods that apply to all string search variants,
20 : // independently of subject and pattern char size.
21 : class StringSearchBase {
22 : protected:
23 : // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24 : // limit, we can fix the size of tables. For a needle longer than this limit,
25 : // search will not be optimal, since we only build tables for a suffix
26 : // of the string, but it is a safe approximation.
27 : static const int kBMMaxShift = Isolate::kBMMaxShift;
28 :
29 : // Reduce alphabet to this size.
30 : // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31 : // proportional to the input alphabet. We reduce the alphabet size by
32 : // equating input characters modulo a smaller alphabet size. This gives
33 : // a potentially less efficient searching, but is a safe approximation.
34 : // For needles using only characters in the same Unicode 256-code point page,
35 : // there is no search speed degradation.
36 : static const int kLatin1AlphabetSize = 256;
37 : static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38 :
39 : // Bad-char shift table stored in the state. It's length is the alphabet size.
40 : // For patterns below this length, the skip length of Boyer-Moore is too short
41 : // to compensate for the algorithmic overhead compared to simple brute force.
42 : static const int kBMMinPatternLength = 7;
43 :
44 : static inline bool IsOneByteString(Vector<const uint8_t> string) {
45 : return true;
46 : }
47 :
48 : static inline bool IsOneByteString(Vector<const uc16> string) {
49 : return String::IsOneByte(string.start(), string.length());
50 : }
51 :
52 : friend class Isolate;
53 : };
54 :
55 :
56 : template <typename PatternChar, typename SubjectChar>
57 : class StringSearch : private StringSearchBase {
58 : public:
59 597559 : StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
60 : : isolate_(isolate),
61 : pattern_(pattern),
62 8300637 : start_(Max(0, pattern.length() - kBMMaxShift)) {
63 : if (sizeof(PatternChar) > sizeof(SubjectChar)) {
64 597559 : if (!IsOneByteString(pattern_)) {
65 597559 : strategy_ = &FailSearch;
66 597559 : return;
67 : }
68 : }
69 : int pattern_length = pattern_.length();
70 2108158 : if (pattern_length < kBMMinPatternLength) {
71 1328876 : if (pattern_length == 1) {
72 1215214 : strategy_ = &SingleCharSearch;
73 0 : return;
74 : }
75 297162 : strategy_ = &LinearSearch;
76 0 : return;
77 : }
78 779282 : strategy_ = &InitialSearch;
79 : }
80 :
81 : int Search(Vector<const SubjectChar> subject, int index) {
82 2889234 : return strategy_(this, subject, index);
83 : }
84 :
85 : static inline int AlphabetSize() {
86 : if (sizeof(PatternChar) == 1) {
87 : // Latin1 needle.
88 : return kLatin1AlphabetSize;
89 : } else {
90 : DCHECK_EQ(sizeof(PatternChar), 2);
91 : // UC16 needle.
92 : return kUC16AlphabetSize;
93 : }
94 : }
95 :
96 : private:
97 : typedef int (*SearchFunction)( // NOLINT - it's not a cast!
98 : StringSearch<PatternChar, SubjectChar>*,
99 : Vector<const SubjectChar>,
100 : int);
101 :
102 597559 : static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
103 : Vector<const SubjectChar>,
104 : int) {
105 597559 : return -1;
106 : }
107 :
108 : static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
109 : Vector<const SubjectChar> subject,
110 : int start_index);
111 :
112 : static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
113 : Vector<const SubjectChar> subject,
114 : int start_index);
115 :
116 : static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
117 : Vector<const SubjectChar> subject,
118 : int start_index);
119 :
120 : static int BoyerMooreHorspoolSearch(
121 : StringSearch<PatternChar, SubjectChar>* search,
122 : Vector<const SubjectChar> subject,
123 : int start_index);
124 :
125 : static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
126 : Vector<const SubjectChar> subject,
127 : int start_index);
128 :
129 : void PopulateBoyerMooreHorspoolTable();
130 :
131 : void PopulateBoyerMooreTable();
132 :
133 : static inline bool exceedsOneByte(uint8_t c) {
134 : return false;
135 : }
136 :
137 : static inline bool exceedsOneByte(uint16_t c) {
138 : return c > String::kMaxOneByteCharCodeU;
139 : }
140 :
141 : static inline int CharOccurrence(int* bad_char_occurrence,
142 : SubjectChar char_code) {
143 : if (sizeof(SubjectChar) == 1) {
144 515396 : return bad_char_occurrence[static_cast<int>(char_code)];
145 : }
146 : if (sizeof(PatternChar) == 1) {
147 0 : if (exceedsOneByte(char_code)) {
148 : return -1;
149 : }
150 0 : return bad_char_occurrence[static_cast<unsigned int>(char_code)];
151 : }
152 : // Both pattern and subject are UC16. Reduce character to equivalence class.
153 : int equiv_class = char_code % kUC16AlphabetSize;
154 0 : return bad_char_occurrence[equiv_class];
155 : }
156 :
157 : // The following tables are shared by all searches.
158 : // TODO(lrn): Introduce a way for a pattern to keep its tables
159 : // between searches (e.g., for an Atom RegExp).
160 :
161 : // Store for the BoyerMoore(Horspool) bad char shift table.
162 : // Return a table covering the last kBMMaxShift+1 positions of
163 : // pattern.
164 : int* bad_char_table() {
165 : return isolate_->bad_char_shift_table();
166 : }
167 :
168 : // Store for the BoyerMoore good suffix shift table.
169 : int* good_suffix_shift_table() {
170 : // Return biased pointer that maps the range [start_..pattern_.length()
171 : // to the kGoodSuffixShiftTable array.
172 36 : return isolate_->good_suffix_shift_table() - start_;
173 : }
174 :
175 : // Table used temporarily while building the BoyerMoore good suffix
176 : // shift table.
177 : int* suffix_table() {
178 : // Return biased pointer that maps the range [start_..pattern_.length()
179 : // to the kSuffixTable array.
180 18 : return isolate_->suffix_table() - start_;
181 : }
182 :
183 : Isolate* isolate_;
184 : // The pattern to search for.
185 : Vector<const PatternChar> pattern_;
186 : // Pointer to implementation of the search.
187 : SearchFunction strategy_;
188 : // Cache value of Max(0, pattern_length() - kBMMaxShift)
189 : int start_;
190 : };
191 :
192 :
193 : template <typename T, typename U>
194 : inline T AlignDown(T value, U alignment) {
195 : return reinterpret_cast<T>(
196 113976 : (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
197 : }
198 :
199 :
200 : inline uint8_t GetHighestValueByte(uc16 character) {
201 2402 : return Max(static_cast<uint8_t>(character & 0xFF),
202 : static_cast<uint8_t>(character >> 8));
203 : }
204 :
205 :
206 : inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
207 :
208 :
209 : template <typename PatternChar, typename SubjectChar>
210 14365695 : inline int FindFirstCharacter(Vector<const PatternChar> pattern,
211 : Vector<const SubjectChar> subject, int index) {
212 14365695 : const PatternChar pattern_first_char = pattern[0];
213 14365695 : const int max_n = (subject.length() - pattern.length() + 1);
214 :
215 : const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
216 8923 : const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
217 : int pos = index;
218 109335 : do {
219 : DCHECK_GE(max_n - pos, 0);
220 : const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
221 14471330 : memchr(subject.start() + pos, search_byte,
222 : (max_n - pos) * sizeof(SubjectChar)));
223 14471330 : if (char_pos == nullptr) return -1;
224 : char_pos = AlignDown(char_pos, sizeof(SubjectChar));
225 13242398 : pos = static_cast<int>(char_pos - subject.start());
226 26484796 : if (subject[pos] == search_char) return pos;
227 : } while (++pos < max_n);
228 :
229 : return -1;
230 : }
231 :
232 :
233 : //---------------------------------------------------------------------
234 : // Single Character Pattern Search Strategy
235 : //---------------------------------------------------------------------
236 :
237 : template <typename PatternChar, typename SubjectChar>
238 1215211 : int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
239 : StringSearch<PatternChar, SubjectChar>* search,
240 : Vector<const SubjectChar> subject,
241 : int index) {
242 : DCHECK_EQ(1, search->pattern_.length());
243 0 : PatternChar pattern_first_char = search->pattern_[0];
244 : if (sizeof(PatternChar) > sizeof(SubjectChar)) {
245 0 : if (exceedsOneByte(pattern_first_char)) {
246 : return -1;
247 : }
248 : }
249 1215211 : return FindFirstCharacter(search->pattern_, subject, index);
250 : }
251 :
252 : //---------------------------------------------------------------------
253 : // Linear Search Strategy
254 : //---------------------------------------------------------------------
255 :
256 :
257 : template <typename PatternChar, typename SubjectChar>
258 : inline bool CharCompare(const PatternChar* pattern,
259 : const SubjectChar* subject,
260 : int length) {
261 : DCHECK_GT(length, 0);
262 : int pos = 0;
263 323812 : do {
264 413708 : if (pattern[pos] != subject[pos]) {
265 : return false;
266 : }
267 323812 : pos++;
268 : } while (pos < length);
269 : return true;
270 : }
271 :
272 :
273 : // Simple linear search for short patterns. Never bails out.
274 : template <typename PatternChar, typename SubjectChar>
275 297167 : int StringSearch<PatternChar, SubjectChar>::LinearSearch(
276 : StringSearch<PatternChar, SubjectChar>* search,
277 : Vector<const SubjectChar> subject,
278 : int index) {
279 297167 : Vector<const PatternChar> pattern = search->pattern_;
280 : DCHECK_GT(pattern.length(), 1);
281 : int pattern_length = pattern.length();
282 : int i = index;
283 297167 : int n = subject.length() - pattern_length;
284 387063 : while (i <= n) {
285 386928 : i = FindFirstCharacter(pattern, subject, i);
286 386928 : if (i == -1) return -1;
287 : DCHECK_LE(i, n);
288 375293 : i++;
289 : // Loop extracted to separate function to allow using return to do
290 : // a deeper break.
291 750586 : if (CharCompare(pattern.start() + 1,
292 : subject.start() + i,
293 : pattern_length - 1)) {
294 : return i - 1;
295 : }
296 : }
297 : return -1;
298 : }
299 :
300 : //---------------------------------------------------------------------
301 : // Boyer-Moore string search
302 : //---------------------------------------------------------------------
303 :
304 : template <typename PatternChar, typename SubjectChar>
305 18 : int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
306 : StringSearch<PatternChar, SubjectChar>* search,
307 : Vector<const SubjectChar> subject,
308 : int start_index) {
309 18 : Vector<const PatternChar> pattern = search->pattern_;
310 : int subject_length = subject.length();
311 : int pattern_length = pattern.length();
312 : // Only preprocess at most kBMMaxShift last characters of pattern.
313 18 : int start = search->start_;
314 :
315 : int* bad_char_occurence = search->bad_char_table();
316 : int* good_suffix_shift = search->good_suffix_shift_table();
317 :
318 36 : PatternChar last_char = pattern[pattern_length - 1];
319 : int index = start_index;
320 : // Continue search from i.
321 3258 : while (index <= subject_length - pattern_length) {
322 : int j = pattern_length - 1;
323 : int c;
324 6516 : while (last_char != (c = subject[index + j])) {
325 : int shift =
326 0 : j - CharOccurrence(bad_char_occurence, c);
327 0 : index += shift;
328 0 : if (index > subject_length - pattern_length) {
329 : return -1;
330 : }
331 : }
332 19836 : while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
333 3258 : if (j < 0) {
334 : return index;
335 3240 : } else if (j < start) {
336 : // we have matched more than our tables allow us to be smart about.
337 : // Fall back on BMH shift.
338 0 : index += pattern_length - 1
339 0 : - CharOccurrence(bad_char_occurence,
340 : static_cast<SubjectChar>(last_char));
341 : } else {
342 3240 : int gs_shift = good_suffix_shift[j + 1];
343 : int bc_occ =
344 : CharOccurrence(bad_char_occurence, c);
345 3240 : int shift = j - bc_occ;
346 3240 : if (gs_shift > shift) {
347 : shift = gs_shift;
348 : }
349 3240 : index += shift;
350 : }
351 : }
352 :
353 : return -1;
354 : }
355 :
356 :
357 : template <typename PatternChar, typename SubjectChar>
358 18 : void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
359 : int pattern_length = pattern_.length();
360 : const PatternChar* pattern = pattern_.start();
361 : // Only look at the last kBMMaxShift characters of pattern (from start_
362 : // to pattern_length).
363 18 : int start = start_;
364 18 : int length = pattern_length - start;
365 :
366 : // Biased tables so that we can use pattern indices as table indices,
367 : // even if we only cover the part of the pattern from offset start.
368 : int* shift_table = good_suffix_shift_table();
369 : int* suffix_table = this->suffix_table();
370 :
371 : // Initialize table.
372 270 : for (int i = start; i < pattern_length; i++) {
373 126 : shift_table[i] = length;
374 : }
375 18 : shift_table[pattern_length] = 1;
376 18 : suffix_table[pattern_length] = pattern_length + 1;
377 :
378 18 : if (pattern_length <= start) {
379 : return;
380 : }
381 :
382 : // Find suffixes.
383 18 : PatternChar last_char = pattern[pattern_length - 1];
384 : int suffix = pattern_length + 1;
385 : {
386 : int i = pattern_length;
387 72 : while (i > start) {
388 54 : PatternChar c = pattern[i - 1];
389 198 : while (suffix <= pattern_length && c != pattern[suffix - 1]) {
390 72 : if (shift_table[suffix] == length) {
391 18 : shift_table[suffix] = suffix - i;
392 : }
393 72 : suffix = suffix_table[suffix];
394 : }
395 54 : suffix_table[--i] = --suffix;
396 54 : if (suffix == pattern_length) {
397 : // No suffix to extend, so we check against last_char only.
398 90 : while ((i > start) && (pattern[i - 1] != last_char)) {
399 18 : if (shift_table[pattern_length] == length) {
400 0 : shift_table[pattern_length] = pattern_length - i;
401 : }
402 18 : suffix_table[--i] = pattern_length;
403 : }
404 54 : if (i > start) {
405 54 : suffix_table[--i] = --suffix;
406 : }
407 : }
408 : }
409 : }
410 : // Build shift table using suffixes.
411 18 : if (suffix < pattern_length) {
412 306 : for (int i = start; i <= pattern_length; i++) {
413 144 : if (shift_table[i] == length) {
414 108 : shift_table[i] = suffix - start;
415 : }
416 144 : if (i == suffix) {
417 36 : suffix = suffix_table[suffix];
418 : }
419 : }
420 : }
421 : }
422 :
423 : //---------------------------------------------------------------------
424 : // Boyer-Moore-Horspool string search.
425 : //---------------------------------------------------------------------
426 :
427 : template <typename PatternChar, typename SubjectChar>
428 11431 : int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
429 : StringSearch<PatternChar, SubjectChar>* search,
430 : Vector<const SubjectChar> subject,
431 : int start_index) {
432 11431 : Vector<const PatternChar> pattern = search->pattern_;
433 : int subject_length = subject.length();
434 : int pattern_length = pattern.length();
435 : int* char_occurrences = search->bad_char_table();
436 11431 : int badness = -pattern_length;
437 :
438 : // How bad we are doing without a good-suffix table.
439 22862 : PatternChar last_char = pattern[pattern_length - 1];
440 : int last_char_shift = pattern_length - 1 -
441 11431 : CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
442 : // Perform search
443 : int index = start_index; // No matches found prior to this index.
444 50005 : while (index <= subject_length - pattern_length) {
445 : int j = pattern_length - 1;
446 : int subject_char;
447 1100416 : while (last_char != (subject_char = subject[index + j])) {
448 : int bc_occ = CharOccurrence(char_occurrences, subject_char);
449 500725 : int shift = j - bc_occ;
450 500725 : index += shift;
451 500725 : badness += 1 - shift; // at most zero, so badness cannot increase.
452 500725 : if (index > subject_length - pattern_length) {
453 : return -1;
454 : }
455 : }
456 49483 : j--;
457 922135 : while (j >= 0 && pattern[j] == (subject[index + j])) j--;
458 49483 : if (j < 0) {
459 : return index;
460 : } else {
461 38592 : index += last_char_shift;
462 : // Badness increases by the number of characters we have
463 : // checked, and decreases by the number of characters we
464 : // can skip by shifting. It's a measure of how we are doing
465 : // compared to reading each character exactly once.
466 38592 : badness += (pattern_length - j) - last_char_shift;
467 38592 : if (badness > 0) {
468 18 : search->PopulateBoyerMooreTable();
469 18 : search->strategy_ = &BoyerMooreSearch;
470 18 : return BoyerMooreSearch(search, subject, index);
471 : }
472 : }
473 : }
474 : return -1;
475 : }
476 :
477 :
478 : template <typename PatternChar, typename SubjectChar>
479 11431 : void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
480 : int pattern_length = pattern_.length();
481 :
482 : int* bad_char_occurrence = bad_char_table();
483 :
484 : // Only preprocess at most kBMMaxShift last characters of pattern.
485 11431 : int start = start_;
486 : // Run forwards to populate bad_char_table, so that *last* instance
487 : // of character equivalence class is the one registered.
488 : // Notice: Doesn't include the last character.
489 : int table_size = AlphabetSize();
490 11431 : if (start == 0) { // All patterns less than kBMMaxShift in length.
491 : memset(bad_char_occurrence,
492 : -1,
493 : table_size * sizeof(*bad_char_occurrence));
494 : } else {
495 0 : for (int i = 0; i < table_size; i++) {
496 0 : bad_char_occurrence[i] = start - 1;
497 : }
498 : }
499 509829 : for (int i = start; i < pattern_length - 1; i++) {
500 498398 : PatternChar c = pattern_[i];
501 0 : int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
502 249199 : bad_char_occurrence[bucket] = i;
503 : }
504 11431 : }
505 :
506 : //---------------------------------------------------------------------
507 : // Linear string search with bailout to BMH.
508 : //---------------------------------------------------------------------
509 :
510 : // Simple linear search for short patterns, which bails out if the string
511 : // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
512 : template <typename PatternChar, typename SubjectChar>
513 779291 : int StringSearch<PatternChar, SubjectChar>::InitialSearch(
514 : StringSearch<PatternChar, SubjectChar>* search,
515 : Vector<const SubjectChar> subject,
516 : int index) {
517 779291 : Vector<const PatternChar> pattern = search->pattern_;
518 : int pattern_length = pattern.length();
519 : // Badness is a count of how much work we have done. When we have
520 : // done enough work we decide it's probably worth switching to a better
521 : // algorithm.
522 779291 : int badness = -10 - (pattern_length << 2);
523 :
524 : // We know our pattern is at least 2 characters, we cache the first so
525 : // the common case of the first character not matching is faster.
526 12775055 : for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
527 12774988 : badness++;
528 12774988 : if (badness <= 0) {
529 12763557 : i = FindFirstCharacter(pattern, subject, i);
530 12763557 : if (i == -1) return -1;
531 : DCHECK_LE(i, n);
532 : int j = 1;
533 186180767 : do {
534 594529593 : if (pattern[j] != subject[i + j]) {
535 : break;
536 : }
537 186180767 : j++;
538 : } while (j < pattern_length);
539 12600046 : if (j == pattern_length) {
540 : return i;
541 : }
542 11995764 : badness += j;
543 : } else {
544 11431 : search->PopulateBoyerMooreHorspoolTable();
545 11431 : search->strategy_ = &BoyerMooreHorspoolSearch;
546 11431 : return BoyerMooreHorspoolSearch(search, subject, i);
547 : }
548 : }
549 : return -1;
550 : }
551 :
552 :
553 : // Perform a a single stand-alone search.
554 : // If searching multiple times for the same pattern, a search
555 : // object should be constructed once and the Search function then called
556 : // for each search.
557 : template <typename SubjectChar, typename PatternChar>
558 2705703 : int SearchString(Isolate* isolate,
559 : Vector<const SubjectChar> subject,
560 : Vector<const PatternChar> pattern,
561 : int start_index) {
562 597559 : StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
563 2705703 : return search.Search(subject, start_index);
564 : }
565 :
566 : // A wrapper function around SearchString that wraps raw pointers to the subject
567 : // and pattern as vectors before calling SearchString. Used from the
568 : // StringIndexOf builtin.
569 : template <typename SubjectChar, typename PatternChar>
570 1468010 : intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
571 : int subject_length, const PatternChar* pattern_ptr,
572 : int pattern_length, int start_index) {
573 : DisallowHeapAllocation no_gc;
574 1468010 : Vector<const SubjectChar> subject(subject_ptr, subject_length);
575 1468010 : Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
576 1468010 : return SearchString(isolate, subject, pattern, start_index);
577 : }
578 :
579 : } // namespace internal
580 : } // namespace v8
581 :
582 : #endif // V8_STRING_SEARCH_H_
|