v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
string-search.h
Go to the documentation of this file.
1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_STRINGS_STRING_SEARCH_H_
6#define V8_STRINGS_STRING_SEARCH_H_
7
8#include "src/base/strings.h"
9#include "src/base/vector.h"
11#include "src/objects/string.h"
12
13namespace v8 {
14namespace internal {
15
16//---------------------------------------------------------------------
17// String Search object.
18//---------------------------------------------------------------------
19
20// Class holding constants and methods that apply to all string search variants,
21// independently of subject and pattern char size.
23 protected:
24 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
25 // limit, we can fix the size of tables. For a needle longer than this limit,
26 // search will not be optimal, since we only build tables for a suffix
27 // of the string, but it is a safe approximation.
29
30 // Reduce alphabet to this size.
31 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
32 // proportional to the input alphabet. We reduce the alphabet size by
33 // equating input characters modulo a smaller alphabet size. This gives
34 // a potentially less efficient searching, but is a safe approximation.
35 // For needles using only characters in the same Unicode 256-code point page,
36 // there is no search speed degradation.
37 static const int kLatin1AlphabetSize = 256;
39
40 // Bad-char shift table stored in the state. It's length is the alphabet size.
41 // For patterns below this length, the skip length of Boyer-Moore is too short
42 // to compensate for the algorithmic overhead compared to simple brute force.
43 static const int kBMMinPatternLength = 7;
44
45 static inline bool IsOneByteString(base::Vector<const uint8_t> string) {
46 return true;
47 }
48
50 return String::IsOneByte(string.begin(), string.length());
51 }
52
53 friend class Isolate;
54};
55
56template <typename PatternChar, typename SubjectChar>
58 public:
60 : isolate_(isolate),
62 start_(std::max(0, pattern.length() - kBMMaxShift)) {
63 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
66 return;
67 }
68 }
69 int pattern_length = pattern_.length();
70 if (pattern_length < kBMMinPatternLength) {
71 if (pattern_length == 1) {
73 return;
74 }
76 return;
77 }
79 }
80
81 int Search(base::Vector<const SubjectChar> subject, int index) {
82 return strategy_(this, subject, index);
83 }
84
85 static inline int AlphabetSize() {
86 if (sizeof(PatternChar) == 1) {
87 // Latin1 needle.
89 } else {
90 DCHECK_EQ(sizeof(PatternChar), 2);
91 // UC16 needle.
92 return kUC16AlphabetSize;
93 }
94 }
95
96 private:
99
104
107 int start_index);
108
111 int start_index);
112
115 int start_index);
116
117 static int BoyerMooreHorspoolSearch(
119 base::Vector<const SubjectChar> subject, int start_index);
120
123 int start_index);
124
126
128
129 static inline bool exceedsOneByte(uint8_t c) { return false; }
130
131 static inline bool exceedsOneByte(uint16_t c) {
133 }
134
135 static inline int CharOccurrence(int* bad_char_occurrence,
136 SubjectChar char_code) {
137 if (sizeof(SubjectChar) == 1) {
138 return bad_char_occurrence[static_cast<int>(char_code)];
139 }
140 if (sizeof(PatternChar) == 1) {
141 if (exceedsOneByte(char_code)) {
142 return -1;
143 }
144 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
145 }
146 // Both pattern and subject are UC16. Reduce character to equivalence
147 // class.
148 int equiv_class = char_code % kUC16AlphabetSize;
149 return bad_char_occurrence[equiv_class];
150 }
151
152 // The following tables are shared by all searches.
153 // TODO(lrn): Introduce a way for a pattern to keep its tables
154 // between searches (e.g., for an Atom RegExp).
155
156 // Store for the BoyerMoore(Horspool) bad char shift table.
157 // Return a table covering the last kBMMaxShift+1 positions of
158 // pattern.
159 int* bad_char_table() { return isolate_->bad_char_shift_table(); }
160
161 // Store for the BoyerMoore good suffix shift table.
163 // Return biased pointer that maps the range [start_..pattern_.length()
164 // to the kGoodSuffixShiftTable array.
165 return isolate_->good_suffix_shift_table() - start_;
166 }
167
168 // Table used temporarily while building the BoyerMoore good suffix
169 // shift table.
171 // Return biased pointer that maps the range [start_..pattern_.length()
172 // to the kSuffixTable array.
173 return isolate_->suffix_table() - start_;
174 }
175
177 // The pattern to search for.
179 // Pointer to implementation of the search.
181 // Cache value of max(0, pattern_length() - kBMMaxShift)
183};
184
185template <typename T, typename U>
186inline T AlignDown(T value, U alignment) {
187 return reinterpret_cast<T>(
188 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
189}
190
192 return std::max(static_cast<uint8_t>(character & 0xFF),
193 static_cast<uint8_t>(character >> 8));
194}
195
196inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
197
198template <typename PatternChar, typename SubjectChar>
201 int index) {
202 const PatternChar pattern_first_char = pattern[0];
203 const int max_n = (subject.length() - pattern.length() + 1);
204
205 if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) {
206 // Special-case looking for the 0 char in other than one-byte strings.
207 // memchr mostly fails in this case due to every other byte being 0 in text
208 // that is mostly ascii characters.
209 for (int i = index; i < max_n; ++i) {
210 if (subject[i] == 0) return i;
211 }
212 return -1;
213 }
214 const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
215 const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
216 int pos = index;
217 do {
218 DCHECK_GE(max_n - pos, 0);
219 const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
220 memchr(subject.begin() + pos, search_byte,
221 (max_n - pos) * sizeof(SubjectChar)));
222 if (char_pos == nullptr) return -1;
223 char_pos = AlignDown(char_pos, sizeof(SubjectChar));
224 pos = static_cast<int>(char_pos - subject.begin());
225 if (subject[pos] == search_char) return pos;
226 } while (++pos < max_n);
227
228 return -1;
229}
230
231//---------------------------------------------------------------------
232// Single Character Pattern Search Strategy
233//---------------------------------------------------------------------
234
235template <typename PatternChar, typename SubjectChar>
238 base::Vector<const SubjectChar> subject, int index) {
239 DCHECK_EQ(1, search->pattern_.length());
240 PatternChar pattern_first_char = search->pattern_[0];
241 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
242 if (exceedsOneByte(pattern_first_char)) {
243 return -1;
244 }
245 }
246 return FindFirstCharacter(search->pattern_, subject, index);
247}
248
249//---------------------------------------------------------------------
250// Linear Search Strategy
251//---------------------------------------------------------------------
252
253template <typename PatternChar, typename SubjectChar>
254inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
255 int length) {
256 DCHECK_GT(length, 0);
257 int pos = 0;
258 do {
259 if (pattern[pos] != subject[pos]) {
260 return false;
261 }
262 pos++;
263 } while (pos < length);
264 return true;
265}
266
267// Simple linear search for short patterns. Never bails out.
268template <typename PatternChar, typename SubjectChar>
271 base::Vector<const SubjectChar> subject, int index) {
273 DCHECK_GT(pattern.length(), 1);
274 int pattern_length = pattern.length();
275 int i = index;
276 int n = subject.length() - pattern_length;
277 while (i <= n) {
278 i = FindFirstCharacter(pattern, subject, i);
279 if (i == -1) return -1;
280 DCHECK_LE(i, n);
281 i++;
282 // Loop extracted to separate function to allow using return to do
283 // a deeper break.
284 if (CharCompare(pattern.begin() + 1, subject.begin() + i,
285 pattern_length - 1)) {
286 return i - 1;
287 }
288 }
289 return -1;
290}
291
292//---------------------------------------------------------------------
293// Boyer-Moore string search
294//---------------------------------------------------------------------
295
296template <typename PatternChar, typename SubjectChar>
299 base::Vector<const SubjectChar> subject, int start_index) {
301 int subject_length = subject.length();
302 int pattern_length = pattern.length();
303 // Only preprocess at most kBMMaxShift last characters of pattern.
304 int start = search->start_;
305
306 int* bad_char_occurence = search->bad_char_table();
307 int* good_suffix_shift = search->good_suffix_shift_table();
308
309 PatternChar last_char = pattern[pattern_length - 1];
310 int index = start_index;
311 // Continue search from i.
312 while (index <= subject_length - pattern_length) {
313 int j = pattern_length - 1;
314 int c;
315 while (last_char != (c = subject[index + j])) {
316 int shift = j - CharOccurrence(bad_char_occurence, c);
317 index += shift;
318 if (index > subject_length - pattern_length) {
319 return -1;
320 }
321 }
322 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
323 if (j < 0) {
324 return index;
325 } else if (j < start) {
326 // we have matched more than our tables allow us to be smart about.
327 // Fall back on BMH shift.
328 index += pattern_length - 1 -
329 CharOccurrence(bad_char_occurence,
330 static_cast<SubjectChar>(last_char));
331 } else {
332 int gs_shift = good_suffix_shift[j + 1];
333 int bc_occ = CharOccurrence(bad_char_occurence, c);
334 int shift = j - bc_occ;
335 if (gs_shift > shift) {
336 shift = gs_shift;
337 }
338 index += shift;
339 }
340 }
341
342 return -1;
343}
344
345template <typename PatternChar, typename SubjectChar>
347 int pattern_length = pattern_.length();
348 const PatternChar* pattern = pattern_.begin();
349 // Only look at the last kBMMaxShift characters of pattern (from start_
350 // to pattern_length).
351 int start = start_;
352 int length = pattern_length - start;
353
354 // Biased tables so that we can use pattern indices as table indices,
355 // even if we only cover the part of the pattern from offset start.
356 int* shift_table = good_suffix_shift_table();
357 int* suffix_table = this->suffix_table();
358
359 // Initialize table.
360 for (int i = start; i < pattern_length; i++) {
361 shift_table[i] = length;
362 }
363 shift_table[pattern_length] = 1;
364 suffix_table[pattern_length] = pattern_length + 1;
365
366 if (pattern_length <= start) {
367 return;
368 }
369
370 // Find suffixes.
371 PatternChar last_char = pattern[pattern_length - 1];
372 int suffix = pattern_length + 1;
373 {
374 int i = pattern_length;
375 while (i > start) {
376 PatternChar c = pattern[i - 1];
377 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
378 if (shift_table[suffix] == length) {
379 shift_table[suffix] = suffix - i;
380 }
381 suffix = suffix_table[suffix];
382 }
383 suffix_table[--i] = --suffix;
384 if (suffix == pattern_length) {
385 // No suffix to extend, so we check against last_char only.
386 while ((i > start) && (pattern[i - 1] != last_char)) {
387 if (shift_table[pattern_length] == length) {
388 shift_table[pattern_length] = pattern_length - i;
389 }
390 suffix_table[--i] = pattern_length;
391 }
392 if (i > start) {
393 suffix_table[--i] = --suffix;
394 }
395 }
396 }
397 }
398 // Build shift table using suffixes.
399 if (suffix < pattern_length) {
400 for (int i = start; i <= pattern_length; i++) {
401 if (shift_table[i] == length) {
402 shift_table[i] = suffix - start;
403 }
404 if (i == suffix) {
405 suffix = suffix_table[suffix];
406 }
407 }
408 }
409}
410
411//---------------------------------------------------------------------
412// Boyer-Moore-Horspool string search.
413//---------------------------------------------------------------------
414
415template <typename PatternChar, typename SubjectChar>
418 base::Vector<const SubjectChar> subject, int start_index) {
420 int subject_length = subject.length();
421 int pattern_length = pattern.length();
422 int* char_occurrences = search->bad_char_table();
423 int badness = -pattern_length;
424
425 // How bad we are doing without a good-suffix table.
426 PatternChar last_char = pattern[pattern_length - 1];
427 int last_char_shift =
428 pattern_length - 1 -
429 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
430 // Perform search
431 int index = start_index; // No matches found prior to this index.
432 while (index <= subject_length - pattern_length) {
433 int j = pattern_length - 1;
434 int subject_char;
435 while (last_char != (subject_char = subject[index + j])) {
436 int bc_occ = CharOccurrence(char_occurrences, subject_char);
437 int shift = j - bc_occ;
438 index += shift;
439 badness += 1 - shift; // at most zero, so badness cannot increase.
440 if (index > subject_length - pattern_length) {
441 return -1;
442 }
443 }
444 j--;
445 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
446 if (j < 0) {
447 return index;
448 } else {
449 index += last_char_shift;
450 // Badness increases by the number of characters we have
451 // checked, and decreases by the number of characters we
452 // can skip by shifting. It's a measure of how we are doing
453 // compared to reading each character exactly once.
454 badness += (pattern_length - j) - last_char_shift;
455 if (badness > 0) {
456 search->PopulateBoyerMooreTable();
457 search->strategy_ = &BoyerMooreSearch;
458 return BoyerMooreSearch(search, subject, index);
459 }
460 }
461 }
462 return -1;
463}
464
465template <typename PatternChar, typename SubjectChar>
467 int pattern_length = pattern_.length();
468
469 int* bad_char_occurrence = bad_char_table();
470
471 // Only preprocess at most kBMMaxShift last characters of pattern.
472 int start = start_;
473 // Run forwards to populate bad_char_table, so that *last* instance
474 // of character equivalence class is the one registered.
475 // Notice: Doesn't include the last character.
476 int table_size = AlphabetSize();
477 if (start == 0) { // All patterns less than kBMMaxShift in length.
478 memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
479 } else {
480 for (int i = 0; i < table_size; i++) {
481 bad_char_occurrence[i] = start - 1;
482 }
483 }
484 for (int i = start; i < pattern_length - 1; i++) {
485 PatternChar c = pattern_[i];
486 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
487 bad_char_occurrence[bucket] = i;
488 }
489}
490
491//---------------------------------------------------------------------
492// Linear string search with bailout to BMH.
493//---------------------------------------------------------------------
494
495// Simple linear search for short patterns, which bails out if the string
496// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
497template <typename PatternChar, typename SubjectChar>
500 base::Vector<const SubjectChar> subject, int index) {
502 int pattern_length = pattern.length();
503 // Badness is a count of how much work we have done. When we have
504 // done enough work we decide it's probably worth switching to a better
505 // algorithm.
506 int badness = -10 - (pattern_length << 2);
507
508 // We know our pattern is at least 2 characters, we cache the first so
509 // the common case of the first character not matching is faster.
510 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
511 badness++;
512 if (badness <= 0) {
513 i = FindFirstCharacter(pattern, subject, i);
514 if (i == -1) return -1;
515 DCHECK_LE(i, n);
516 int j = 1;
517 do {
518 if (pattern[j] != subject[i + j]) {
519 break;
520 }
521 j++;
522 } while (j < pattern_length);
523 if (j == pattern_length) {
524 return i;
525 }
526 badness += j;
527 } else {
529 search->strategy_ = &BoyerMooreHorspoolSearch;
530 return BoyerMooreHorspoolSearch(search, subject, i);
531 }
532 }
533 return -1;
534}
535
536// Perform a a single stand-alone search.
537// If searching multiple times for the same pattern, a search
538// object should be constructed once and the Search function then called
539// for each search.
540template <typename SubjectChar, typename PatternChar>
542 base::Vector<const PatternChar> pattern, int start_index) {
544 return search.Search(subject, start_index);
545}
546
547// A wrapper function around SearchString that wraps raw pointers to the subject
548// and pattern as vectors before calling SearchString. Used from the
549// StringIndexOf builtin.
550template <typename SubjectChar, typename PatternChar>
551intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
552 int subject_length, const PatternChar* pattern_ptr,
553 int pattern_length, int start_index) {
555 base::Vector<const SubjectChar> subject(subject_ptr, subject_length);
556 base::Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
557 return SearchString(isolate, subject, pattern, start_index);
558}
559
560} // namespace internal
561} // namespace v8
562
563#endif // V8_STRINGS_STRING_SEARCH_H_
SourcePosition pos
bool IsOneByte() const
Definition api.cc:5620
int length() const
Definition vector.h:64
constexpr T * begin() const
Definition vector.h:96
static const int kUC16AlphabetSize
Definition isolate.h:1137
static const int kBMMaxShift
Definition isolate.h:1138
static const int kLatin1AlphabetSize
static bool IsOneByteString(base::Vector< const uint8_t > string)
static const int kUC16AlphabetSize
static const int kBMMinPatternLength
static bool IsOneByteString(base::Vector< const base::uc16 > string)
static int BoyerMooreSearch(StringSearch< PatternChar, SubjectChar > *search, base::Vector< const SubjectChar > subject, int start_index)
int(*)(StringSearch< PatternChar, SubjectChar > *, base::Vector< const SubjectChar >, int) SearchFunction
static int LinearSearch(StringSearch< PatternChar, SubjectChar > *search, base::Vector< const SubjectChar > subject, int start_index)
static int InitialSearch(StringSearch< PatternChar, SubjectChar > *search, base::Vector< const SubjectChar > subject, int start_index)
static bool exceedsOneByte(uint16_t c)
static int SingleCharSearch(StringSearch< PatternChar, SubjectChar > *search, base::Vector< const SubjectChar > subject, int start_index)
base::Vector< const PatternChar > pattern_
int Search(base::Vector< const SubjectChar > subject, int index)
static bool exceedsOneByte(uint8_t c)
StringSearch(Isolate *isolate, base::Vector< const PatternChar > pattern)
static int FailSearch(StringSearch< PatternChar, SubjectChar > *, base::Vector< const SubjectChar >, int)
static int BoyerMooreHorspoolSearch(StringSearch< PatternChar, SubjectChar > *search, base::Vector< const SubjectChar > subject, int start_index)
static int CharOccurrence(int *bad_char_occurrence, SubjectChar char_code)
static const uint32_t kMaxOneByteCharCodeU
Definition string.h:501
uint8_t *const start_
Definition assembler.cc:131
int start
std::string pattern
STL namespace.
uint16_t uc16
Definition strings.h:18
bool CharCompare(const PatternChar *pattern, const SubjectChar *subject, int length)
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
constexpr int U
int FindFirstCharacter(base::Vector< const PatternChar > pattern, base::Vector< const SubjectChar > subject, int index)
T AlignDown(T value, U alignment)
int SearchString(Isolate *isolate, base::Vector< const SubjectChar > subject, base::Vector< const PatternChar > pattern, int start_index)
uint8_t GetHighestValueByte(base::uc16 character)
intptr_t SearchStringRaw(Isolate *isolate, const SubjectChar *subject_ptr, int subject_length, const PatternChar *pattern_ptr, int pattern_length, int start_index)
return value
Definition map-inl.h:893
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK_GE(v1, v2)
Definition logging.h:488
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define DCHECK_GT(v1, v2)
Definition logging.h:487