v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
builtins-regexp.cc
Go to the documentation of this file.
1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
10#include "src/regexp/regexp.h"
12
13namespace v8 {
14namespace internal {
15
16// -----------------------------------------------------------------------------
17// ES6 section 21.2 RegExp Objects
18
// Builtin behind "RegExp.prototype.toString". The visible code builds the
// result as '/' + ToString(recv.source) + '/' + ToString(recv.flags) using an
// IncrementalStringBuilder; a failure in either property read or either
// string conversion is propagated to the caller.
// NOTE(review): this listing is an extracted Doxygen page. Listing lines
// 31-32 and 44-45 are absent from it — presumably the declarations of
// `source` / `flags` and the opening of the assignment macro. Confirm against
// the real file before building.
19BUILTIN(RegExpPrototypeToString) {
20 HandleScope scope(isolate);
21 CHECK_RECEIVER(JSReceiver, recv, "RegExp.prototype.toString");
22
// Usage counter: only bumped when the receiver is the RegExp function's
// prototype object itself.
23 if (*recv == isolate->regexp_function()->prototype()) {
24 isolate->CountUsage(v8::Isolate::kRegExpPrototypeToString);
25 }
26
27 IncrementalStringBuilder builder(isolate);
28
29 builder.AppendCharacter('/');
// Read the receiver's "source" property and append its string conversion.
30 {
33 isolate, source,
34 JSReceiver::GetProperty(isolate, recv,
35 isolate->factory()->source_string()));
36 DirectHandle<String> source_str;
37 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, source_str,
38 Object::ToString(isolate, source));
39 builder.AppendString(source_str);
40 }
41
42 builder.AppendCharacter('/');
// Read the receiver's "flags" property and append its string conversion.
43 {
46 isolate, flags,
47 JSReceiver::GetProperty(isolate, recv,
48 isolate->factory()->flags_string()));
49 DirectHandle<String> flags_str;
50 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, flags_str,
51 Object::ToString(isolate, flags));
52 builder.AppendString(flags_str);
53 }
54
55 RETURN_RESULT_OR_FAILURE(isolate, builder.Finish());
56}
57
// The properties $1..$9 are the first nine capturing substrings of the last
// successful match, or ''.
//
// DEFINE_CAPTURE_GETTER(i) defines the builtin RegExpCapture<i>Getter, which
// returns capture |i| of the isolate's last-match info via
// RegExpUtils::GenericCaptureGetter. It is meant to be expanded once for each
// index from 1 to 9.
#define DEFINE_CAPTURE_GETTER(i)                        \
  BUILTIN(RegExpCapture##i##Getter) {                   \
    HandleScope scope(isolate);                         \
    return *RegExpUtils::GenericCaptureGetter(          \
        isolate, isolate->regexp_last_match_info(), i); \
  }
76#undef DEFINE_CAPTURE_GETTER
77
78// The properties `input` and `$_` are aliases for each other. When this
79// value is set, the value it is set to is coerced to a string.
80// Getter and setter for the input.
81
82BUILTIN(RegExpInputGetter) {
83 HandleScope scope(isolate);
84 DirectHandle<Object> obj(isolate->regexp_last_match_info()->last_input(),
85 isolate);
86 return IsUndefined(*obj, isolate) ? ReadOnlyRoots(isolate).empty_string()
87 : Cast<String>(*obj);
88}
89
// Setter counterpart of RegExpInputGetter: converts the first argument (or
// undefined when absent) with Object::ToString, stores the result as the
// last input in the isolate's last-match info, and returns undefined.
// NOTE(review): listing lines 93-94 are absent from this extracted page —
// presumably the declaration of `str` and the opening of the
// assign-or-return-failure macro that wraps the ToString call. Confirm
// against the real file.
90BUILTIN(RegExpInputSetter) {
91 HandleScope scope(isolate);
92 Handle<Object> value = args.atOrUndefined(isolate, 1);
95 Object::ToString(isolate, value));
96 isolate->regexp_last_match_info()->set_last_input(*str);
97 return ReadOnlyRoots(isolate).undefined_value();
98}
99
100// Getters for the static properties lastMatch, lastParen, leftContext, and
101// rightContext of the RegExp constructor. The properties are computed based
102// on the captures array of the last successful match and the subject string
103// of the last successful match.
// Getter for lastMatch: the visible arguments select capture index 0 of the
// isolate's last-match info.
// NOTE(review): listing line 106 is absent from this extracted page —
// presumably `return *RegExpUtils::GenericCaptureGetter(`, matching the shape
// used by DEFINE_CAPTURE_GETTER. Confirm against the real file.
104BUILTIN(RegExpLastMatchGetter) {
105 HandleScope scope(isolate);
107 isolate, isolate->regexp_last_match_info(), 0);
108}
109
110BUILTIN(RegExpLastParenGetter) {
111 HandleScope scope(isolate);
112 DirectHandle<RegExpMatchInfo> match_info = isolate->regexp_last_match_info();
113 const int length = match_info->number_of_capture_registers();
114 if (length <= 2) {
115 return ReadOnlyRoots(isolate).empty_string(); // No captures.
116 }
117
118 DCHECK_EQ(0, length % 2);
119 const int last_capture = (length / 2) - 1;
120
121 // We match the SpiderMonkey behavior: return the substring defined by the
122 // last pair (after the first pair) of elements of the capture array even if
123 // it is empty.
124 return *RegExpUtils::GenericCaptureGetter(isolate, match_info, last_capture);
125}
126
127BUILTIN(RegExpLeftContextGetter) {
128 HandleScope scope(isolate);
129 DirectHandle<RegExpMatchInfo> match_info = isolate->regexp_last_match_info();
130 const int start_index = match_info->capture(0);
131 Handle<String> last_subject(match_info->last_subject(), isolate);
132 return *isolate->factory()->NewSubString(last_subject, 0, start_index);
133}
134
135BUILTIN(RegExpRightContextGetter) {
136 HandleScope scope(isolate);
137 DirectHandle<RegExpMatchInfo> match_info = isolate->regexp_last_match_info();
138 const int start_index = match_info->capture(1);
139 Handle<String> last_subject(match_info->last_subject(), isolate);
140 const int len = last_subject->length();
141 return *isolate->factory()->NewSubString(last_subject, start_index, len);
142}
143
144namespace {
145
// Sentinel results of GetAsciiEscape. Any other result is the literal
// character to emit after a backslash.
constexpr uint8_t kNoEscape = 0;
constexpr uint8_t kEscapeToHex = std::numeric_limits<uint8_t>::max();

// The hex sentinel must lie outside the ASCII range so that it can never be
// confused with a literal escape character returned below.
static_assert(kEscapeToHex > 0x7F,
              "kEscapeToHex must not collide with an ASCII escape character");

// Classifies how the ASCII character |c| has to be escaped:
//   - kNoEscape:    emit |c| unchanged.
//   - kEscapeToHex: emit |c| as a hexadecimal escape.
//   - otherwise:    emit a backslash followed by the returned character.
constexpr uint8_t GetAsciiEscape(char c) {
  switch (c) {
    // SyntaxCharacter :: one of
    //   ^ $ \ . * + ? ( ) [ ] { } |
    //
    // SyntaxCharacter and U+002F (SOLIDUS) are escaped as-is.
    case '^':
    case '$':
    case '\\':
    case '.':
    case '*':
    case '+':
    case '?':
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
    case '|':
    case '/':
      return c;

    // ControlEscape :: one of
    //   f n r t v
    case '\f':
      return 'f';
    case '\n':
      return 'n';
    case '\r':
      return 'r';
    case '\t':
      return 't';
    case '\v':
      return 'v';

    // One of ",-=<>#&!%:;@~'`", the code unit 0x0022 (QUOTATION MARK), and
    // ASCII whitespace are escaped to hex. The space character is the only
    // ASCII whitespace left here: the others are ControlEscapes above.
    case ',':
    case '-':
    case '=':
    case '<':
    case '>':
    case '#':
    case '&':
    case '!':
    case '%':
    case ':':
    case ';':
    case '@':
    case '~':
    case '\'':
    case '`':
    case '"':
    case ' ':
      return kEscapeToHex;

    default:
      return kNoEscape;
  }
}
209
// Compile-time table with one entry per ASCII code (128 entries); each entry
// is produced by GetAsciiEscape through the local GET_ASCII_ESCAPE macro.
// NOTE(review): listing line 212 is absent from this extracted page —
// presumably the list-macro invocation (INT_0_TO_127_LIST(GET_ASCII_ESCAPE))
// that fills the initializer. Confirm against the real file.
210constexpr const uint8_t kAsciiEscapes[128]{
211#define GET_ASCII_ESCAPE(c) GetAsciiEscape(c),
213#undef GET_ASCII_ESCAPE
214};
215
// Builds the escaped form of |source| for RegExpEscape, one code unit at a
// time, into an IncrementalStringBuilder and returns the finished string (or
// an empty MaybeDirectHandle if Finish() fails).
//
// CharT is the code unit type of |source|; when it is two bytes wide the
// builder is switched to two-byte encoding up front.
//
// Precondition: |source| must be non-empty, since source[0] is read
// unconditionally. RegExpEscape returns early for empty strings before
// calling this.
//
// NOTE(review): listing lines 295, 313 and 324-325 are absent from this
// extracted page — presumably the lead-surrogate test, the non-ASCII
// whitespace/line-terminator test, and two comment lines. Confirm against the
// real file before building.
216template <typename CharT>
217MaybeDirectHandle<String> RegExpEscapeImpl(Isolate* isolate,
218 base::OwnedVector<CharT> source) {
// Scratch buffer reused by every DoubleToRadixStringView call below.
219 char double_to_radix_chars[kDoubleToRadixMaxChars];
220 base::Vector<char> double_to_radix_buffer =
221 base::ArrayVector(double_to_radix_chars);
222
223 // 2. Let escaped be the empty String.
224 IncrementalStringBuilder escaped_builder(isolate);
225 if constexpr (sizeof(CharT) == 2) {
226 escaped_builder.ChangeEncoding();
227 }
228
229 // 3. Let cpList be StringToCodePoints(S).
230 // 4. For each code point c of cpList, do
231 // (Done below.)
232
// |start| is the index of the first code unit the main loop still has to
// process: 1 if the leading character was already emitted as a hex escape,
// 0 otherwise.
233 size_t start;
234 std::remove_const_t<CharT> first_c = source[0];
235 if (IsAlphaNumeric(first_c)) {
236 // a. If escaped is the empty String and c is matched by either
237 // DecimalDigit or AsciiLetter, then
238 // i. NOTE: Escaping a leading digit ensures that output corresponds
239 // with pattern text which may be used after a \0 character escape or
240 // a DecimalEscape such as \1 and still match S rather than be
241 // interpreted as an extension of the preceding escape sequence.
242 // Escaping a leading ASCII letter does the same for the context after
243 // \c.
244 // ii. Let numericValue be the numeric value of c.
245 // iii. Let hex be Number::toString(𝔽(numericValue), 16).
246 // iv. Assert: The length of hex is 2.
247 // v. Set escaped to the string-concatenation of the code unit 0x005C
248 // (REVERSE SOLIDUS), "x", and hex.
249 start = 1;
250 escaped_builder.AppendCStringLiteral("\\x");
251 std::string_view hex =
252 DoubleToRadixStringView(first_c, 16, double_to_radix_buffer);
253 escaped_builder.AppendString(hex);
254 } else {
255 start = 0;
256 }
257
258 // EncodeForRegExpEscape
259 //
260 // 1. If c is matched by SyntaxCharacter or c is U+002F (SOLIDUS), then a.
261 // Return the string-concatenation of 0x005C (REVERSE SOLIDUS) and
262 // UTF16EncodeCodePoint(c).
263 // 2. Else if c is the code point listed in some cell of the “Code Point”
264 // column of Table 63, then a. Return the string-concatenation of 0x005C
265 // (REVERSE SOLIDUS) and the string in the “ControlEscape” column of the
266 // row whose “Code Point” column contains c.
267 // 3. Let otherPunctuators be the string-concatenation of ",-=<>#&!%:;@~'`"
268 // and the code unit 0x0022 (QUOTATION MARK).
269 // 4. Let toEscape be StringToCodePoints(otherPunctuators).
270 // 5. If toEscape contains c, c is matched by either WhiteSpace or
271 // LineTerminator, or c has the same numeric value as a leading surrogate
272 // or trailing surrogate, then a. Let cNum be the numeric value of c. b. If
273 // cNum ≤ 0xFF, then i. Let hex be Number::toString(𝔽(cNum), 16). ii.
274 // Return the string-concatenation of the code unit 0x005C (REVERSE
275 // SOLIDUS), "x", and StringPad(hex, 2, "0", start). c. Let escaped be the
276 // empty String. d. Let codeUnits be UTF16EncodeCodePoint(c). e. For each
277 // code unit cu of codeUnits, do i. Set escaped to the string-concatenation
278 // of escaped and UnicodeEscape(cu). f. Return escaped.
279 // 6. Return UTF16EncodeCodePoint(c).
280 //
281 // Steps 1-2 above are done by table lookup in kAsciiEscapes. For step 3,
282 // matching otherPunctuators, quotation mark, and ASCII whitespace is done by
283 // table lookup in kAsciiEscapes. Non-ASCII whitespace and line terminators in
284 // step 5 are matched manually below.
285
286 for (size_t i = start; i < source.size(); i++) {
// |cu| is the current code unit, |cp| the code point it belongs to (updated
// below when a surrogate pair is combined), and |cmd| the escape action.
287 CharT cu = source[i];
288 base::uc32 cp = cu;
289 uint8_t cmd = kNoEscape;
290
291 if (IsAscii(cu)) {
292 cmd = kAsciiEscapes[cu];
293 } else {
294 if constexpr (sizeof(CharT) == 2) {
296 if (i + 1 < source.size() &&
297 unibrow::Utf16::IsTrailSurrogate(source[i + 1])) {
298 // Surrogate pair. Combine them.
299 cp = unibrow::Utf16::CombineSurrogatePair(cu, source[i + 1]);
300 i++;
301 } else {
302 // Lone lead surrogate.
303 cmd = kEscapeToHex;
304 }
305 } else if (unibrow::Utf16::IsTrailSurrogate(cu)) {
306 // Lone trailing surrogate.
307 cmd = kEscapeToHex;
308 }
309 }
310
311 // ASCII whitespace and line terminators are hardcoded in the
312 // kAsciiEscapes table.
314 cmd = kEscapeToHex;
315 }
316 }
317
318 if (cmd == kNoEscape) {
319 // Code point does not need to be escaped.
320 if (cp == cu) {
321 escaped_builder.Append<CharT, CharT>(cp);
322 } else {
// cp != cu only after a surrogate pair was combined above; |i| was advanced
// there, so source[i] is now the trail code unit of that pair.
323 DCHECK_LT(i, source.size());
326 escaped_builder.Append<CharT, CharT>(cu);
327 escaped_builder.Append<CharT, CharT>(source[i]);
328 }
329 } else if (cmd == kEscapeToHex) {
330 // An escape to hex. Output \x or \u depending on how many code units.
// NOTE(review): the hex digits are appended without zero-padding;
// presumably every value reaching this branch already formats to exactly
// 2 (for \x) or 4 (for \u) hex digits — confirm.
331 escaped_builder.AppendCStringLiteral(cp <= 0xFF ? "\\x" : "\\u");
332 std::string_view hex =
333 DoubleToRadixStringView(cp, 16, double_to_radix_buffer);
334 escaped_builder.AppendString(hex);
335 } else {
336 // A manual, non-hex escape. See table in kAsciiEscapes.
337 escaped_builder.AppendCharacter('\\');
338 escaped_builder.AppendCharacter(cmd);
339 }
340 }
341
342 return escaped_builder.Finish();
343}
344} // namespace
345
// Builtin RegExpEscape. Takes the first argument (or undefined when absent),
// counts the usage, rejects non-string arguments with a TypeError
// (kArgumentIsNonString), returns the empty string for empty input, and
// otherwise flattens the string, copies its characters and passes the copy to
// RegExpEscapeImpl, instantiated for one-byte or two-byte characters
// according to the string's representation.
// NOTE(review): listing lines 354, 368, 370, 373, 376, 378 and 381 are absent
// from this extracted page — presumably the throw macro, the declarations of
// `copy` and `no_gc`, and the openings of the assign-or-return-failure
// macros. Confirm against the real file before building.
346BUILTIN(RegExpEscape) {
347 HandleScope scope(isolate);
348 Handle<Object> value = args.atOrUndefined(isolate, 1);
349
350 isolate->CountUsage(v8::Isolate::kRegExpEscape);
351
352 // 1. If S is not a String, throw a TypeError exception.
353 if (!IsString(*value)) {
355 isolate, NewTypeError(MessageTemplate::kArgumentIsNonString,
356 isolate->factory()->input_string()));
357 }
358 Handle<String> str = Cast<String>(value);
359
// Early return for empty input; RegExpEscapeImpl reads source[0]
// unconditionally and therefore requires a non-empty string.
360 if (str->length() == 0) return ReadOnlyRoots(isolate).empty_string();
361
362 DirectHandle<String> escaped;
363
364 // A copy of the input characters is needed because RegExpEscapeImpl builds up
365 // the escaped string using IncrementalStringBuilder, which may allocate.
366 str = String::Flatten(isolate, str);
367 if (str->IsOneByteRepresentation()) {
// The inner braces limit the scope in which the flat content is read; only
// the owned copy is used afterwards.
369 {
371 copy = base::OwnedCopyOf(str->GetFlatContent(no_gc).ToOneByteVector());
372 }
374 isolate, escaped, RegExpEscapeImpl(isolate, std::move(copy)));
375 } else {
377 {
379 copy = base::OwnedCopyOf(str->GetFlatContent(no_gc).ToUC16Vector());
380 }
382 isolate, escaped, RegExpEscapeImpl(isolate, std::move(copy)));
383 }
384
385 return *escaped;
386}
387
388} // namespace internal
389} // namespace v8
#define DEFINE_CAPTURE_GETTER(i)
#define GET_ASCII_ESCAPE(c)
#define CHECK_RECEIVER(Type, name, method)
#define BUILTIN(name)
static bool IsSurrogatePair(int lead, int trail)
Definition unicode.h:103
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition unicode.h:113
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
@ kRegExpPrototypeToString
Definition v8-isolate.h:481
MaybeDirectHandle< String > Finish()
V8_INLINE void AppendCharacter(uint8_t c)
V8_INLINE void AppendString(std::string_view str)
static V8_WARN_UNUSED_RESULT MaybeHandle< Object > GetProperty(Isolate *isolate, DirectHandle< JSReceiver > receiver, const char *key)
static V8_WARN_UNUSED_RESULT HandleType< String >::MaybeType ToString(Isolate *isolate, HandleType< T > input)
static Handle< String > GenericCaptureGetter(Isolate *isolate, DirectHandle< RegExpMatchInfo > match_info, int capture, bool *ok=nullptr)
static V8_INLINE HandleType< String > Flatten(Isolate *isolate, HandleType< T > string, AllocationType allocation=AllocationType::kYoung)
int start
#define ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, dst, call)
Definition isolate.h:284
#define THROW_NEW_ERROR_RETURN_FAILURE(isolate, call)
Definition isolate.h:294
#define RETURN_RESULT_OR_FAILURE(isolate, call)
Definition isolate.h:264
base::Vector< const DirectHandle< Object > > args
Definition execution.cc:74
InstructionOperand source
constexpr Vector< T > ArrayVector(T(&arr)[N])
Definition vector.h:354
uint32_t uc32
Definition strings.h:19
OwnedVector< T > OwnedCopyOf(const T *data, size_t size)
Definition vector.h:383
constexpr bool IsAscii(base::uc32 c)
bool IsWhiteSpaceOrLineTerminator(base::uc32 c)
Flag flags[]
Definition flags.cc:3797
constexpr int kDoubleToRadixMaxChars
Definition conversions.h:86
constexpr bool IsAlphaNumeric(base::uc32 c)
constexpr Register cp
std::string_view DoubleToRadixStringView(double value, int radix, base::Vector< char > buffer)
Tagged< To > Cast(Tagged< From > value, const v8::SourceLocation &loc=INIT_SOURCE_LOCATION_IN_DEBUG)
Definition casting.h:150
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define INT_0_TO_127_LIST(V)
Definition utils.h:625