v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
builtins-string.cc
Go to the documentation of this file.
1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <optional>
6
9#include "src/heap/heap-inl.h" // For ToBoolean. TODO(jkummerow): Drop.
13#ifdef V8_INTL_SUPPORT
15#endif
16#include "src/base/strings.h"
21#include "src/strings/unicode.h"
22
23namespace v8 {
24namespace internal {
25
26namespace { // for String.fromCodePoint
27
28bool IsValidCodePoint(Isolate* isolate, DirectHandle<Object> value) {
29 if (!IsNumber(*value) && !Object::ToNumber(isolate, value).ToHandle(&value)) {
30 return false;
31 }
32
33 if (Object::IntegerValue(isolate, value).ToChecked() !=
34 Object::NumberValue(*value)) {
35 return false;
36 }
37
38 if (Object::NumberValue(*value) < 0 ||
39 Object::NumberValue(*value) > 0x10FFFF) {
40 return false;
41 }
42
43 return true;
44}
45
// Sentinel returned by NextCodePoint when it could not produce a code point.
// static_cast<base::uc32>(-1) lies above the 0x10FFFF upper bound accepted by
// IsValidCodePoint, so it cannot collide with a valid result.
46static constexpr base::uc32 kInvalidCodePoint = static_cast<base::uc32>(-1);
47
// Converts the argument stored at slot |1 + index| of |args| to a code point.
// Returns kInvalidCodePoint when the ToNumber conversion fails, or after
// throwing a RangeError for a value rejected by IsValidCodePoint; otherwise
// returns the number converted with DoubleToUint32.
48base::uc32 NextCodePoint(Isolate* isolate, BuiltinArguments args, int index) {
49 DirectHandle<Object> value = args.at(1 + index);
// NOTE(review): original line 50 is absent from this listing; presumably it
// opens the ASSIGN_RETURN_ON_EXCEPTION_VALUE( invocation whose arguments
// follow on the next line -- confirm against the real file.
51 isolate, value, Object::ToNumber(isolate, value), kInvalidCodePoint);
52 if (!IsValidCodePoint(isolate, value)) {
53 isolate->Throw(*isolate->factory()->NewRangeError(
54 MessageTemplate::kInvalidCodePoint, value));
55 return kInvalidCodePoint;
56 }
57 return DoubleToUint32(Object::NumberValue(*value));
58}
59
60} // namespace
61
62// ES6 section 21.1.2.2 String.fromCodePoint ( ...codePoints )
//
// Builds the result in two phases: code points are first collected into a
// one-byte buffer; on the first code point above String::kMaxOneByteCharCode
// the remaining ones are collected into a two-byte buffer, and both buffers
// are then copied into a freshly allocated two-byte string.
// Returns the exception sentinel as soon as NextCodePoint reports failure.
63BUILTIN(StringFromCodePoint) {
64 HandleScope scope(isolate);
// Number of code point arguments; slot 0 of |args| is not counted.
65 int const length = args.length() - 1;
66 if (length == 0) return ReadOnlyRoots(isolate).empty_string();
67 DCHECK_LT(0, length);
68
69 // Optimistically assume that the resulting String contains only one byte
70 // characters.
71 std::vector<uint8_t> one_byte_buffer;
72 one_byte_buffer.reserve(length);
73 base::uc32 code = 0;
// |index| is declared outside the loop because it is inspected afterwards to
// tell whether every argument fitted into one byte.
74 int index;
75 for (index = 0; index < length; index++) {
76 code = NextCodePoint(isolate, args, index);
77 if (code == kInvalidCodePoint) {
78 return ReadOnlyRoots(isolate).exception();
79 }
80 if (code > String::kMaxOneByteCharCode) {
81 break;
82 }
83 one_byte_buffer.push_back(code);
84 }
85
// All arguments were one-byte: return a one-byte string directly.
86 if (index == length) {
// NOTE(review): original line 87 is absent from this listing; presumably it
// opens a RETURN_RESULT_OR_FAILURE( invocation -- confirm.
88 isolate, isolate->factory()->NewStringFromOneByte(base::Vector<uint8_t>(
89 one_byte_buffer.data(), one_byte_buffer.size())));
90 }
91
92 std::vector<base::uc16> two_byte_buffer;
// Lower bound only: a code point taking the surrogate branch below appends
// two entries.
93 two_byte_buffer.reserve(length - index);
94
// On entry |code| still holds the code point that ended the one-byte loop, so
// it is appended before the next argument is fetched.
95 while (true) {
96 if (code <=
// NOTE(review): original line 97 is absent from this listing; presumably it
// holds the right-hand side of this comparison (the listing's cross-reference
// index mentions unibrow::Utf16::kMaxNonSurrogateCharCode) -- confirm.
98 two_byte_buffer.push_back(code);
99 } else {
100 two_byte_buffer.push_back(unibrow::Utf16::LeadSurrogate(code));
101 two_byte_buffer.push_back(unibrow::Utf16::TrailSurrogate(code));
102 }
103
104 if (++index == length) {
105 break;
106 }
107 code = NextCodePoint(isolate, args, index);
108 if (code == kInvalidCodePoint) {
109 return ReadOnlyRoots(isolate).exception();
110 }
111 }
112
// NOTE(review): original lines 113-114 are absent from this listing;
// presumably they declare |result| and open the macro invocation that assigns
// it -- confirm.
115 isolate, result,
116 isolate->factory()->NewRawTwoByteString(
117 static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size())));
118
// NOTE(review): original line 119 is absent from this listing; presumably it
// declares the |no_gc| scope used below -- confirm.
// Copy the one-byte prefix first, then the two-byte tail right after it.
120 CopyChars(result->GetChars(no_gc), one_byte_buffer.data(),
121 one_byte_buffer.size());
122 CopyChars(result->GetChars(no_gc) + one_byte_buffer.size(),
123 two_byte_buffer.data(), two_byte_buffer.size());
124
125 return *result;
126}
127
128// ES6 section 21.1.3.9
129// String.prototype.lastIndexOf ( searchString [ , position ] )
130BUILTIN(StringPrototypeLastIndexOf) {
131 HandleScope handle_scope(isolate);
132 return String::LastIndexOf(isolate, args.receiver(),
133 args.atOrUndefined(isolate, 1),
134 args.atOrUndefined(isolate, 2));
135}
136
137#ifndef V8_INTL_SUPPORT
138// ES6 section 21.1.3.10 String.prototype.localeCompare ( that )
139//
140// For now, we do not do anything locale specific.
141// If internationalization is enabled, then intl.js will override this function
142// and provide the proper functionality, so this is just a fallback.
//
// Returns a Smi whose sign gives the ordering: the difference of the first
// differing characters, or the difference of the lengths when one string is a
// prefix of the other.
143BUILTIN(StringPrototypeLocaleCompare) {
144 HandleScope handle_scope(isolate);
145
// NOTE(review): original line 146 is absent from this listing; the listing's
// cross-reference index mentions kStringLocaleCompare, so it is presumably a
// usage-counter call -- confirm.
147 static const char* const kMethod = "String.prototype.localeCompare";
148
149 DCHECK_LE(2, args.length());
150
151 TO_THIS_STRING(str1, kMethod);
// NOTE(review): original lines 152-153 are absent from this listing;
// presumably they declare |str2| and open the macro invocation that assigns
// it from the ToString conversion below -- confirm.
154 Object::ToString(isolate, args.at(1)));
155
156 if (str1.is_identical_to(str2)) return Smi::zero(); // Equal.
157 int str1_length = str1->length();
158 int str2_length = str2->length();
159
160 // Decide trivial cases without flattening.
161 if (str1_length == 0) {
162 if (str2_length == 0) return Smi::zero(); // Equal.
163 return Smi::FromInt(-str2_length);
164 } else {
165 if (str2_length == 0) return Smi::FromInt(str1_length);
166 }
167
// Only the common prefix length needs a character-by-character comparison.
168 int end = str1_length < str2_length ? str1_length : str2_length;
169
170 // No need to flatten if we are going to find the answer on the first
171 // character. At this point we know there is at least one character
172 // in each string, due to the trivial case handling above.
173 int d = str1->Get(0) - str2->Get(0);
174 if (d != 0) return Smi::FromInt(d);
175
176 str1 = String::Flatten(isolate, str1);
177 str2 = String::Flatten(isolate, str2);
178
// NOTE(review): original line 179 is absent from this listing; presumably it
// declares the |no_gc| scope used below -- confirm.
180 String::FlatContent flat1 = str1->GetFlatContent(no_gc);
181 String::FlatContent flat2 = str2->GetFlatContent(no_gc);
182
183 for (int i = 0; i < end; i++) {
184 if (flat1.Get(i) != flat2.Get(i)) {
185 return Smi::FromInt(flat1.Get(i) - flat2.Get(i));
186 }
187 }
188
// The common prefix is identical: the shorter string sorts first.
189 return Smi::FromInt(str1_length - str2_length);
190}
191
192// ES6 section 21.1.3.12 String.prototype.normalize ( [form] )
193//
194// Simply checks the argument is valid and returns the string itself.
195// If internationalization is enabled, then intl.js will override this function
196// and provide the proper functionality, so this is just a fallback.
197BUILTIN(StringPrototypeNormalize) {
198 HandleScope handle_scope(isolate);
199 TO_THIS_STRING(string, "String.prototype.normalize");
200
// An undefined form needs no validation: return the receiver string as is.
201 DirectHandle<Object> form_input = args.atOrUndefined(isolate, 1);
202 if (IsUndefined(*form_input, isolate)) return *string;
203
// NOTE(review): original lines 204-205 are absent from this listing;
// presumably they declare |form| and open the macro invocation that assigns
// it from the ToString conversion below -- confirm.
206 Object::ToString(isolate, form_input));
207
// Anything other than the four recognised form names is rejected with a
// RangeError that lists the valid forms.
208 if (!(String::Equals(isolate, form, isolate->factory()->NFC_string()) ||
209 String::Equals(isolate, form, isolate->factory()->NFD_string()) ||
210 String::Equals(isolate, form, isolate->factory()->NFKC_string()) ||
211 String::Equals(isolate, form, isolate->factory()->NFKD_string()))) {
212 DirectHandle<String> valid_forms =
213 isolate->factory()->NewStringFromStaticChars("NFC, NFD, NFKC, NFKD");
// NOTE(review): original line 214 is absent from this listing; presumably it
// opens a THROW_NEW_ERROR_RETURN_FAILURE( invocation -- confirm.
215 isolate,
216 NewRangeError(MessageTemplate::kNormalizationForm, valid_forms));
217 }
218
219 return *string;
220}
221#endif // !V8_INTL_SUPPORT
222
223
224#ifndef V8_INTL_SUPPORT
225namespace {
226
227inline bool ToUpperOverflows(base::uc32 character) {
228 // y with umlauts and the micro sign are the only characters that stop
229 // fitting into one-byte when converting to uppercase.
230 static const base::uc32 yuml_code = 0xFF;
231 static const base::uc32 micro_code = 0xB5;
232 return (character == yuml_code || character == micro_code);
233}
234
// Performs one case-conversion pass over |string|, writing the converted
// characters into |result| (capacity |result_length|).
//
// Return value protocol, as consumed by ConvertCase:
//  - |result| when at least one character changed;
//  - |string| when nothing changed;
//  - a Smi holding the exact length needed when |result| was sized like the
//    input but a character expanded. The Smi is negated when an uppercase
//    conversion overflows one byte and overflow is not being ignored.
// The caller also checks the return value for an exception (see the note at
// the string-length check below).
235template <class Converter>
236V8_WARN_UNUSED_RESULT static Tagged<Object> ConvertCaseHelper(
237 Isolate* isolate, Tagged<String> string, Tagged<SeqString> result,
238 uint32_t result_length, unibrow::Mapping<Converter, 128>* mapping) {
// NOTE(review): original line 239 is absent from this listing; presumably it
// declares a no-GC scope (an AllowGarbageCollection scope is opened further
// down before the error path) -- confirm.
240 // We try this twice, once with the assumption that the result is no longer
241 // than the input and, if that assumption breaks, again with the exact
242 // length. This may not be pretty, but it is nicer than what was here before
243 // and I hereby claim my vaffel-is.
244 //
245 // NOTE: This assumes that the upper/lower case of an ASCII
246 // character is also ASCII. This is currently the case, but it
247 // might break in the future if we implement more context and locale
248 // dependent upper/lower conversions.
249 bool has_changed_character = false;
250
251 // Convert all characters to upper case, assuming that they will fit
252 // in the buffer
253 StringCharacterStream stream(string);
254 unibrow::uchar chars[Converter::kMaxWidth];
255 // We can assume that the string is not empty
256 base::uc32 current = stream.GetNext();
// One-byte overflow only matters for uppercase conversion into a one-byte
// result.
257 bool ignore_overflow = Converter::kIsToLower || IsSeqTwoByteString(result);
// |i| is the write position in |result|; it is advanced inside the branches
// because one input character may produce several output characters.
258 for (uint32_t i = 0; i < result_length;) {
259 bool has_next = stream.HasMore();
260 base::uc32 next = has_next ? stream.GetNext() : 0;
261 uint32_t char_length = mapping->get(current, next, chars);
262 if (char_length == 0) {
263 // The case conversion of this character is the character itself.
264 result->Set(i, current);
265 i++;
266 } else if (char_length == 1 &&
267 (ignore_overflow || !ToUpperOverflows(current))) {
268 // Common case: converting the letter resulted in one character.
269 DCHECK(static_cast<base::uc32>(chars[0]) != current);
270 result->Set(i, chars[0]);
271 has_changed_character = true;
272 i++;
273 } else if (result_length == string->length()) {
// First pass only: |result| was sized like the input, so switch to measuring
// the exact output length instead of writing.
274 bool overflows = ToUpperOverflows(current);
275 // We've assumed that the result would be as long as the
276 // input but here is a character that converts to several
277 // characters. No matter, we calculate the exact length
278 // of the result and try the whole thing again.
279 //
280 // Note that this leaves room for optimization. We could just
281 // memcpy what we already have to the result string. Also,
282 // the result string is the last object allocated we could
283 // "realloc" it and probably, in the vast majority of cases,
284 // extend the existing string to be able to hold the full
285 // result.
286 uint32_t next_length = 0;
287 if (has_next) {
288 next_length = mapping->get(next, 0, chars);
289 if (next_length == 0) next_length = 1;
290 }
// Characters written so far + the current expansion + the look-ahead
// character that was already consumed from the stream.
291 uint32_t current_length = i + char_length + next_length;
292 while (stream.HasMore()) {
293 current = stream.GetNext();
294 overflows |= ToUpperOverflows(current);
295 // NOTE: we use 0 as the next character here because, while
296 // the next character may affect what a character converts to,
297 // it does not in any case affect the length of what it converts
298 // to.
299 int char_len = mapping->get(current, 0, chars);
300 if (char_len == 0) char_len = 1;
301 current_length += char_len;
302 if (current_length > String::kMaxLength) {
303 AllowGarbageCollection allocate_error_and_return;
// NOTE(review): original line 304 is absent from this listing; presumably it
// opens a THROW_NEW_ERROR_RETURN_FAILURE(isolate, invocation -- confirm.
305 NewInvalidStringLengthError());
306 }
307 }
308 // Try again with the real length. Return signed if we need
309 // to allocate a two-byte string for to uppercase.
310 return (overflows && !ignore_overflow) ? Smi::FromInt(-current_length)
311 : Smi::FromInt(current_length);
312 } else {
// Second pass: |result| has the exact length, so write the whole expansion.
313 for (uint32_t j = 0; j < char_length; j++) {
314 result->Set(i, chars[j]);
315 i++;
316 }
317 has_changed_character = true;
318 }
319 current = next;
320 }
321 if (has_changed_character) {
322 return result;
323 } else {
324 // If we didn't actually change anything in doing the conversion
325 // we simply return the original string and let the converted copy
326 // become garbage; there is no reason to keep two identical strings
327 // alive.
328 return string;
329 }
330}
331
// Case-converts |s| using |mapping|. Flattens the string, tries a fast ASCII
// path first, then runs ConvertCaseHelper with a result of the input length;
// if the helper answers with a Smi (required length) a second result string
// of that length is allocated and the helper is run again.
//
// NOTE(review): this listing has lost several lines of this function (gaps
// in the embedded numbering: 335, 347, 350, 352, 360, 367, 393, 397). The
// notes below mark each gap; confirm against the real file before editing.
332template <class Converter>
333V8_WARN_UNUSED_RESULT static Tagged<Object> ConvertCase(
334 DirectHandle<String> s, Isolate* isolate,
// NOTE(review): line 335 missing -- presumably the |mapping| parameter and
// the opening brace of the function.
336 s = String::Flatten(isolate, s);
337 uint32_t length = s->length();
338 // Assume that the string is not empty; we need this assumption later
339 if (length == 0) return *s;
340
341 // Simpler handling of ASCII strings.
342 //
343 // NOTE: This assumes that the upper/lower case of an ASCII
344 // character is also ASCII. This is currently the case, but it
345 // might break in the future if we implement more context and locale
346 // dependent upper/lower conversions.
// NOTE(review): line 347 missing -- presumably the `if (...) {` that guards
// this one-byte fast path and is closed by the lone `}` on line 374.
348 uint32_t prefix;
349 {
// NOTE(review): line 350 missing -- presumably declares |no_gc|.
351 String::FlatContent flat = s->GetFlatContent(no_gc);
// NOTE(review): line 352 missing -- presumably `prefix = <call>(` computing
// how many leading characters need no conversion (the cross-reference index
// lists FastAsciiCasePrefixLength).
353 reinterpret_cast<const char*>(flat.ToOneByteVector().begin()),
354 length);
// The whole string is already in the target case: return it unchanged.
355 if (prefix == length) return *s;
356 }
357 // Same length as input.
358 DirectHandle<SeqOneByteString> result =
359 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
// NOTE(review): line 360 missing -- presumably declares |no_gc|.
361 String::FlatContent flat = s->GetFlatContent(no_gc);
362 DCHECK(flat.IsFlat());
363 uint8_t* dest = result->GetChars(no_gc);
364 base::Vector<const uint8_t> src = flat.ToOneByteVector();
// Copy the unchanged prefix verbatim, then convert the remainder.
365 std::memcpy(dest, src.begin(), prefix);
366 uint32_t index_to_first_unprocessed =
// NOTE(review): line 367 missing -- presumably the callee name and opening
// parenthesis (the cross-reference index lists FastAsciiConvert).
368 reinterpret_cast<char*>(dest + prefix),
369 reinterpret_cast<const char*>(src.begin() + prefix),
370 length - prefix) +
371 prefix;
372 // If not ASCII, we discard the result and take the 2 byte path.
373 if (index_to_first_unprocessed == length) return *result;
374 }
375
376 DirectHandle<SeqString> result; // Same length as input.
377 if (s->IsOneByteRepresentation()) {
378 result = isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
379 } else {
380 result = isolate->factory()->NewRawTwoByteString(length).ToHandleChecked();
381 }
382
// First attempt: a string or an exception is final; a Smi requests a retry.
383 Tagged<Object> answer =
384 ConvertCaseHelper(isolate, *s, *result, length, mapping);
385 if (IsException(answer, isolate) || IsString(answer)) return answer;
386
387 DCHECK(IsSmi(answer));
388 // In this case we need to retry with a new string of the given length.
389 // If the value is negative, the string must be a two-byte string.
390 int int_answer = Smi::ToInt(answer);
391 if (s->IsOneByteRepresentation() && int_answer > 0) {
392 length = int_answer;
// NOTE(review): line 393 missing -- presumably opens an
// ASSIGN_RETURN_FAILURE_ON_EXCEPTION( invocation.
394 isolate, result, isolate->factory()->NewRawOneByteString(length));
395 } else {
396 length = abs(int_answer);
// NOTE(review): line 397 missing -- presumably opens an
// ASSIGN_RETURN_FAILURE_ON_EXCEPTION( invocation.
398 isolate, result, isolate->factory()->NewRawTwoByteString(length));
399 }
400 return ConvertCaseHelper(isolate, *s, *result, length, mapping);
401}
402
403} // namespace
404
405BUILTIN(StringPrototypeToLocaleLowerCase) {
406 HandleScope scope(isolate);
407 TO_THIS_STRING(string, "String.prototype.toLocaleLowerCase");
408 return ConvertCase(string, isolate,
409 isolate->runtime_state()->to_lower_mapping());
410}
411
412BUILTIN(StringPrototypeToLocaleUpperCase) {
413 HandleScope scope(isolate);
414 TO_THIS_STRING(string, "String.prototype.toLocaleUpperCase");
415 return ConvertCase(string, isolate,
416 isolate->runtime_state()->to_upper_mapping());
417}
418
419BUILTIN(StringPrototypeToLowerCase) {
420 HandleScope scope(isolate);
421 TO_THIS_STRING(string, "String.prototype.toLowerCase");
422 return ConvertCase(string, isolate,
423 isolate->runtime_state()->to_lower_mapping());
424}
425
426BUILTIN(StringPrototypeToUpperCase) {
427 HandleScope scope(isolate);
428 TO_THIS_STRING(string, "String.prototype.toUpperCase");
429 return ConvertCase(string, isolate,
430 isolate->runtime_state()->to_upper_mapping());
431}
432#endif // !V8_INTL_SUPPORT
433
434// ES6 #sec-string.raw  String.raw ( template, ...substitutions )
//
// Reads the "raw" property of the template object, converts it to an object,
// and concatenates its elements 0..length-1 (each via ToString), inserting
// the ToString of the next substitution argument between consecutive
// elements for as long as arguments remain.
//
// NOTE(review): this listing has lost several lines of this function (gaps
// in the embedded numbering: 442-443, 446-447, 450, 453, 473, 480). The notes
// below mark each gap; confirm against the real file before editing.
435BUILTIN(StringRaw) {
436 HandleScope scope(isolate);
437 DirectHandle<Object> templ = args.atOrUndefined(isolate, 1);
438 const uint32_t argc = args.length();
439 DirectHandle<String> raw_string =
440 isolate->factory()->NewStringFromAsciiChecked("raw");
441
// NOTE(review): lines 442-443 missing -- presumably declare |cooked| and open
// the macro invocation assigning it from the ToObject conversion below.
444 Object::ToObject(isolate, templ));
445
// NOTE(review): lines 446-447 missing -- presumably declare |raw| and open
// the macro invocation that assigns it.
448 isolate, raw,
449 Cast<JSAny>(Object::GetProperty(isolate, cooked, raw_string)));
// NOTE(review): line 450 missing -- presumably opens the macro invocation
// that re-assigns |raw| from the ToObject conversion below.
451 Object::ToObject(isolate, raw));
452 DirectHandle<Object> raw_len;
// NOTE(review): line 453 missing -- presumably opens an
// ASSIGN_RETURN_FAILURE_ON_EXCEPTION( invocation.
454 isolate, raw_len,
455 Object::GetProperty(isolate, raw, isolate->factory()->length_string()));
456
457 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, raw_len,
458 Object::ToLength(isolate, raw_len));
459
460 IncrementalStringBuilder result_builder(isolate);
461 // Intentional spec violation: we ignore {length} values >= 2^32, because
462 // assuming non-empty chunks they would generate too-long strings anyway.
463 const double raw_len_number = Object::NumberValue(*raw_len);
// Saturate the length at the uint32_t maximum.
464 const uint32_t length = raw_len_number > std::numeric_limits<uint32_t>::max()
465 ? std::numeric_limits<uint32_t>::max()
466 : static_cast<uint32_t>(raw_len_number);
467 if (length > 0) {
// Element 0 is appended on its own; every later element is preceded by a
// substitution argument when one is available.
468 DirectHandle<Object> first_element;
469 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, first_element,
470 Object::GetElement(isolate, raw, 0));
471
472 DirectHandle<String> first_string;
// NOTE(review): line 473 missing -- presumably opens an
// ASSIGN_RETURN_FAILURE_ON_EXCEPTION( invocation.
474 isolate, first_string, Object::ToString(isolate, first_element));
475 result_builder.AppendString(first_string);
476
// |i| indexes the raw elements; |arg_i| indexes |args|, starting at slot 2
// (slot 1 holds the template object read above).
477 for (uint32_t i = 1, arg_i = 2; i < length; i++, arg_i++) {
478 if (arg_i < argc) {
479 DirectHandle<String> argument_string;
// NOTE(review): line 480 missing -- presumably opens an
// ASSIGN_RETURN_FAILURE_ON_EXCEPTION( invocation.
481 isolate, argument_string,
482 Object::ToString(isolate, args.at(arg_i)));
483 result_builder.AppendString(argument_string);
484 }
485
486 DirectHandle<Object> element;
487 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, element,
488 Object::GetElement(isolate, raw, i));
489
490 DirectHandle<String> element_string;
491 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, element_string,
492 Object::ToString(isolate, element));
493 result_builder.AppendString(element_string);
494 }
495 }
496
497 RETURN_RESULT_OR_FAILURE(isolate, result_builder.Finish());
498}
499
500} // namespace internal
501} // namespace v8
#define TO_THIS_STRING(name, method)
#define BUILTIN(name)
int get(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:32
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126
static const uchar kMaxNonSurrogateCharCode
Definition unicode.h:116
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129
@ kStringLocaleCompare
Definition v8-isolate.h:531
MaybeDirectHandle< String > Finish()
V8_INLINE void AppendString(std::string_view str)
static V8_WARN_UNUSED_RESULT MaybeHandle< Object > ToLength(Isolate *isolate, DirectHandle< Object > input)
static V8_WARN_UNUSED_RESULT HandleType< String >::MaybeType ToString(Isolate *isolate, HandleType< T > input)
static V8_WARN_UNUSED_RESULT HandleType< Number >::MaybeType ToNumber(Isolate *isolate, HandleType< T > input)
static V8_WARN_UNUSED_RESULT HandleType< JSReceiver >::MaybeType ToObject(Isolate *isolate, HandleType< T > object, const char *method_name=nullptr)
static double NumberValue(Tagged< Number > obj)
static V8_WARN_UNUSED_RESULT Maybe< double > IntegerValue(Isolate *isolate, HandleType< T > input)
V8_EXPORT_PRIVATE static V8_WARN_UNUSED_RESULT MaybeHandle< Object > GetProperty(LookupIterator *it, bool is_global_reference=false)
Definition objects.cc:1248
static V8_WARN_UNUSED_RESULT MaybeHandle< Object > GetElement(Isolate *isolate, DirectHandle< JSAny > object, uint32_t index)
static constexpr int ToInt(const Tagged< Object > object)
Definition smi.h:33
static constexpr Tagged< Smi > FromInt(int value)
Definition smi.h:38
static constexpr Tagged< Smi > zero()
Definition smi.h:99
base::uc16 Get(uint32_t i) const
Definition string.h:150
static const uint32_t kMaxLength
Definition string.h:511
static V8_INLINE HandleType< String > Flatten(Isolate *isolate, HandleType< T > string, AllocationType allocation=AllocationType::kYoung)
static const int32_t kMaxOneByteCharCode
Definition string.h:500
static bool IsOneByteRepresentationUnderneath(Tagged< String > string)
Definition string-inl.h:373
bool Equals(Tagged< String > other) const
Definition string-inl.h:535
static Tagged< Object > LastIndexOf(Isolate *isolate, DirectHandle< Object > receiver, DirectHandle< Object > search, DirectHandle< Object > position)
Definition string.cc:1689
int end
#define ASSIGN_RETURN_FAILURE_ON_EXCEPTION(isolate, dst, call)
Definition isolate.h:284
#define ASSIGN_RETURN_ON_EXCEPTION_VALUE(isolate, dst, call, value)
Definition isolate.h:276
#define THROW_NEW_ERROR_RETURN_FAILURE(isolate, call)
Definition isolate.h:294
#define RETURN_RESULT_OR_FAILURE(isolate, call)
Definition isolate.h:264
base::Vector< const DirectHandle< Object > > args
Definition execution.cc:74
ZoneVector< RpoNumber > & result
int s
Definition mul-fft.cc:297
unsigned int uchar
Definition unicode.h:21
uint32_t uc32
Definition strings.h:19
uint32_t DoubleToUint32(double x)
PerThreadAssertScopeDebugOnly< false, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > DisallowGarbageCollection
bool IsNumber(Tagged< Object > obj)
Tagged(T object) -> Tagged< T >
V8_INLINE constexpr bool IsSmi(TaggedImpl< kRefType, StorageType > obj)
Definition objects.h:665
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
PerThreadAssertScopeDebugOnly< true, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > AllowGarbageCollection
uint32_t FastAsciiConvert(char *dst, const char *src, uint32_t length)
void CopyChars(DstType *dst, const SrcType *src, size_t count) V8_NONNULL(1
uint32_t FastAsciiCasePrefixLength(const char *src, uint32_t length)
template const char * string
Tagged< To > Cast(Tagged< From > value, const v8::SourceLocation &loc=INIT_SOURCE_LOCATION_IN_DEBUG)
Definition casting.h:150
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define V8_WARN_UNUSED_RESULT
Definition v8config.h:671