v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
uri.cc
Go to the documentation of this file.
1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/strings/uri.h"
6
7#include <vector>
8
13
14namespace v8 {
15namespace internal {
16
17namespace { // anonymous namespace for DecodeURI helper functions
18bool IsReservedPredicate(base::uc16 c) {
19 switch (c) {
20 case '#':
21 case '$':
22 case '&':
23 case '+':
24 case ',':
25 case '/':
26 case ':':
27 case ';':
28 case '=':
29 case '?':
30 case '@':
31 return true;
32 default:
33 return false;
34 }
35}
36
// Returns true if octets[0..length) is exactly the UTF-8 encoding of the
// replacement character U+FFFD, i.e. the three bytes 0xEF 0xBF 0xBD.
// |length| is checked first, so |octets| is only read when it holds three
// bytes.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46
// Decodes the UTF-8 sequence octets[0..length) with unibrow::Utf8::ValueOf
// and appends the resulting UTF-16 code unit(s) to |buffer|.
// Returns false when the decoder yields kBadChar and the input is not the
// literal UTF-8 encoding of U+FFFD; returns true otherwise.
47bool DecodeOctets(const uint8_t* octets, int length,
48 std::vector<base::uc16>* buffer) {
49 size_t cursor = 0;
50 base::uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
// A kBadChar result is only accepted when the bytes really spell U+FFFD.
// NOTE(review): this presumes kBadChar equals U+FFFD — confirm in unicode.h.
51 if (value == unibrow::Utf8::kBadChar &&
52 !IsReplacementCharacter(octets, length)) {
53 return false;
54 }
55
// NOTE(review): listing line 57 (the right-hand side of this comparison) is
// missing from this extract; presumably the bound is
// unibrow::Utf16::kMaxNonSurrogateCharCode — confirm against upstream.
56 if (value <=
58 buffer->push_back(value);
59 } else {
// Values above the bound are emitted as a lead/trail surrogate pair.
60 buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
61 buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
62 }
63 return true;
64}
65
66int TwoDigitHex(base::uc16 character1, base::uc16 character2) {
67 if (character1 > 'f') return -1;
68 int high = base::HexValue(character1);
69 if (high == -1) return -1;
70 if (character2 > 'f') return -1;
71 int low = base::HexValue(character2);
72 if (low == -1) return -1;
73 return (high << 4) + low;
74}
75
76template <typename T>
77void AddToBuffer(base::uc16 decoded, String::FlatContent* uri_content,
78 int index, bool is_uri, std::vector<T>* buffer) {
79 if (is_uri && IsReservedPredicate(decoded)) {
80 buffer->push_back('%');
81 base::uc16 first = uri_content->Get(index + 1);
82 base::uc16 second = uri_content->Get(index + 2);
83 DCHECK_GT(std::numeric_limits<T>::max(), first);
84 DCHECK_GT(std::numeric_limits<T>::max(), second);
85
86 buffer->push_back(first);
87 buffer->push_back(second);
88 } else {
89 buffer->push_back(decoded);
90 }
91}
92
93bool IntoTwoByte(int index, bool is_uri, int uri_length,
94 String::FlatContent* uri_content,
95 std::vector<base::uc16>* buffer) {
96 for (int k = index; k < uri_length; k++) {
97 base::uc16 code = uri_content->Get(k);
98 if (code == '%') {
99 int two_digits;
100 if (k + 2 >= uri_length ||
101 (two_digits = TwoDigitHex(uri_content->Get(k + 1),
102 uri_content->Get(k + 2))) < 0) {
103 return false;
104 }
105 k += 2;
106 base::uc16 decoded = static_cast<base::uc16>(two_digits);
107 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
108 uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
109 octets[0] = decoded;
110
111 int number_of_continuation_bytes = 0;
112 while ((decoded << ++number_of_continuation_bytes) & 0x80) {
113 if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
114 return false;
115 }
116 if (uri_content->Get(++k) != '%' ||
117 (two_digits = TwoDigitHex(uri_content->Get(k + 1),
118 uri_content->Get(k + 2))) < 0) {
119 return false;
120 }
121 k += 2;
122 base::uc16 continuation_byte = static_cast<base::uc16>(two_digits);
123 octets[number_of_continuation_bytes] = continuation_byte;
124 }
125
126 if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
127 return false;
128 }
129 } else {
130 AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
131 }
132 } else {
133 buffer->push_back(code);
134 }
135 }
136 return true;
137}
138
// Decodes |uri| from the start, writing into |one_byte_buffer| for as long as
// every produced value fits, and handing the remainder of the input to
// IntoTwoByte (which fills |two_byte_buffer|) as soon as one does not.
// Returns false on a malformed escape or when IntoTwoByte fails.
139bool IntoOneAndTwoByte(DirectHandle<String> uri, bool is_uri,
140 std::vector<uint8_t>* one_byte_buffer,
141 std::vector<base::uc16>* two_byte_buffer) {
// NOTE(review): listing line 142 is missing from this extract; presumably it
// declares the |no_gc| scope (DisallowGarbageCollection) used below — confirm.
143 String::FlatContent uri_content = uri->GetFlatContent(no_gc);
144
145 int uri_length = uri->length();
146 for (int k = 0; k < uri_length; k++) {
147 base::uc16 code = uri_content.Get(k);
148 if (code == '%') {
149 int two_digits;
// A '%' needs two more code units, and both must be hex digits.
150 if (k + 2 >= uri_length ||
151 (two_digits = TwoDigitHex(uri_content.Get(k + 1),
152 uri_content.Get(k + 2))) < 0) {
153 return false;
154 }
155
156 base::uc16 decoded = static_cast<base::uc16>(two_digits);
// Restart at this same '%' (k is not advanced yet) in two-byte mode.
157 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
158 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
159 two_byte_buffer);
160 }
161
162 AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
163 k += 2;
164 } else {
// NOTE(review): listing line 165 is missing from this extract; presumably it
// opens an `if` testing whether |code| exceeds the one-byte range, which the
// closing brace on listing line 168 ends — confirm against upstream.
166 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
167 two_byte_buffer);
168 }
169 one_byte_buffer->push_back(code);
170 }
171 }
172 return true;
173}
174
175} // anonymous namespace
176
// Uri::Decode: percent-decodes |uri| and returns the decoded string, or
// throws a URIError when IntoOneAndTwoByte reports malformed input.
// NOTE(review): listing line 177 (return type, name and first parameter) is
// missing from this extract.
178 DirectHandle<String> uri, bool is_uri) {
179 uri = String::Flatten(isolate, uri);
180 std::vector<uint8_t> one_byte_buffer;
181 std::vector<base::uc16> two_byte_buffer;
182
183 if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
184 THROW_NEW_ERROR(isolate, NewURIError());
185 }
186
// Nothing required two-byte storage: the one-byte buffer is the whole result.
187 if (two_byte_buffer.empty()) {
188 return isolate->factory()->NewStringFromOneByte(base::Vector<const uint8_t>(
189 one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
190 }
191
// NOTE(review): listing lines 192, 195 and 198 are missing from this extract;
// presumably they declare |result|, open ASSIGN_RETURN_ON_EXCEPTION( and
// declare the |no_gc| scope — confirm against upstream.
193 int result_length =
194 static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
196 isolate, result, isolate->factory()->NewRawTwoByteString(result_length));
197
// The result is the one-byte prefix followed by the two-byte remainder.
199 base::uc16* chars = result->GetChars(no_gc);
200 if (!one_byte_buffer.empty()) {
201 CopyChars(chars, one_byte_buffer.data(), one_byte_buffer.size());
202 chars += one_byte_buffer.size();
203 }
204 if (!two_byte_buffer.empty()) {
205 CopyChars(chars, two_byte_buffer.data(), two_byte_buffer.size());
206 }
207
208 return result;
209}
210
211namespace { // anonymous namespace for EncodeURI helper functions
212bool IsUnescapePredicateInUriComponent(base::uc16 c) {
213 if (IsAlphaNumeric(c)) {
214 return true;
215 }
216
217 switch (c) {
218 case '!':
219 case '\'':
220 case '(':
221 case ')':
222 case '*':
223 case '-':
224 case '.':
225 case '_':
226 case '~':
227 return true;
228 default:
229 return false;
230 }
231}
232
233bool IsUriSeparator(base::uc16 c) {
234 switch (c) {
235 case '#':
236 case ':':
237 case ';':
238 case '/':
239 case '?':
240 case '$':
241 case '&':
242 case '+':
243 case ',':
244 case '@':
245 case '=':
246 return true;
247 default:
248 return false;
249 }
250}
251
252void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
253 buffer->push_back('%');
254 buffer->push_back(base::HexCharOfValue(octet >> 4));
255 buffer->push_back(base::HexCharOfValue(octet & 0x0F));
256}
257
// Encodes the single code unit |c| into at most four bytes in |s| and appends
// each produced byte to |buffer| as a "%XX" escape.
258void EncodeSingle(base::uc16 c, std::vector<uint8_t>* buffer) {
259 char s[4] = {};
260 int number_of_bytes;
// NOTE(review): listing line 262 (the right-hand side of this assignment) is
// missing from this extract; presumably a unibrow::Utf8::Encode call that
// writes into |s| and returns the byte count — confirm against upstream.
261 number_of_bytes =
263 for (int k = 0; k < number_of_bytes; k++) {
264 AddEncodedOctetToBuffer(s[k], buffer);
265 }
266}
267
// Encodes the pair of code units |cc1|, |cc2| into at most four bytes in |s|
// and appends each produced byte to |buffer| as a "%XX" escape.
268void EncodePair(base::uc16 cc1, base::uc16 cc2, std::vector<uint8_t>* buffer) {
269 char s[4] = {};
// NOTE(review): listing lines 271-272 (the initializer) are missing from this
// extract; presumably unibrow::Utf8::Encode applied to
// unibrow::Utf16::CombineSurrogatePair(cc1, cc2) — confirm against upstream.
270 int number_of_bytes =
273 for (int k = 0; k < number_of_bytes; k++) {
274 AddEncodedOctetToBuffer(s[k], buffer);
275 }
276}
277
278} // anonymous namespace
279
// Uri::Encode: percent-encodes |uri| into a one-byte string. Characters
// accepted by IsUnescapePredicateInUriComponent (and, when |is_uri| is true,
// by IsUriSeparator) are copied through; everything else is escaped via
// EncodeSingle / EncodePair. Throws a URIError when the loop reaches its
// error path (see below).
// NOTE(review): listing line 280 (return type, name and first parameter) is
// missing from this extract.
281 DirectHandle<String> uri, bool is_uri) {
282 uri = String::Flatten(isolate, uri);
283 int uri_length = uri->length();
284 std::vector<uint8_t> buffer;
// The output has at least one byte per input code unit that is kept.
285 buffer.reserve(uri_length);
286
287 bool throw_error = false;
288 {
// NOTE(review): listing line 289 is missing from this extract; presumably it
// declares the |no_gc| scope used on the next line — confirm.
290 String::FlatContent uri_content = uri->GetFlatContent(no_gc);
291
292 for (int k = 0; k < uri_length; k++) {
293 base::uc16 cc1 = uri_content.Get(k);
// NOTE(review): listing line 294 is missing from this extract; presumably
// `if (unibrow::Utf16::IsLeadSurrogate(cc1)) {` — confirm against upstream.
295 k++;
296 if (k < uri_length) {
297 base::uc16 cc2 = uri->Get(k);
// NOTE(review): listing line 298 is missing from this extract; presumably
// `if (unibrow::Utf16::IsTrailSurrogate(cc2)) {` — confirm against upstream.
299 EncodePair(cc1, cc2, &buffer);
300 continue;
301 }
302 }
303 } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
304 if (IsUnescapePredicateInUriComponent(cc1) ||
305 (is_uri && IsUriSeparator(cc1))) {
306 buffer.push_back(cc1);
307 } else {
308 EncodeSingle(cc1, &buffer);
309 }
310 continue;
311 }
312
// Reached only when no branch above hit `continue`.
313 // String::FlatContent DCHECKs its contents did not change during its
314 // lifetime. Throwing the error inside the loop may cause GC and move the
315 // string contents.
316 throw_error = true;
317 break;
318 }
319 }
320
321 if (throw_error) THROW_NEW_ERROR(isolate, NewURIError());
322 return isolate->factory()->NewStringFromOneByte(base::VectorOf(buffer));
323}
324
325namespace { // Anonymous namespace for Escape and Unescape
326
327template <typename Char>
328int UnescapeChar(base::Vector<const Char> vector, int i, int length,
329 int* step) {
330 uint16_t character = vector[i];
331 int32_t hi = 0;
332 int32_t lo = 0;
333 if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
334 (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
335 (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
336 *step = 6;
337 return (hi << 8) + lo;
338 } else if (character == '%' && i <= length - 3 &&
339 (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
340 *step = 3;
341 return lo;
342 } else {
343 *step = 1;
344 return character;
345 }
346}
347
// Builds the unescaped form of |string|, given that the characters before
// |start_index| need no change. The result is a cons string of the untouched
// prefix [0, start_index) and a freshly allocated string that holds the
// unescaped tail.
348template <typename Char>
349MaybeHandle<String> UnescapeSlow(Isolate* isolate, DirectHandle<String> string,
350 int start_index) {
351 bool one_byte = true;
352 uint32_t length = string->length();
353
// First pass: count the unescaped characters and detect whether any of them
// forces a two-byte destination.
354 int unescaped_length = 0;
355 {
// NOTE(review): listing line 356 is missing from this extract; presumably it
// declares the |no_gc| scope used on the next line — confirm.
357 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
358 for (uint32_t i = start_index; i < length; unescaped_length++) {
359 int step;
// NOTE(review): listing line 361 (the right-hand side of this comparison) is
// missing from this extract; presumably String::kMaxOneByteCharCode — confirm.
360 if (UnescapeChar(vector, i, length, &step) >
362 one_byte = false;
363 }
364 i += step;
365 }
366 }
367
368 DCHECK_LT(start_index, length);
369 Handle<String> first_part =
370 isolate->factory()->NewProperSubString(string, 0, start_index);
371
// Second pass: decode again, this time writing into the destination string.
372 int dest_position = 0;
373 Handle<String> second_part;
374 DCHECK_LE(unescaped_length, String::kMaxLength);
375 if (one_byte) {
376 Handle<SeqOneByteString> dest = isolate->factory()
377 ->NewRawOneByteString(unescaped_length)
378 .ToHandleChecked();
// NOTE(review): listing line 379 is missing from this extract; presumably the
// |no_gc| declaration for this branch — confirm.
380 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
381 for (uint32_t i = start_index; i < length; dest_position++) {
382 int step;
383 dest->SeqOneByteStringSet(dest_position,
384 UnescapeChar(vector, i, length, &step));
385 i += step;
386 }
387 second_part = dest;
388 } else {
389 Handle<SeqTwoByteString> dest = isolate->factory()
390 ->NewRawTwoByteString(unescaped_length)
391 .ToHandleChecked();
// NOTE(review): listing line 392 is missing from this extract; presumably the
// |no_gc| declaration for this branch — confirm.
393 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
394 for (uint32_t i = start_index; i < length; dest_position++) {
395 int step;
396 dest->SeqTwoByteStringSet(dest_position,
397 UnescapeChar(vector, i, length, &step));
398 i += step;
399 }
400 second_part = dest;
401 }
402 return isolate->factory()->NewConsString(first_part, second_part);
403}
404
405bool IsNotEscaped(uint16_t c) {
406 if (IsAlphaNumeric(c)) {
407 return true;
408 }
409 // @*_+-./
410 switch (c) {
411 case '@':
412 case '*':
413 case '_':
414 case '+':
415 case '-':
416 case '.':
417 case '/':
418 return true;
419 default:
420 return false;
421 }
422}
423
// Searches |source| for the first '%'. If there is none, |source| is returned
// as is; otherwise UnescapeSlow is run from the index of that '%'.
424template <typename Char>
425static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
426 Handle<String> source) {
427 int index;
428 {
// NOTE(review): listing line 429 is missing from this extract; presumably it
// declares the |no_gc| scope used by GetCharVector below — confirm.
430 StringSearch<uint8_t, Char> search(isolate, base::StaticOneByteVector("%"));
431 index = search.Search(source->GetCharVector<Char>(no_gc), 0);
// A negative index means no '%' was found, so there is nothing to unescape.
432 if (index < 0) return source;
433 }
434 return UnescapeSlow<Char>(isolate, source, index);
435}
436
// Produces the escaped form of the flat string |string|: characters accepted
// by IsNotEscaped are copied, other characters below 256 become "%XX", and
// characters of 256 and above become "%uXXXX". Returns |string| itself when
// the escaped length equals the original length.
437template <typename Char>
438static MaybeHandle<String> EscapePrivate(Isolate* isolate,
439 Handle<String> string) {
440 DCHECK(string->IsFlat());
441 uint32_t escaped_length = 0;
442 uint32_t length = string->length();
443
// First pass: compute the length of the escaped output.
444 {
// NOTE(review): listing line 445 is missing from this extract; presumably it
// declares the |no_gc| scope used on the next line — confirm.
446 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
447 for (uint32_t i = 0; i < length; i++) {
448 uint16_t c = vector[i];
449 if (c >= 256) {
450 escaped_length += 6;
451 } else if (IsNotEscaped(c)) {
452 escaped_length++;
453 } else {
454 escaped_length += 3;
455 }
456
457 // We don't allow strings that are longer than a maximal length.
458 DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow.
459 if (escaped_length > String::kMaxLength) break; // Provoke exception.
460 }
461 }
462
463 // No length change implies no change. Return original string if no change.
464 if (escaped_length == length) return string;
465
// NOTE(review): listing lines 466-467 are missing from this extract;
// presumably they declare |dest| and open ASSIGN_RETURN_ON_EXCEPTION( —
// confirm against upstream.
468 isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length));
469 int dest_position = 0;
470
// Second pass: write the escaped characters into |dest|.
471 {
// NOTE(review): listing line 472 is missing from this extract; presumably the
// |no_gc| declaration for this scope — confirm.
473 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
474 for (uint32_t i = 0; i < length; i++) {
475 uint16_t c = vector[i];
476 if (c >= 256) {
// "%u" followed by the four hex digits of |c|, most significant first.
477 dest->SeqOneByteStringSet(dest_position, '%');
478 dest->SeqOneByteStringSet(dest_position + 1, 'u');
479 dest->SeqOneByteStringSet(dest_position + 2,
480 base::HexCharOfValue(c >> 12));
481 dest->SeqOneByteStringSet(dest_position + 3,
482 base::HexCharOfValue((c >> 8) & 0xF));
483 dest->SeqOneByteStringSet(dest_position + 4,
484 base::HexCharOfValue((c >> 4) & 0xF));
485 dest->SeqOneByteStringSet(dest_position + 5,
486 base::HexCharOfValue(c & 0xF));
487 dest_position += 6;
488 } else if (IsNotEscaped(c)) {
489 dest->SeqOneByteStringSet(dest_position, c);
490 dest_position++;
491 } else {
// "%" followed by the two hex digits of |c| (which is below 256 here).
492 dest->SeqOneByteStringSet(dest_position, '%');
493 dest->SeqOneByteStringSet(dest_position + 1,
494 base::HexCharOfValue(c >> 4));
495 dest->SeqOneByteStringSet(dest_position + 2,
496 base::HexCharOfValue(c & 0xF));
497 dest_position += 3;
498 }
499 }
500 }
501
502 return dest;
503}
504
505} // anonymous namespace
506
// Uri::Escape: flattens |string| and dispatches to the uint8_t or base::uc16
// instantiation of EscapePrivate.
// NOTE(review): listing lines 507 (the signature) and 509 (the condition of
// the ternary, presumably String::IsOneByteRepresentationUnderneath) are
// missing from this extract — confirm against upstream.
508 string = String::Flatten(isolate, string);
510 ? EscapePrivate<uint8_t>(isolate, string)
511 : EscapePrivate<base::uc16>(isolate, string);
512}
513
// Uri::Unescape: flattens |string| and dispatches to the uint8_t or
// base::uc16 instantiation of UnescapePrivate.
// NOTE(review): listing lines 514 (start of the signature) and 517 (the
// condition of the ternary, presumably
// String::IsOneByteRepresentationUnderneath) are missing from this extract —
// confirm against upstream.
515 Handle<String> string) {
516 string = String::Flatten(isolate, string);
518 ? UnescapePrivate<uint8_t>(isolate, string)
519 : UnescapePrivate<base::uc16>(isolate, string);
520}
521
522} // namespace internal
523} // namespace v8
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126
static const int kNoPreviousCharacter
Definition unicode.h:102
static const uchar kMaxNonSurrogateCharCode
Definition unicode.h:116
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition unicode.h:113
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
static uchar ValueOf(const uint8_t *str, size_t length, size_t *cursor)
static const unsigned kMaxOneByteChar
Definition unicode.h:179
static const uchar kBadChar
Definition unicode.h:175
static const unsigned kMaxEncodedSize
Definition unicode.h:178
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
base::uc16 Get(uint32_t i) const
Definition string.h:150
static const uint32_t kMaxLength
Definition string.h:511
static V8_INLINE HandleType< String > Flatten(Isolate *isolate, HandleType< T > string, AllocationType allocation=AllocationType::kYoung)
static const int32_t kMaxOneByteCharCode
Definition string.h:500
static bool IsOneByteRepresentationUnderneath(Tagged< String > string)
Definition string-inl.h:373
static MaybeDirectHandle< String > Unescape(Isolate *isolate, Handle< String > string)
Definition uri.cc:514
static MaybeDirectHandle< String > Decode(Isolate *isolate, DirectHandle< String > uri, bool is_uri)
Definition uri.cc:177
static MaybeDirectHandle< String > Encode(Isolate *isolate, DirectHandle< String > uri, bool is_uri)
Definition uri.cc:280
static MaybeDirectHandle< String > Escape(Isolate *isolate, Handle< String > string)
Definition uri.cc:507
#define ASSIGN_RETURN_ON_EXCEPTION(isolate, dst, call)
Definition isolate.h:291
#define THROW_NEW_ERROR(isolate, call)
Definition isolate.h:307
double second
ZoneVector< RpoNumber > & result
InstructionOperand source
unsigned short uint16_t
Definition unicode.cc:39
char HexCharOfValue(int value)
Definition strings.h:42
uint32_t uc32
Definition strings.h:19
int HexValue(uc32 c)
Definition strings.h:34
Vector< const uint8_t > StaticOneByteVector(const char(&array)[N])
Definition vector.h:346
uint16_t uc16
Definition strings.h:18
constexpr Vector< T > VectorOf(T *start, size_t size)
Definition vector.h:360
PerThreadAssertScopeDebugOnly< false, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > DisallowGarbageCollection
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
void CopyChars(DstType *dst, const SrcType *src, size_t count) V8_NONNULL(1
constexpr bool IsAlphaNumeric(base::uc32 c)
template const char * string
Local< T > Handle
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define DCHECK_GT(v1, v2)
Definition logging.h:487