v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
v8-string-conversions.cc
Go to the documentation of this file.
1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
7#include <limits>
8#include <vector>
9
10#include "src/base/logging.h"
11
12namespace v8_inspector {
13namespace {
// Code-unit / code-point types used by the converters in this file.
using UChar = char16_t;
using UChar32 = uint32_t;

// Returns true when |c| has no bits set above the low seven, i.e. it is in
// the ASCII range U+0000..U+007F.
bool isASCII(UChar c) { return c < 0x80; }

// U+FFFD REPLACEMENT CHARACTER, emitted in place of input that cannot be
// converted.
constexpr UChar replacementCharacter = 0xFFFD;
20
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by
// the non-ASCII lead byte |b0|, or 0 if |b0| is not a lead byte of such a
// sequence (e.g. a continuation byte or a 0xF8+ byte).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  const unsigned lead = static_cast<unsigned char>(b0);
  if ((lead >> 5) == 0x06) return 2;  // 110x xxxx
  if ((lead >> 4) == 0x0E) return 3;  // 1110 xxxx
  if ((lead >> 3) == 0x1E) return 4;  // 1111 0xxx
  return 0;
}
28
29inline int inlineUTF8SequenceLength(char b0) {
30 return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
31}
32
// Marker bits OR-ed into the first byte of an encoded UTF-8 sequence, indexed
// by the total number of bytes in that sequence. The table has one entry per
// sequence length up to six, but *legal* UTF-8 never exceeds four bytes, and
// the encoder in this file only indexes it with 1..4.
static constexpr unsigned char firstByteMark[7] = {
    0x00,  // 0 bytes
    0x00,  // 1 byte:  0xxx xxxx
    0xC0,  // 2 bytes: 110x xxxx
    0xE0,  // 3 bytes: 1110 xxxx
    0xF0,  // 4 bytes: 1111 0xxx
    0xF8,  // 5 bytes
    0xFC,  // 6 bytes
};
40
// Outcome of a single convertUTF16ToUTF8() / convertUTF8ToUTF16() call.
enum ConversionResult {
  // The whole source range was converted.
  conversionOK = 0,
  // The source ended in the middle of a character.
  sourceExhausted = 1,
  // The target buffer has insufficient room for the next character.
  targetExhausted = 2,
  // The source contains an illegal or malformed sequence.
  sourceIllegal = 3
};
47
48ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
49 const UChar* sourceEnd, char** targetStart,
50 char* targetEnd, bool strict) {
51 ConversionResult result = conversionOK;
52 const UChar* source = *sourceStart;
53 char* target = *targetStart;
54 while (source < sourceEnd) {
55 UChar32 ch;
56 uint32_t bytesToWrite = 0;
57 const UChar32 byteMask = 0xBF;
58 const UChar32 byteMark = 0x80;
59 const UChar* oldSource =
60 source; // In case we have to back up because of target overflow.
61 ch = static_cast<uint16_t>(*source++);
62 // If we have a surrogate pair, convert to UChar32 first.
63 if (ch >= 0xD800 && ch <= 0xDBFF) {
64 // If the 16 bits following the high surrogate are in the source buffer...
65 if (source < sourceEnd) {
66 UChar32 ch2 = static_cast<uint16_t>(*source);
67 // If it's a low surrogate, convert to UChar32.
68 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
69 ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
70 ++source;
71 } else if (strict) { // it's an unpaired high surrogate
72 --source; // return to the illegal value itself
73 result = sourceIllegal;
74 break;
75 }
76 } else { // We don't have the 16 bits following the high surrogate.
77 --source; // return to the high surrogate
78 result = sourceExhausted;
79 break;
80 }
81 } else if (strict) {
82 // UTF-16 surrogate values are illegal in UTF-32
83 if (ch >= 0xDC00 && ch <= 0xDFFF) {
84 --source; // return to the illegal value itself
85 result = sourceIllegal;
86 break;
87 }
88 }
89 // Figure out how many bytes the result will require
90 if (ch < static_cast<UChar32>(0x80)) {
91 bytesToWrite = 1;
92 } else if (ch < static_cast<UChar32>(0x800)) {
93 bytesToWrite = 2;
94 } else if (ch < static_cast<UChar32>(0x10000)) {
95 bytesToWrite = 3;
96 } else if (ch < static_cast<UChar32>(0x110000)) {
97 bytesToWrite = 4;
98 } else {
99 bytesToWrite = 3;
100 ch = replacementCharacter;
101 }
102
103 target += bytesToWrite;
104 if (target > targetEnd) {
105 source = oldSource; // Back up source pointer!
106 target -= bytesToWrite;
107 result = targetExhausted;
108 break;
109 }
110 switch (bytesToWrite) {
111 case 4:
112 *--target = static_cast<char>((ch | byteMark) & byteMask);
113 ch >>= 6;
114 [[fallthrough]];
115 case 3:
116 *--target = static_cast<char>((ch | byteMark) & byteMask);
117 ch >>= 6;
118 [[fallthrough]];
119 case 2:
120 *--target = static_cast<char>((ch | byteMark) & byteMask);
121 ch >>= 6;
122 [[fallthrough]];
123 case 1:
124 *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
125 }
126 target += bytesToWrite;
127 }
128 *sourceStart = source;
129 *targetStart = target;
130 return result;
131}
132
// True if code point |c| is in the Basic Multilingual Plane (<= U+FFFF).
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xFFFF)

// True if code point |c| is in the supplementary range U+10000..U+10FFFF.
#define U_IS_SUPPLEMENTARY(c) \
  (static_cast<uint32_t>((c) - 0x010000) <= 0xFFFFF)

// True if |c| is a surrogate code point (U+D800..U+DFFF).
#define U_IS_SURROGATE(c) (((c) & 0xFFFFF800) == 0xD800)

// Lead (high) surrogate code unit for a supplementary code point.
#define U16_LEAD(supplementary) \
  (static_cast<UChar>(((supplementary) >> 10) + 0xD7C0))

// Trail (low) surrogate code unit for a supplementary code point.
#define U16_TRAIL(supplementary) \
  (static_cast<UChar>(((supplementary) & 0x3FF) | 0xDC00))
174
// Checks whether the |length| bytes at |source| form one well-formed UTF-8
// sequence.
//
// This must be called with the length pre-determined by the first byte; the
// function does not verify that |length| matches the lead byte. Any length
// outside 1..4 yields false, since the Unicode definition of UTF-8 only goes
// up to 4-byte sequences.
//
// Rejected inputs include: continuation bytes outside 0x80..0xBF, overlong
// encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by
// < 0x90), encoded surrogates (0xED followed by > 0x9F), and values beyond
// U+10FFFF (0xF4 followed by > 0x8F, or a lead byte above 0xF4).
static bool isLegalUTF8(const unsigned char* source, int length) {
  if (length < 1 || length > 4) return false;

  // The third and fourth bytes, when present, are plain continuation bytes.
  for (int i = length - 1; i >= 2; --i) {
    if (source[i] < 0x80 || source[i] > 0xBF) return false;
  }

  const unsigned char lead = source[0];
  if (length >= 2) {
    // The second byte is a continuation byte whose allowed range is narrowed
    // for a few lead bytes. Both bounds are always enforced; checking only
    // the upper bound would let e.g. ED 41 80 or F4 20 80 80 through.
    unsigned char lowest = 0x80;
    unsigned char highest = 0xBF;
    switch (lead) {
      case 0xE0:
        lowest = 0xA0;  // below this is an overlong 3-byte form
        break;
      case 0xED:
        highest = 0x9F;  // above this encodes a UTF-16 surrogate
        break;
      case 0xF0:
        lowest = 0x90;  // below this is an overlong 4-byte form
        break;
      case 0xF4:
        highest = 0x8F;  // above this exceeds U+10FFFF
        break;
      default:
        break;
    }
    const unsigned char second = source[1];
    if (second < lowest || second > highest) return false;
  }

  // 0x80..0xBF are continuation bytes and 0xC0/0xC1 only start overlong
  // two-byte forms; none of them may begin a sequence.
  if (lead >= 0x80 && lead < 0xC2) return false;
  return lead <= 0xF4;
}
219
220// Magic values subtracted from a buffer value during UTF8 conversion.
221// This table contains as many values as there might be trailing bytes
222// in a UTF-8 sequence.
223static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
224 0x00003080UL,
225 0x000E2080UL,
226 0x03C82080UL,
227 static_cast<UChar32>(0xFA082080UL),
228 static_cast<UChar32>(0x82082080UL)};
229
230static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
231 UChar32 character = 0;
232
233 // The cases all fall through.
234 switch (length) {
235 case 6:
236 character += static_cast<unsigned char>(*sequence++);
237 character <<= 6;
238 [[fallthrough]];
239 case 5:
240 character += static_cast<unsigned char>(*sequence++);
241 character <<= 6;
242 [[fallthrough]];
243 case 4:
244 character += static_cast<unsigned char>(*sequence++);
245 character <<= 6;
246 [[fallthrough]];
247 case 3:
248 character += static_cast<unsigned char>(*sequence++);
249 character <<= 6;
250 [[fallthrough]];
251 case 2:
252 character += static_cast<unsigned char>(*sequence++);
253 character <<= 6;
254 [[fallthrough]];
255 case 1:
256 character += static_cast<unsigned char>(*sequence++);
257 }
258
259 return character - offsetsFromUTF8[length - 1];
260}
261
262ConversionResult convertUTF8ToUTF16(const char** sourceStart,
263 const char* sourceEnd, UChar** targetStart,
264 UChar* targetEnd, bool* sourceAllASCII,
265 bool strict) {
266 ConversionResult result = conversionOK;
267 const char* source = *sourceStart;
268 UChar* target = *targetStart;
269 UChar orAllData = 0;
270 while (source < sourceEnd) {
271 int utf8SequenceLength = inlineUTF8SequenceLength(*source);
272 if (sourceEnd - source < utf8SequenceLength) {
273 result = sourceExhausted;
274 break;
275 }
276 // Do this check whether lenient or strict
277 if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
278 utf8SequenceLength)) {
279 result = sourceIllegal;
280 break;
281 }
282
283 UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
284
285 if (target >= targetEnd) {
286 source -= utf8SequenceLength; // Back up source pointer!
287 result = targetExhausted;
288 break;
289 }
290
291 if (U_IS_BMP(character)) {
292 // UTF-16 surrogate values are illegal in UTF-32
293 if (U_IS_SURROGATE(character)) {
294 if (strict) {
295 source -= utf8SequenceLength; // return to the illegal value itself
296 result = sourceIllegal;
297 break;
298 }
299 *target++ = replacementCharacter;
300 orAllData |= replacementCharacter;
301 } else {
302 *target++ = static_cast<UChar>(character); // normal case
303 orAllData |= character;
304 }
305 } else if (U_IS_SUPPLEMENTARY(character)) {
306 // target is a character in range 0xFFFF - 0x10FFFF
307 if (target + 1 >= targetEnd) {
308 source -= utf8SequenceLength; // Back up source pointer!
309 result = targetExhausted;
310 break;
311 }
312 *target++ = U16_LEAD(character);
313 *target++ = U16_TRAIL(character);
314 orAllData = 0xFFFF;
315 } else {
316 if (strict) {
317 source -= utf8SequenceLength; // return to the start
318 result = sourceIllegal;
319 break; // Bail out; shouldn't continue
320 } else {
321 *target++ = replacementCharacter;
322 orAllData |= replacementCharacter;
323 }
324 }
325 }
326 *sourceStart = source;
327 *targetStart = target;
328
329 if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7F);
330
331 return result;
332}
333
334// Helper to write a three-byte UTF-8 code point to the buffer, caller must
335// check room is available.
336static inline void putUTF8Triple(char*& buffer, UChar ch) {
337 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
338 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
339 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
340}
341} // namespace
342
343std::string UTF16ToUTF8(const UChar* stringStart, size_t length) {
344 if (!stringStart || !length) return std::string();
345
346 // Allocate a buffer big enough to hold all the characters
347 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
348 // Optimization ideas, if we find this function is hot:
349 // * We could speculatively create a CStringBuffer to contain 'length'
350 // characters, and resize if necessary (i.e. if the buffer contains
351 // non-ascii characters). (Alternatively, scan the buffer first for
352 // ascii characters, so we know this will be sufficient).
353 // * We could allocate a CStringBuffer with an appropriate size to
354 // have a good chance of being able to write the string into the
355 // buffer without reallocing (say, 1.5 x length).
356 if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
357
358 std::string output(length * 3, '\0');
359 const UChar* characters = stringStart;
360 const UChar* characters_end = characters + length;
361 char* buffer = &*output.begin();
362 char* buffer_end = &*output.end();
363 while (characters < characters_end) {
364 // Use strict conversion to detect unpaired surrogates.
365 ConversionResult result = convertUTF16ToUTF8(
366 &characters, characters_end, &buffer, buffer_end, /* strict= */ true);
367 DCHECK_NE(result, targetExhausted);
368 // Conversion fails when there is an unpaired surrogate. Put
369 // replacement character (U+FFFD) instead of the unpaired
370 // surrogate.
371 if (result != conversionOK) {
372 DCHECK_LE(0xD800, *characters);
373 DCHECK_LE(*characters, 0xDFFF);
374 // There should be room left, since one UChar hasn't been
375 // converted.
376 DCHECK_LE(buffer + 3, buffer_end);
377 putUTF8Triple(buffer, replacementCharacter);
378 ++characters;
379 }
380 }
381
382 output.resize(buffer - output.data());
383 return output;
384}
385
386std::basic_string<UChar> UTF8ToUTF16(const char* stringStart, size_t length) {
387 if (!stringStart || !length) return std::basic_string<UChar>();
388 std::vector<UChar> buffer(length);
389 UChar* bufferStart = buffer.data();
390
391 UChar* bufferCurrent = bufferStart;
392 const char* stringCurrent = reinterpret_cast<const char*>(stringStart);
393 if (convertUTF8ToUTF16(&stringCurrent,
394 reinterpret_cast<const char*>(stringStart + length),
395 &bufferCurrent, bufferCurrent + buffer.size(), nullptr,
396 true) != conversionOK)
397 return std::basic_string<UChar>();
398 size_t utf16Length = bufferCurrent - bufferStart;
399 return std::basic_string<UChar>(bufferStart, bufferStart + utf16Length);
400}
401
402} // namespace v8_inspector
TNode< Object > target
std::optional< TNode< JSArray > > a
ZoneVector< RpoNumber > & result
InstructionOperand source
unsigned short uint16_t
Definition unicode.cc:39
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
std::basic_string< UChar > UTF8ToUTF16(const char *stringStart, size_t length)
char16_t UChar
Definition string-16.h:22
std::string UTF16ToUTF8(const UChar *stringStart, size_t length)
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK_NE(v1, v2)
Definition logging.h:486
#define U_IS_SURROGATE(c)
#define U_IS_SUPPLEMENTARY(c)
#define U16_TRAIL(supplementary)
#define U16_LEAD(supplementary)
#define U_IS_BMP(c)