v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
unicode-inl.h
Go to the documentation of this file.
1// Copyright 2007-2010 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_STRINGS_UNICODE_INL_H_
6#define V8_STRINGS_UNICODE_INL_H_
7
9// Include the non-inl header before the rest of the headers.
10
11#include "src/base/logging.h"
12#include "src/utils/utils.h"
13
14namespace unibrow {
15
16#ifndef V8_INTL_SUPPORT
17template <class T, int s>
18bool Predicate<T, s>::get(uchar code_point) {
19 CacheEntry entry = entries_[code_point & kMask];
20 if (entry.code_point() == code_point) return entry.value();
21 return CalculateValue(code_point);
22}
23
24template <class T, int s>
26 bool result = T::Is(code_point);
27 entries_[code_point & kMask] = CacheEntry(code_point, result);
28 return result;
29}
30
31template <class T, int s>
33 CacheEntry entry = entries_[c & kMask];
34 if (entry.code_point_ == c) {
35 if (entry.offset_ == 0) {
36 return 0;
37 } else {
38 result[0] = c + entry.offset_;
39 return 1;
40 }
41 } else {
42 return CalculateValue(c, n, result);
43 }
44}
45
46template <class T, int s>
48 bool allow_caching = true;
49 int length = T::Convert(c, n, result, &allow_caching);
50 if (allow_caching) {
51 if (length == 1) {
52 entries_[c & kMask] = CacheEntry(c, result[0] - c);
53 return 1;
54 } else {
55 entries_[c & kMask] = CacheEntry(c, 0);
56 return 0;
57 }
58 } else {
59 return length;
60 }
61}
62#endif // !V8_INTL_SUPPORT
63
64bool Utf16::HasUnpairedSurrogate(const uint16_t* code_units, size_t length) {
65 for (size_t i = 0; i < length; ++i) {
66 const int code_unit = code_units[i];
67 if (IsLeadSurrogate(code_unit)) {
68 // The current code unit is a leading surrogate. Check if it is followed
69 // by a trailing surrogate.
70 if (i == length - 1) return true;
71 if (!IsTrailSurrogate(code_units[i + 1])) return true;
72 // Skip the paired trailing surrogate.
73 ++i;
74 } else if (IsTrailSurrogate(code_unit)) {
75 // All paired trailing surrogates are skipped above, so this branch is
76 // only for those that are unpaired.
77 return true;
78 }
79 }
80 return false;
81}
82
83// Decodes UTF-8 bytes incrementally, allowing the decoding of bytes as they
84// stream in. This **must** be followed by a call to ValueOfIncrementalFinish
85// when the stream is complete, to ensure incomplete sequences are handled.
86uchar Utf8::ValueOfIncremental(const uint8_t** cursor, State* state,
87 Utf8IncrementalBuffer* buffer) {
88 DCHECK_NOT_NULL(buffer);
89 State old_state = *state;
90 uint8_t next = **cursor;
91 *cursor += 1;
92
93 if (V8_LIKELY(next <= kMaxOneByteChar && old_state == State::kAccept)) {
94 DCHECK_EQ(0u, *buffer);
95 return static_cast<uchar>(next);
96 }
97
98 // So we're at the lead byte of a 2/3/4 sequence, or we're at a continuation
99 // char in that sequence.
100 Utf8DfaDecoder::Decode(next, state, buffer);
101
102 switch (*state) {
103 case State::kAccept: {
104 uchar t = *buffer;
105 *buffer = 0;
106 return t;
107 }
108
109 case State::kReject:
110 *state = State::kAccept;
111 *buffer = 0;
112
113 // If we hit a bad byte, we need to determine if we were trying to start
114 // a sequence or continue one. If we were trying to start a sequence,
115 // that means it's just an invalid lead byte and we need to continue to
116 // the next (which we already did above). If we were already in a
117 // sequence, we need to reprocess this same byte after resetting to the
118 // initial state.
119 if (old_state != State::kAccept) {
120 // We were trying to continue a sequence, so let's reprocess this byte
121 // next time.
122 *cursor -= 1;
123 }
124 return kBadChar;
125
126 default:
127 return kIncomplete;
128 }
129}
130
131unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
132 static const int kMask = ~(1 << 6);
133 if (c <= kMaxOneByteChar) {
134 str[0] = c;
135 return 1;
136 } else {
137 str[0] = 0xC0 | (c >> 6);
138 str[1] = 0x80 | (c & kMask);
139 return 2;
140 }
141}
142
143// Encode encodes the UTF-16 code units c and previous into the given str
144// buffer, and combines surrogate code units into single code points. If
145// replace_invalid is set to true, orphan surrogate code units will be replaced
146// with kBadChar.
147unsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) {
148 static const int kMask = ~(1 << 6);
149 if (c <= kMaxOneByteChar) {
150 str[0] = c;
151 return 1;
152 } else if (c <= kMaxTwoByteChar) {
153 str[0] = 0xC0 | (c >> 6);
154 str[1] = 0x80 | (c & kMask);
155 return 2;
156 } else if (c <= kMaxThreeByteChar) {
159 const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
160 return Encode(str - kUnmatchedSize,
162 Utf16::kNoPreviousCharacter, replace_invalid) -
163 kUnmatchedSize;
164 } else if (replace_invalid &&
166 c = kBadChar;
167 }
168 str[0] = 0xE0 | (c >> 12);
169 str[1] = 0x80 | ((c >> 6) & kMask);
170 str[2] = 0x80 | (c & kMask);
171 return 3;
172 } else {
173 str[0] = 0xF0 | (c >> 18);
174 str[1] = 0x80 | ((c >> 12) & kMask);
175 str[2] = 0x80 | ((c >> 6) & kMask);
176 str[3] = 0x80 | (c & kMask);
177 return 4;
178 }
179}
180
181uchar Utf8::ValueOf(const uint8_t* bytes, size_t length, size_t* cursor) {
182 if (length == 0) return kBadChar;
183 uint8_t first = bytes[0];
184 // Characters between 0000 and 007F are encoded as a single character
185 if (V8_LIKELY(first <= kMaxOneByteChar)) {
186 *cursor += 1;
187 return first;
188 }
189 return CalculateValue(bytes, length, cursor);
190}
191
192unsigned Utf8::LengthOneByte(uint8_t c) {
193 if (c <= kMaxOneByteChar) {
194 return 1;
195 } else {
196 return 2;
197 }
198}
199
200unsigned Utf8::Length(uchar c, int previous) {
201 if (c <= kMaxOneByteChar) {
202 return 1;
203 } else if (c <= kMaxTwoByteChar) {
204 return 2;
205 } else if (c <= kMaxThreeByteChar) {
209 }
210 return 3;
211 } else {
212 return 4;
213 }
214}
215
217 return c < 0xD800u || (c >= 0xE000u && c < 0xFDD0u) ||
218 (c > 0xFDEFu && c <= 0x10FFFFu && (c & 0xFFFEu) != 0xFFFEu &&
219 c != kBadChar);
220}
221
222template <typename Char>
224 char* buffer, size_t capacity,
225 bool write_null, bool replace_invalid_utf8) {
226 constexpr bool kSourceIsOneByte = sizeof(Char) == 1;
227
228 if constexpr (kSourceIsOneByte) {
229 // Only 16-bit characters can contain invalid unicode.
230 replace_invalid_utf8 = false;
231 }
232
233 size_t write_index = 0;
234 const Char* characters = string.begin();
235 size_t content_capacity = capacity - write_null;
236 CHECK_LE(content_capacity, capacity);
238 size_t read_index = 0;
239 for (; read_index < string.size(); read_index++) {
240 Char character = characters[read_index];
241
242 size_t required_capacity;
243 if constexpr (kSourceIsOneByte) {
244 required_capacity = Utf8::LengthOneByte(character);
245 } else {
246 required_capacity = Utf8::Length(character, last);
247 }
248 size_t remaining_capacity = content_capacity - write_index;
249 if (remaining_capacity < required_capacity) {
250 // Not enough space left, so stop here.
251 if (Utf16::IsSurrogatePair(last, character)) {
253 // We're in the middle of a surrogate pair. Delete the first part again.
254 write_index -= Utf8::kSizeOfUnmatchedSurrogate;
255 // We've already read at least one character which is a lead surrogate
256 DCHECK_NE(read_index, 0);
257 --read_index;
258 }
259 break;
260 }
261
262 if constexpr (kSourceIsOneByte) {
263 write_index += Utf8::EncodeOneByte(buffer + write_index, character);
264 } else {
265 // Handle the case where we cut off in the middle of a surrogate pair.
266 if ((read_index + 1 < string.size()) &&
267 Utf16::IsSurrogatePair(character, characters[read_index + 1])) {
268 write_index += Utf8::kSizeOfUnmatchedSurrogate;
269 } else {
270 write_index += Utf8::Encode(buffer + write_index, character, last,
271 replace_invalid_utf8);
272 }
273 }
274
275 last = character;
276 }
277 DCHECK_LE(write_index, capacity);
278
279 if (write_null) {
280 DCHECK_LT(write_index, capacity);
281 buffer[write_index++] = '\0';
282 }
283
284 size_t bytes_written = write_index;
285 size_t characters_processed = read_index;
286 return {bytes_written, characters_processed};
287}
288
289} // namespace unibrow
290
291#endif // V8_STRINGS_UNICODE_INL_H_
int CalculateValue(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:47
int get(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:32
bool get(uchar c)
Definition unicode-inl.h:18
bool CalculateValue(uchar c)
Definition unicode-inl.h:25
static const int kNoPreviousCharacter
Definition unicode.h:102
static bool IsSurrogatePair(int lead, int trail)
Definition unicode.h:103
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition unicode.h:113
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool HasUnpairedSurrogate(const uint16_t *code_units, size_t length)
Definition unicode-inl.h:64
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
static uchar ValueOf(const uint8_t *str, size_t length, size_t *cursor)
static const unsigned kMaxThreeByteChar
Definition unicode.h:181
static const unsigned kMaxOneByteChar
Definition unicode.h:179
uint32_t Utf8IncrementalBuffer
Definition unicode.h:197
static const uchar kBadChar
Definition unicode.h:175
static unsigned Length(uchar chr, int previous)
static uchar CalculateValue(const uint8_t *str, size_t length, size_t *cursor)
Definition unicode.cc:202
static const uchar kIncomplete
Definition unicode.h:177
static unsigned EncodeOneByte(char *out, uint8_t c)
static bool IsValidCharacter(uchar c)
static unsigned LengthOneByte(uint8_t chr)
static const unsigned kMaxTwoByteChar
Definition unicode.h:180
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
static const unsigned kBytesSavedByCombiningSurrogates
Definition unicode.h:186
static const unsigned kSizeOfUnmatchedSurrogate
Definition unicode.h:187
Utf8DfaDecoder::State State
Definition unicode.h:163
static uchar ValueOfIncremental(const uint8_t **cursor, State *state, Utf8IncrementalBuffer *buffer)
Definition unicode-inl.h:86
LineAndColumn previous
ZoneVector< RpoNumber > & result
std::vector< EntryBuilder > entries_
LiftoffAssembler::CacheState state
unsigned int uchar
Definition unicode.h:21
unsigned short uint16_t
Definition unicode.cc:39
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define CHECK_LE(lhs, rhs)
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define DCHECK_NE(v1, v2)
Definition logging.h:486
#define DCHECK_GE(v1, v2)
Definition logging.h:488
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define V8_LIKELY(condition)
Definition v8config.h:661