v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
unicode-decoder.cc
Go to the documentation of this file.
1// Copyright 2014 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
8#include "src/utils/memcopy.h"
9
10#if V8_ENABLE_WEBASSEMBLY
11#include "third_party/utf8-decoder/generalized-utf8-decoder.h"
12#endif
13
14namespace v8 {
15namespace internal {
16
17namespace {
// Per-decoder policy bundle. Only declared here; each specialization in this
// file provides IsInvalidSurrogatePair(), kAllowIncompleteSequences and a
// DfaDecoder type alias, which the decoding routines reach through
// DecoderTraits<Decoder>.
18template <class Decoder>
19struct DecoderTraits;
20
// Traits for Utf8Decoder: bytes are run through Utf8DfaDecoder, and
// kAllowIncompleteSequences is true, which the decoding routines in this file
// test before recovering from a rejected byte instead of returning early.
21template <>
22struct DecoderTraits<Utf8Decoder> {
// Never reports a pair as invalid in the visible code (returns false).
23 static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
24 // The DfaDecoder will only ever decode Unicode scalar values, and all
25 // sequences of USVs are valid.
// NOTE(review): the listing numbering jumps from 25 to 28, so two lines of
// this function are not shown in this view.
28 return false;
29 }
30 static const bool kAllowIncompleteSequences = true;
31 using DfaDecoder = Utf8DfaDecoder;
32};
33
34#if V8_ENABLE_WEBASSEMBLY
// Traits for Wtf8Decoder (inside the V8_ENABLE_WEBASSEMBLY section): bytes are
// run through GeneralizedUtf8DfaDecoder and kAllowIncompleteSequences is
// false.
35template <>
36struct DecoderTraits<Wtf8Decoder> {
// A (lead, trail) pair is reported as invalid exactly when
// unibrow::Utf16::IsSurrogatePair(lead, trail) is true.
37 static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
38 return unibrow::Utf16::IsSurrogatePair(lead, trail);
39 }
40 static const bool kAllowIncompleteSequences = false;
41 using DfaDecoder = GeneralizedUtf8DfaDecoder;
42};
43
// Traits for StrictUtf8Decoder (inside the V8_ENABLE_WEBASSEMBLY section):
// same Utf8DfaDecoder as the Utf8Decoder traits, but
// kAllowIncompleteSequences is false.
44template <>
45struct DecoderTraits<StrictUtf8Decoder> {
// Never reports a pair as invalid in the visible code (returns false).
46 static bool IsInvalidSurrogatePair(uint32_t lead, uint32_t trail) {
47 // The DfaDecoder will only ever decode Unicode scalar values, and all
48 // sequences of USVs are valid.
// NOTE(review): the listing numbering jumps from 48 to 51, so two lines of
// this function are not shown in this view.
51 return false;
52 }
53 static const bool kAllowIncompleteSequences = false;
54 using DfaDecoder = Utf8DfaDecoder;
55};
56#endif // V8_ENABLE_WEBASSEMBLY
57} // namespace
58
// Constructor of Utf8DecoderBase<Decoder>. The explicit instantiations at the
// end of this file give its signature as
// Utf8DecoderBase(base::Vector<const uint8_t> data).
//
// It starts with encoding_ = kAscii and utf16_length_ = non_ascii_start_, then
// scans the bytes from non_ascii_start_ onwards through Traits::DfaDecoder.
//
// NOTE(review): this listing is missing numbered lines 60 (the signature), 75,
// 80, 91, 93-94, 100, 105, 108-111, 118, 120-122 and 124. No assignment to
// encoding_ and no update of utf16_length_ is visible after the initializer
// list; the comments below describe only what the visible lines do.
59template <class Decoder>
61 : encoding_(Encoding::kAscii),
62 non_ascii_start_(NonAsciiStart(data.begin(), data.length())),
63 utf16_length_(non_ascii_start_) {
64 using Traits = DecoderTraits<Decoder>;
// NonAsciiStart() returned the full length (presumably: every byte is ASCII —
// confirm against NonAsciiStart), so the initial values already stand.
65 if (non_ascii_start_ == data.length()) return;
66
67 bool is_one_byte = true;
68 auto state = Traits::DfaDecoder::kAccept;
// `current` is the value being assembled by the DFA; `previous` is the last
// completed value, kept so consecutive values can be checked as a pair.
69 uint32_t current = 0;
70 uint32_t previous = 0;
71 const uint8_t* cursor = data.begin() + non_ascii_start_;
72 const uint8_t* end = data.begin() + data.length();
73
74 while (cursor < end) {
// Fast path, only taken while the DFA is in kAccept. The opening half of this
// condition (listing line 75) is not shown.
76 state == Traits::DfaDecoder::kAccept)) {
77 DCHECK_EQ(0u, current);
78 DCHECK(!Traits::IsInvalidSurrogatePair(previous, *cursor));
79 previous = *cursor;
81 cursor++;
82 continue;
83 }
84
// Remember the state before this byte so a rejection can tell whether it
// happened in the middle of a multibyte sequence.
85 auto previous_state = state;
86 Traits::DfaDecoder::Decode(*cursor, &state, &current);
87 if (state < Traits::DfaDecoder::kAccept) {
88 DCHECK_EQ(state, Traits::DfaDecoder::kReject);
89 if (Traits::kAllowIncompleteSequences) {
// Recover: reset the DFA and mark the result as no longer one-byte.
90 state = Traits::DfaDecoder::kAccept;
92 is_one_byte = false;
95 current = 0;
96 // If we were trying to continue a multibyte sequence, try this byte
97 // again.
// `continue` skips the cursor++ at the bottom of the loop, so the same byte is
// fed to the freshly reset DFA.
98 if (previous_state != Traits::DfaDecoder::kAccept) continue;
99 } else {
// Decoders that disallow incomplete sequences stop scanning here.
101 return;
102 }
103 } else if (state == Traits::DfaDecoder::kAccept) {
// A full value was decoded; stop if the traits reject it following `previous`.
104 if (Traits::IsInvalidSurrogatePair(previous, current)) {
106 return;
107 }
112 current = 0;
113 }
114 cursor++;
115 }
116
// End of input: three cases are distinguished (clean finish in kAccept,
// unfinished sequence with recovery allowed, unfinished sequence without). The
// bodies of all three branches are among the lines missing from this listing.
117 if (state == Traits::DfaDecoder::kAccept) {
119 } else if (Traits::kAllowIncompleteSequences) {
123 } else {
125 }
126}
127
// Utf8DecoderBase<Decoder>::Decode<Char>. The explicit instantiations at the
// end of this file give its signature as
// void Decode(Char* out, base::Vector<const uint8_t> data), with Char being
// uint8_t or uint16_t.
//
// Writes the decoded characters of `data` to `out`. It must not be called on a
// decoder for which is_invalid() is true (DCHECK below).
//
// NOTE(review): this listing is missing numbered lines 130-131 (the
// signature), 144 and 164; the comments describe only what the visible lines
// do.
128template <class Decoder>
129template <typename Char>
132 using Traits = DecoderTraits<Decoder>;
133 DCHECK(!is_invalid());
// The first non_ascii_start_ bytes are copied to `out` with CopyChars, without
// going through the DFA.
134 CopyChars(out, data.begin(), non_ascii_start_);
135
136 out += non_ascii_start_;
137
138 auto state = Traits::DfaDecoder::kAccept;
139 uint32_t current = 0;
140 const uint8_t* cursor = data.begin() + non_ascii_start_;
141 const uint8_t* end = data.begin() + data.length();
142
143 while (cursor < end) {
// Fast path, only taken while the DFA is in kAccept: the byte is written to
// `out` unchanged. The opening half of this condition (listing line 144) is
// not shown.
145 state == Traits::DfaDecoder::kAccept)) {
146 DCHECK_EQ(0u, current);
147 *(out++) = static_cast<Char>(*cursor);
148 cursor++;
149 continue;
150 }
151
152 auto previous_state = state;
153 Traits::DfaDecoder::Decode(*cursor, &state, &current);
// Rejected byte with recovery allowed: emit unibrow::Utf8::kBadChar and reset
// the DFA. When recovery is not allowed there is no branch for a rejected
// state in this loop.
// NOTE(review): presumably the DCHECK(!is_invalid()) precondition rules that
// case out — confirm.
154 if (Traits::kAllowIncompleteSequences &&
155 state < Traits::DfaDecoder::kAccept) {
156 state = Traits::DfaDecoder::kAccept;
157 *(out++) = static_cast<Char>(unibrow::Utf8::kBadChar);
158 current = 0;
159 // If we were trying to continue a multibyte sequence, try this byte
160 // again.
// `continue` skips the cursor++ below, so the same byte is decoded again from
// the reset state.
161 if (previous_state != Traits::DfaDecoder::kAccept) continue;
162 } else if (state == Traits::DfaDecoder::kAccept) {
// A full value was decoded: it is written either as a single Char or as a
// lead/trail surrogate pair. The second half of this condition (listing line
// 164) is not shown.
163 if (sizeof(Char) == 1 ||
165 *(out++) = static_cast<Char>(current);
166 } else {
167 *(out++) = unibrow::Utf16::LeadSurrogate(current);
168 *(out++) = unibrow::Utf16::TrailSurrogate(current);
169 }
170 current = 0;
171 }
172 cursor++;
173 }
174
// Input ended in the middle of a sequence: with recovery allowed, one final
// kBadChar is written (without advancing `out`); otherwise the DFA is asserted
// to have finished in kAccept.
175 if (Traits::kAllowIncompleteSequences &&
176 state != Traits::DfaDecoder::kAccept) {
177 *out = static_cast<Char>(unibrow::Utf8::kBadChar);
178 } else {
179 DCHECK_EQ(state, Traits::DfaDecoder::kAccept);
180 }
181}
182
// Explicitly instantiates, for one Decoder type, the Utf8DecoderBase
// constructor and both Decode overloads (uint8_t* and uint16_t* output), each
// marked V8_EXPORT_PRIVATE. The macro is undefined again after its last use.
183#define DEFINE_UNICODE_DECODER(Decoder) \
184 template V8_EXPORT_PRIVATE Utf8DecoderBase<Decoder>::Utf8DecoderBase( \
185 base::Vector<const uint8_t> data); \
186 template V8_EXPORT_PRIVATE void Utf8DecoderBase<Decoder>::Decode( \
187 uint8_t* out, base::Vector<const uint8_t> data); \
188 template V8_EXPORT_PRIVATE void Utf8DecoderBase<Decoder>::Decode( \
189 uint16_t* out, base::Vector<const uint8_t> data)
190

// NOTE(review): listing line 191 is not shown in this view.
192
// Wtf8Decoder and StrictUtf8Decoder are only instantiated when
// V8_ENABLE_WEBASSEMBLY is set, matching the guard around their traits.
193#if V8_ENABLE_WEBASSEMBLY
194DEFINE_UNICODE_DECODER(Wtf8Decoder);
195DEFINE_UNICODE_DECODER(StrictUtf8Decoder);
196#endif // V8_ENABLE_WEBASSEMBLY
197
198#undef DEFINE_UNICODE_DECODER
199
200} // namespace internal
201} // namespace v8
static const uint16_t kMaxChar
Definition unicode.h:142
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126
static const uchar kMaxNonSurrogateCharCode
Definition unicode.h:116
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129
static bool IsSurrogatePair(int lead, int trail)
Definition unicode.h:103
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
static const unsigned kMaxOneByteChar
Definition unicode.h:179
static const uchar kBadChar
Definition unicode.h:175
void Decode(Char *out, base::Vector< const uint8_t > data)
Utf8DecoderBase(base::Vector< const uint8_t > data)
int end
LineAndColumn current
LineAndColumn previous
LiftoffAssembler::CacheState state
void CopyChars(DstType *dst, const SrcType *src, size_t count) V8_NONNULL(1
uint32_t NonAsciiStart(const uint8_t *chars, uint32_t length)
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define DEFINE_UNICODE_DECODER(Decoder)
#define V8_LIKELY(condition)
Definition v8config.h:661