v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
scanner-character-streams.cc
Go to the documentation of this file.
1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
7#include <memory>
8#include <vector>
9
12#include "src/base/strings.h"
13#include "src/common/globals.h"
15#include "src/handles/handles.h"
18#include "src/parsing/scanner.h"
20
21namespace v8 {
22namespace internal {
23
25 public:
27 DCHECK(!string.is_null());
28 if (IsExternalOneByteString(string)) {
29 resource_ = Cast<ExternalOneByteString>(string)->resource();
30 } else {
31 DCHECK(IsExternalTwoByteString(string));
32 resource_ = Cast<ExternalTwoByteString>(string)->resource();
33 }
34 DCHECK(resource_);
35 resource_->Lock();
36 }
37
38 // Copying a lock increases the locking depth.
40 : resource_(other.resource_) {
41 resource_->Lock();
42 }
43
44 ~ScopedExternalStringLock() { resource_->Unlock(); }
45
46 private:
47 // Not nullptr.
49};
50
51namespace {
// Code point 0xFEFF (the byte order mark). The UTF-8 streaming code in this
// file compares the first decoded character of a stream against this value so
// that a leading BOM is not counted or copied into the output buffer.
52const unibrow::uchar kUtf8Bom = 0xFEFF;
53} // namespace
54
// A half-open [start, end) view over a contiguous run of Char values. The
// struct only stores the two pointers; it does not own the memory.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of Chars between start and end. Const so that it can be called on
  // a const Range as well.
  size_t length() const { return static_cast<size_t>(end - start); }

  // True if start is not located at a multiple of sizeof(Char). Any non-zero
  // remainder counts as unaligned, so the check is also correct for Char
  // types wider than two bytes (where the remainder can be 2 or 3).
  bool unaligned_start() const {
    return reinterpret_cast<uintptr_t>(start) % sizeof(Char) != 0;
  }
};
65
66// A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
67template <typename Char>
69 public:
71
72 OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
73 : string_(string), start_offset_(start_offset), length_(end) {}
74
78
79 // The no_gc argument is only here because of the templated way this class
80 // is used along with other implementations that require V8 heap access.
83 return {&string_->GetChars(*no_gc)[start_offset_ + std::min(length_, pos)],
84 &string_->GetChars(*no_gc)[start_offset_ + length_]};
85 }
86
87 static const bool kCanBeCloned = false;
88 static const bool kCanAccessHeap = true;
89
90 private:
92 const size_t start_offset_;
93 const size_t length_;
94};
95
96// A Char stream backed by an off-heap ExternalOneByteString or
97// ExternalTwoByteString.
98template <typename Char>
101
102 public:
103 ExternalStringStream(Tagged<ExternalString> string, size_t start_offset,
104 size_t length)
105 : lock_(string),
106 data_(string->GetChars() + start_offset),
107 length_(length) {}
108
110 : lock_(other.lock_),
111 data_(other.data_),
112 length_(other.length_) {}
113
114 // The no_gc argument is only here because of the templated way this class
115 // is used along with other implementations that require V8 heap access.
117 DisallowGarbageCollection* no_gc = nullptr) {
118 return {&data_[std::min(length_, pos)], &data_[length_]};
119 }
120
121 static const bool kCanBeCloned = true;
122 static const bool kCanAccessHeap = false;
123
124 private:
126 const Char* const data_;
127 const size_t length_;
128};
129
130// A Char stream backed by a C array. Testing only.
131template <typename Char>
133 public:
134 TestingStream(const Char* data, size_t length)
135 : data_(data), length_(length) {}
136 // The no_gc argument is only here because of the templated way this class
137 // is used along with other implementations that require V8 heap access.
139 DisallowGarbageCollection* no_gc = nullptr) {
140 return {&data_[std::min(length_, pos)], &data_[length_]};
141 }
142
143 static const bool kCanBeCloned = true;
144 static const bool kCanAccessHeap = false;
145
146 private:
147 const Char* const data_;
148 const size_t length_;
149};
150
151// A Char stream backed by multiple source-stream provided off-heap chunks.
152template <typename Char>
154 public:
156 : source_(source), chunks_(std::make_shared<std::vector<Chunk>>()) {}
157
159 : source_(nullptr),
160 chunks_(other.chunks_) {}
161
162 // The no_gc argument is only here because of the templated way this class
163 // is used along with other implementations that require V8 heap access.
165 DisallowGarbageCollection* no_gc = nullptr) {
166 Chunk& chunk = FindChunk(pos, stats);
167 size_t buffer_end = chunk.length;
168 size_t buffer_pos = std::min(buffer_end, pos - chunk.position);
169 return {&chunk.data.get()[buffer_pos], &chunk.data.get()[buffer_end]};
170 }
171
172 static const bool kCanBeCloned = true;
173 static const bool kCanAccessHeap = false;
174
175 private:
 // One block of stream data plus where it sits in the overall stream.
 // The raw pointer handed to the constructor is stored in a std::unique_ptr,
 // so the Chunk takes ownership of the array and frees it on destruction.
176 struct Chunk {
177 Chunk(const Char* const data, size_t position, size_t length)
178 : data(data), position(position), length(length) {}
 // Owned chunk contents.
179 std::unique_ptr<const Char[]> data;
180 // The logical position of data.
181 const size_t position;
 // Number of elements in data; ProcessChunk in this class stores the byte
 // count divided by sizeof(Char) here.
182 const size_t length;
 // Logical position one past the last element of this chunk.
183 size_t end_position() const { return position + length; }
184 };
185
187 while (V8_UNLIKELY(chunks_->empty())) FetchChunk(size_t{0}, stats);
188
189 // Walk forwards while the position is in front of the current chunk.
190 while (position >= chunks_->back().end_position() &&
191 chunks_->back().length > 0) {
192 FetchChunk(chunks_->back().end_position(), stats);
193 }
194
195 // Walk backwards.
196 for (auto reverse_it = chunks_->rbegin(); reverse_it != chunks_->rend();
197 ++reverse_it) {
198 if (reverse_it->position <= position) return *reverse_it;
199 }
200
201 UNREACHABLE();
202 }
203
204 virtual void ProcessChunk(const uint8_t* data, size_t position,
205 size_t length) {
206 // Incoming data has to be aligned to Char size.
207 DCHECK_EQ(0, length % sizeof(Char));
208 chunks_->emplace_back(reinterpret_cast<const Char*>(data), position,
209 length / sizeof(Char));
210 }
211
212 void FetchChunk(size_t position, RuntimeCallStats* stats) {
213 // Cloned ChunkedStreams have a null source, and therefore can't fetch any
214 // new data.
216
217 const uint8_t* data = nullptr;
218 size_t length;
219 {
220 RCS_SCOPE(stats, RuntimeCallCounterId::kGetMoreDataCallback);
221 length = source_->GetMoreData(&data);
222 }
223 ProcessChunk(data, position, length);
224 }
225
227
228 protected:
229 std::shared_ptr<std::vector<struct Chunk>> chunks_;
230};
231
232// Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
233// Chars are buffered if either the underlying stream isn't utf-16 or the
234// underlying utf-16 stream might move (is on-heap).
235template <template <typename T> class ByteStream>
237 public:
238 template <class... TArgs>
241 }
242
243 bool can_be_cloned() const final {
244 return ByteStream<uint16_t>::kCanBeCloned;
245 }
246
247 std::unique_ptr<Utf16CharacterStream> Clone() const override {
249 return std::unique_ptr<Utf16CharacterStream>(
251 }
252
253 protected:
254 bool ReadBlock(size_t position) final {
258
260 Range<uint8_t> range =
261 byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
262 if (range.length() == 0) {
264 return false;
265 }
266
267 size_t length = std::min({kBufferSize, range.length()});
268 i::CopyChars(buffer_, range.start, length);
270 return true;
271 }
272
273 bool can_access_heap() const final {
274 return ByteStream<uint8_t>::kCanAccessHeap;
275 }
276
277 private:
280
281 static const size_t kBufferSize = 512;
283 ByteStream<uint8_t> byte_stream_;
284};
285
286// Provides an unbuffered utf-16 view on the bytes from the underlying
287// ByteStream.
288template <template <typename T> class ByteStream>
290 public:
291 template <class... TArgs>
294 }
295
296 bool can_access_heap() const final {
297 return ByteStream<uint16_t>::kCanAccessHeap;
298 }
299
300 bool can_be_cloned() const final {
301 return ByteStream<uint16_t>::kCanBeCloned;
302 }
303
304 std::unique_ptr<Utf16CharacterStream> Clone() const override {
305 return std::unique_ptr<Utf16CharacterStream>(
307 }
308
309 protected:
310 bool ReadBlock(size_t position) final {
313 Range<uint16_t> range =
314 byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
315 if (range.length() == 0) {
316 // We should not set the buffer pointers to nullptr to avoid undefined
317 // behavior, for example when incrementing buffer_cursor_. So instead use
318 // this static array.
319 static const uint16_t empty_buffer[1] = {0};
320 buffer_start_ = empty_buffer;
321 buffer_end_ = empty_buffer;
322 buffer_cursor_ = empty_buffer;
323 return false;
324 }
325 buffer_start_ = range.start;
326 buffer_end_ = range.end;
328
329 DCHECK(!range.unaligned_start());
331 return true;
332 }
333
336
337 ByteStream<uint16_t> byte_stream_;
338};
339
340// Provides an unbuffered utf-16 view on the bytes from the underlying
341// ByteStream.
343 : public UnbufferedCharacterStream<OnHeapStream> {
344 public:
345 template <class... TArgs>
346 RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
348 isolate_(isolate) {
349 isolate->main_thread_local_heap()->AddGCEpilogueCallback(
351 }
352
353 private:
358
359 static void UpdateBufferPointersCallback(void* stream) {
360 reinterpret_cast<RelocatingCharacterStream*>(stream)
362 }
363
366 Range<uint16_t> range =
368 if (range.start != buffer_start_) {
369 buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
370 buffer_start_ = range.start;
371 buffer_end_ = range.end;
372 }
373 }
374
376};
377
378// ----------------------------------------------------------------------------
379// BufferedUtf16CharacterStreams
380//
381// A buffered character stream based on a random access character
382// source (ReadBlock can be called with pos() pointing to any position,
383// even positions before the current).
384//
385// TODO(verwaest): Remove together with Utf8 external streaming streams.
387 public:
389
390 protected:
391 static const size_t kBufferSize = 512;
392
393 bool ReadBlock(size_t position) final;
394
395 // FillBuffer should read up to kBufferSize characters at position and store
396 // them into buffer_[0..]. It returns the number of characters stored.
397 virtual size_t FillBuffer(size_t position) = 0;
398
399 // Fixed sized buffer that this class reads from.
400 // The base class' buffer_start_ should always point to buffer_.
402};
403
406
417
418// ----------------------------------------------------------------------------
419// Windows1252CharacterStream - chunked streaming of windows-1252 data.
420//
421// Similar to BufferedCharacterStream, but also translates the windows-1252
422// characters that are incompatible with their latin-1 equivalents.
423
424namespace {
425
// Lookup table indexed by a windows-1252 byte value; each entry is the UTF-16
// code unit that byte is translated to. Entries 0x00-0x7F and 0xA0-0xFF are
// equal to their index. Entries in 0x80-0x9F are the ones that differ from
// their index, except 0x81, 0x8D, 0x8F, 0x90 and 0x9D, which map to
// themselves.
426static const base::uc16 kWindows1252ToUC16[256] = {
427 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
428 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
429 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
430 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
431 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27
432 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F
433 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37
434 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F
435 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47
436 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F
437 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57
438 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F
439 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67
440 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F
441 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77
442 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F
443 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
444 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
445 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
446 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
447 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7
448 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF
449 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7
450 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF
451 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7
452 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF
453 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7
454 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF
455 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7
456 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF
457 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7
458 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF // F8-FF
459};
460
461} // namespace
462
464 public:
466 size_t pos, ScriptCompiler::ExternalSourceStream* source_stream)
467 : byte_stream_(source_stream) {
469 }
470
471 bool can_be_cloned() const final {
473 }
474
475 std::unique_ptr<Utf16CharacterStream> Clone() const override {
477 return std::unique_ptr<Utf16CharacterStream>(
478 new Windows1252CharacterStream(*this));
479 }
480
481 protected:
482 bool ReadBlock(size_t position) final {
486
488 Range<uint8_t> range =
490 if (range.length() == 0) {
492 return false;
493 }
494
495 size_t length = std::min({kBufferSize, range.length()});
496 std::transform(range.start, range.start + length, &buffer_[0],
497 [](uint8_t c) { return kWindows1252ToUC16[c]; });
499 return true;
500 }
501
502 bool can_access_heap() const final {
504 }
505
506 private:
509
510 static const size_t kBufferSize = 512;
513};
514
515// ----------------------------------------------------------------------------
516// Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
517//
518// This implementation is fairly complex, since data arrives in chunks which
519// may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
520// character position is tricky because the byte position cannot be derived
521// from the character position.
522//
523// TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
524// instead so we don't need to buffer.
525
527 public:
530 : chunks_(std::make_shared<std::vector<Chunk>>()),
531 current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
532 source_stream_(source_stream) {}
534
535 bool can_access_heap() const final { return false; }
536
537 bool can_be_cloned() const final { return true; }
538
539 std::unique_ptr<Utf16CharacterStream> Clone() const override {
540 return std::unique_ptr<Utf16CharacterStream>(
541 new Utf8ExternalStreamingStream(*this));
542 }
543
544 protected:
545 size_t FillBuffer(size_t position) final;
546
547 private:
548 // A position within the data stream. It stores:
549 // - The 'physical' position (# of bytes in the stream),
550 // - the 'logical' position (# of ucs-2 characters, also within the stream),
551 // - a possibly incomplete utf-8 char at the current 'physical' position.
558
559 // Position contains a StreamPosition and the index of the chunk the position
560 // points into. (The chunk_no could be derived from pos, but that'd be
561 // an expensive search through all chunks.)
562 struct Position {
563 size_t chunk_no;
565 };
566
567 // A chunk in the list of chunks, containing:
568 // - The chunk data (data pointer and length), and
569 // - the position at the first byte of the chunk.
570 struct Chunk {
571 Chunk(const uint8_t* data, size_t length, StreamPosition start)
572 : data(data), length(length), start(start) {}
573 std::unique_ptr<const uint8_t[]> data;
574 size_t length;
576 };
577
579 V8_NOEXCEPT : chunks_(source_stream.chunks_),
580 current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
581 source_stream_(nullptr) {}
582
583 // Within the current chunk, skip forward from current_ towards position.
584 bool SkipToPosition(size_t position);
585 // Within the current chunk, fill the buffer_ (while it has capacity).
587 // Fetch a new chunk (assuming current_ is at the end of the current data).
588 bool FetchChunk();
589 // Search through the chunks and set current_ to point to the given position.
590 // (This call is potentially expensive.)
591 void SearchPosition(size_t position);
592
593 Chunk& GetChunk(size_t chunk_no) { return (*chunks_)[chunk_no]; }
594
595 std::shared_ptr<std::vector<Chunk>> chunks_;
598};
599
601 DCHECK_LE(current_.pos.chars, position); // We can only skip forward.
602
603 // Already there? Then return immediately.
604 if (current_.pos.chars == position) return true;
605
606 const Chunk& chunk = GetChunk(current_.chunk_no);
608
609 unibrow::Utf8::State state = chunk.start.state;
610 uint32_t incomplete_char = chunk.start.incomplete_char;
611 size_t it = current_.pos.bytes - chunk.start.bytes;
612 const uint8_t* cursor = &chunk.data.get()[it];
613 const uint8_t* end = &chunk.data.get()[chunk.length];
614
615 size_t chars = current_.pos.chars;
616
617 if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
618 while (cursor < end) {
620 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
621 if (t == unibrow::Utf8::kIncomplete) continue;
622 if (t != kUtf8Bom) {
623 chars++;
625 }
626 break;
627 }
628 }
629
630 while (cursor < end && chars < position) {
632 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
634 chars++;
636 }
637 }
638
639 current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data.get());
640 current_.pos.chars = chars;
641 current_.pos.incomplete_char = incomplete_char;
643 current_.chunk_no += (cursor == end);
644
645 return current_.pos.chars == position;
646}
647
652
653 const Chunk& chunk = GetChunk(current_.chunk_no);
654
655 // The buffer_ is writable, but buffer_*_ members are const. So we get a
656 // non-const pointer into buffer that points to the same char as buffer_end_.
657 uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
658 DCHECK_EQ(output_cursor, buffer_end_);
659
661 uint32_t incomplete_char = current_.pos.incomplete_char;
662
663 // If the current chunk is the last (empty) chunk we'll have to process
664 // any left-over, partial characters.
665 if (chunk.length == 0) {
669 *output_cursor = static_cast<base::uc16>(t);
670 buffer_end_++;
674 }
675 return;
676 }
677
678 size_t it = current_.pos.bytes - chunk.start.bytes;
679 const uint8_t* cursor = chunk.data.get() + it;
680 const uint8_t* end = chunk.data.get() + chunk.length;
681
682 // Deal with possible BOM.
683 if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
684 while (cursor < end) {
686 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
687 if (V8_LIKELY(t < kUtf8Bom)) {
688 *(output_cursor++) =
689 static_cast<base::uc16>(t); // The most frequent case.
690 } else if (t == unibrow::Utf8::kIncomplete) {
691 continue;
692 } else if (t == kUtf8Bom) {
693 // BOM detected at beginning of the stream. Don't copy it.
695 *(output_cursor++) = static_cast<base::uc16>(t);
696 } else {
697 *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
698 *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
699 }
700 break;
701 }
702 }
703
704 const uint16_t* max_buffer_end = buffer_start_ + kBufferSize;
705 while (cursor < end && output_cursor + 1 < max_buffer_end) {
707 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
709 *(output_cursor++) =
710 static_cast<base::uc16>(t); // The most frequent case.
711 } else if (t == unibrow::Utf8::kIncomplete) {
712 continue;
713 } else {
714 *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
715 *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
716 }
717 // Fast path for ascii sequences.
718 size_t remaining = end - cursor;
719 size_t max_buffer = max_buffer_end - output_cursor;
720 int max_length = static_cast<int>(std::min(remaining, max_buffer));
721 DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
722 int ascii_length = NonAsciiStart(cursor, max_length);
723 CopyChars(output_cursor, cursor, ascii_length);
724 cursor += ascii_length;
725 output_cursor += ascii_length;
726 }
727
728 current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data.get());
729 current_.pos.chars += (output_cursor - buffer_end_);
730 current_.pos.incomplete_char = incomplete_char;
732 current_.chunk_no += (cursor == end);
733
734 buffer_end_ = output_cursor;
735}
736
738 RCS_SCOPE(runtime_call_stats(), RuntimeCallCounterId::kGetMoreDataCallback);
740 DCHECK(chunks_->empty() || chunks_->back().length != 0);
741
742 // Clone Utf8ExternalStreamingStreams have a null source stream, and
743 // therefore can't fetch any new data.
745
746 // Utf8ExternalStreamingStreams that have been cloned are not allowed to fetch
747 // any more.
748 DCHECK_EQ(chunks_.use_count(), 1);
749
750 const uint8_t* chunk = nullptr;
751 size_t length = source_stream_->GetMoreData(&chunk);
752 chunks_->emplace_back(chunk, length, current_.pos);
753 return length > 0;
754}
755
757 // If current_ already points to the right position, we're done.
758 //
759 // This is expected to be the common case, since we typically call
760 // FillBuffer right after the current buffer.
761 if (current_.pos.chars == position) return;
762
763 // No chunks. Fetch at least one, so we can assume !chunks_->empty() below.
764 if (chunks_->empty()) {
768 FetchChunk();
769 }
770
771 // Search for the last chunk whose start position is less or equal to
772 // position.
773 size_t chunk_no = chunks_->size() - 1;
774 while (chunk_no > 0 && GetChunk(chunk_no).start.chars > position) {
775 chunk_no--;
776 }
777
778 // Did we find the terminating (zero-length) chunk? Then we're seeking
779 // behind the end of the data, and position does not exist.
780 // Set current_ to point to the terminating chunk.
781 if (GetChunk(chunk_no).length == 0) {
782 current_ = {chunk_no, GetChunk(chunk_no).start};
783 return;
784 }
785
786 // Did we find the non-last chunk? Then our position must be within chunk_no.
787 if (chunk_no + 1 < chunks_->size()) {
788 // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
789 // (Many web sites declare utf-8 encoding, but use only (or almost only) the
790 // ASCII subset for their JavaScript sources. We can exploit this, by
791 // checking whether the # bytes in a chunk are equal to the # chars, and if
792 // so avoid the expensive SkipToPosition.)
793 bool ascii_only_chunk =
794 GetChunk(chunk_no).start.incomplete_char == 0 &&
795 (GetChunk(chunk_no + 1).start.bytes - GetChunk(chunk_no).start.bytes) ==
796 (GetChunk(chunk_no + 1).start.chars -
797 GetChunk(chunk_no).start.chars);
798 if (ascii_only_chunk) {
799 size_t skip = position - GetChunk(chunk_no).start.chars;
800 current_ = {chunk_no,
801 {GetChunk(chunk_no).start.bytes + skip,
802 GetChunk(chunk_no).start.chars + skip, 0,
803 unibrow::Utf8::State::kAccept}};
804 } else {
805 current_ = {chunk_no, GetChunk(chunk_no).start};
807 }
808
809 // Since position was within the chunk, SkipToPosition should have found
810 // something.
812 return;
813 }
814
815 // What's left: We're in the last, non-terminating chunk. Our position
816 // may be in the chunk, but it may also be in 'future' chunks, which we'll
817 // have to obtain.
818 DCHECK_EQ(chunk_no, chunks_->size() - 1);
819 current_ = {chunk_no, GetChunk(chunk_no).start};
820 bool have_more_data = true;
821 bool found = SkipToPosition(position);
822 while (have_more_data && !found) {
824 have_more_data = FetchChunk();
825 found = have_more_data && SkipToPosition(position);
826 }
827
828 // We'll return with a position != the desired position only if we're out
829 // of data. In that case, we'll point to the terminating chunk.
831 DCHECK_EQ(have_more_data, chunks_->back().length != 0);
832 DCHECK_IMPLIES(!found, !have_more_data);
833 DCHECK_IMPLIES(!found, current_.chunk_no == chunks_->size() - 1);
834}
835
839
841 bool out_of_data = current_.chunk_no != chunks_->size() &&
844
845 if (out_of_data) return 0;
846
847 // Fill the buffer, until we have at least one char (or are out of data).
848 // (The embedder might give us 1-byte blocks within a utf-8 char, so we
849 // can't guarantee progress with one chunk. Thus we iterate.)
850 while (!out_of_data && buffer_cursor_ == buffer_end_) {
851 // At end of current data, but there might be more? Then fetch it.
852 if (current_.chunk_no == chunks_->size()) {
853 out_of_data = !FetchChunk();
854 }
856 }
857
859 static_cast<size_t>(buffer_end_ - buffer_cursor_));
861}
862
863// ----------------------------------------------------------------------------
864// ScannerStream: Create stream instances.
865
867 Handle<String> data) {
868 return ScannerStream::For(isolate, data, 0, data->length());
869}
870
872 int start_pos, int end_pos) {
873 CHECK_GE(start_pos, 0);
874 CHECK_LE(start_pos, end_pos);
875 CHECK_LE(end_pos, data->length());
876 size_t start_offset = 0;
877 if (IsSlicedString(*data)) {
879 start_offset = string->offset();
880 Tagged<String> parent = string->parent();
881 if (IsThinString(parent)) parent = Cast<ThinString>(parent)->actual();
882 data = handle(parent, isolate);
883 } else {
884 data = String::Flatten(isolate, data);
885 }
886 if (IsExternalOneByteString(*data)) {
888 static_cast<size_t>(start_pos), Cast<ExternalOneByteString>(*data),
889 start_offset, static_cast<size_t>(end_pos));
890 } else if (IsExternalTwoByteString(*data)) {
892 static_cast<size_t>(start_pos), Cast<ExternalTwoByteString>(*data),
893 start_offset, static_cast<size_t>(end_pos));
894 } else if (IsSeqOneByteString(*data)) {
896 static_cast<size_t>(start_pos), Cast<SeqOneByteString>(data),
897 start_offset, static_cast<size_t>(end_pos));
898 } else if (IsSeqTwoByteString(*data)) {
899 return new RelocatingCharacterStream(
900 isolate, static_cast<size_t>(start_pos), Cast<SeqTwoByteString>(data),
901 start_offset, static_cast<size_t>(end_pos));
902 } else {
903 UNREACHABLE();
904 }
905}
906
907std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
908 const char* data) {
909 return ScannerStream::ForTesting(data, strlen(data));
910}
911
912std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
913 const char* data, size_t length) {
914 if (data == nullptr) {
915 DCHECK_EQ(length, 0);
916
917 // We don't want to pass in a null pointer into the character stream,
918 // because then the one-past-the-end pointer is undefined, so instead pass
919 // through this static array.
920 static const char non_null_empty_string[1] = {0};
921 data = non_null_empty_string;
922 }
923
924 return std::unique_ptr<Utf16CharacterStream>(
926 0, reinterpret_cast<const uint8_t*>(data), length));
927}
928
929std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
930 const uint16_t* data, size_t length) {
931 if (data == nullptr) {
932 DCHECK_EQ(length, 0);
933
934 // We don't want to pass in a null pointer into the the character stream,
935 // because then the one-past-the-end pointer is undefined, so instead pass
936 // through this static array.
937 static const uint16_t non_null_empty_uint16_t_string[1] = {0};
938 data = non_null_empty_uint16_t_string;
939 }
940
941 return std::unique_ptr<Utf16CharacterStream>(
942 new UnbufferedCharacterStream<TestingStream>(0, data, length));
943}
944
948 switch (encoding) {
951 static_cast<size_t>(0), source_stream);
953 return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
954 source_stream);
956 return new Windows1252CharacterStream(static_cast<size_t>(0),
957 source_stream);
959 return new Utf8ExternalStreamingStream(source_stream);
960 }
961 UNREACHABLE();
962}
963
964} // namespace internal
965} // namespace v8
SourcePosition pos
static uint16_t LeadSurrogate(uint32_t char_code)
Definition unicode.h:126
static const uchar kMaxNonSurrogateCharCode
Definition unicode.h:116
static uint16_t TrailSurrogate(uint32_t char_code)
Definition unicode.h:129
static const uchar kBufferEmpty
Definition unicode.h:176
static const uchar kBadChar
Definition unicode.h:175
static const uchar kIncomplete
Definition unicode.h:177
static uchar ValueOfIncrementalFinish(State *state)
Definition unicode.cc:224
Utf8DfaDecoder::State State
Definition unicode.h:163
static uchar ValueOfIncremental(const uint8_t **cursor, State *state, Utf8IncrementalBuffer *buffer)
Definition unicode-inl.h:86
virtual size_t GetMoreData(const uint8_t **src)=0
std::unique_ptr< Utf16CharacterStream > Clone() const override
BufferedCharacterStream(const BufferedCharacterStream< ByteStream > &other)
virtual size_t FillBuffer(size_t position)=0
ChunkedStream(ScriptCompiler::ExternalSourceStream *source)
Chunk & FindChunk(size_t position, RuntimeCallStats *stats)
std::shared_ptr< std::vector< struct Chunk > > chunks_
ScriptCompiler::ExternalSourceStream * source_
void FetchChunk(size_t position, RuntimeCallStats *stats)
Range< Char > GetDataAt(size_t pos, RuntimeCallStats *stats, DisallowGarbageCollection *no_gc=nullptr)
virtual void ProcessChunk(const uint8_t *data, size_t position, size_t length)
ChunkedStream(const ChunkedStream &other) V8_NOEXCEPT
ExternalStringStream(Tagged< ExternalString > string, size_t start_offset, size_t length)
ExternalStringStream(const ExternalStringStream &other) V8_NOEXCEPT
typename CharTraits< Char >::ExternalString ExternalString
Range< Char > GetDataAt(size_t pos, RuntimeCallStats *stats, DisallowGarbageCollection *no_gc=nullptr)
LocalHeap * main_thread_local_heap()
Definition isolate.cc:7479
void RemoveGCEpilogueCallback(GCEpilogueCallback *callback, void *data)
Range< Char > GetDataAt(size_t pos, RuntimeCallStats *stats, DisallowGarbageCollection *no_gc)
OnHeapStream(Handle< String > string, size_t start_offset, size_t end)
typename CharTraits< Char >::String String
OnHeapStream(const OnHeapStream &) V8_NOEXCEPT
RelocatingCharacterStream(Isolate *isolate, size_t pos, TArgs... args)
static Utf16CharacterStream * For(Isolate *isolate, Handle< String > data)
static std::unique_ptr< Utf16CharacterStream > ForTesting(const char *data)
const v8::String::ExternalStringResourceBase * resource_
ScopedExternalStringLock(Tagged< ExternalString > string)
ScopedExternalStringLock(const ScopedExternalStringLock &other) V8_NOEXCEPT
static V8_INLINE HandleType< String > Flatten(Isolate *isolate, HandleType< T > string, AllocationType allocation=AllocationType::kYoung)
TestingStream(const Char *data, size_t length)
Range< Char > GetDataAt(size_t pos, RuntimeCallStats *stats, DisallowGarbageCollection *no_gc=nullptr)
UnbufferedCharacterStream(const UnbufferedCharacterStream< ByteStream > &other)
std::unique_ptr< Utf16CharacterStream > Clone() const override
RuntimeCallStats * runtime_call_stats() const
Definition scanner.h:140
const uint16_t * buffer_cursor_
Definition scanner.h:205
Utf8ExternalStreamingStream(ScriptCompiler::ExternalSourceStream *source_stream)
ScriptCompiler::ExternalSourceStream * source_stream_
std::shared_ptr< std::vector< Chunk > > chunks_
Utf8ExternalStreamingStream(const Utf8ExternalStreamingStream &source_stream) V8_NOEXCEPT
std::unique_ptr< Utf16CharacterStream > Clone() const override
Windows1252CharacterStream(const Windows1252CharacterStream &other) V8_NOEXCEPT
std::unique_ptr< Utf16CharacterStream > Clone() const override
Windows1252CharacterStream(size_t pos, ScriptCompiler::ExternalSourceStream *source_stream)
base::OwnedVector< uint8_t > buffer_
Definition assembler.cc:111
int start
int end
base::Vector< const DirectHandle< Object > > args
Definition execution.cc:74
LiftoffAssembler::CacheState state
int position
Definition liveedit.cc:290
STL namespace.
unsigned int uchar
Definition unicode.h:21
uint16_t uc16
Definition strings.h:18
V8_INLINE IndirectHandle< T > handle(Tagged< T > object, Isolate *isolate)
Definition handles-inl.h:72
void CopyChars(DstType *dst, const SrcType *src, size_t count) V8_NONNULL(1
uint32_t NonAsciiStart(const uint8_t *chars, uint32_t length)
Tagged< To > Cast(Tagged< From > value, const v8::SourceLocation &loc=INIT_SOURCE_LOCATION_IN_DEBUG)
Definition casting.h:150
#define RCS_SCOPE(...)
#define V8_NOEXCEPT
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define CHECK_GE(lhs, rhs)
#define CHECK(condition)
Definition logging.h:124
#define CHECK_LE(lhs, rhs)
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_LT(v1, v2)
Definition logging.h:489
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
Chunk(const Char *const data, size_t position, size_t length)
Chunk(const uint8_t *data, size_t length, StreamPosition start)
#define V8_LIKELY(condition)
Definition v8config.h:661
#define V8_UNLIKELY(condition)
Definition v8config.h:660
#define V8_NODISCARD
Definition v8config.h:693