unicode-inl_8h_source.html

// Copyright 2007-2010 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.


#ifndef V8_STRINGS_UNICODE_INL_H_

#define V8_STRINGS_UNICODE_INL_H_


#include "src/strings/unicode.h"

// Include the non-inl header before the rest of the headers.


#include "src/base/logging.h"

#include "src/utils/utils.h"


namespace unibrow {


#ifndef V8_INTL_SUPPORT

// Returns the predicate value for |code_point|. The cache slot selected by
// (code_point & kMask) is consulted first; if it holds a different code point
// the value is computed (and cached) by CalculateValue().
template <class T, int s>
bool Predicate<T, s>::get(uchar code_point) {
  const CacheEntry cached = entries_[code_point & kMask];
  return cached.code_point() == code_point ? cached.value()
                                           : CalculateValue(code_point);
}


// Evaluates T::Is() for |code_point|, records the answer in the cache slot
// selected by (code_point & kMask) -- replacing whatever was stored there --
// and returns it.
template <class T, int s>
bool Predicate<T, s>::CalculateValue(uchar code_point) {
  const bool is_match = T::Is(code_point);
  entries_[code_point & kMask] = CacheEntry(code_point, is_match);
  return is_match;
}


// Maps |c| (with |n| forwarded to the converter on a cache miss) and writes
// the mapped character(s) to |result|. Returns the number of characters
// written; on a cache hit that is either 0 (nothing written) or 1.
template <class T, int s>
int Mapping<T, s>::get(uchar c, uchar n, uchar* result) {
  const CacheEntry cached = entries_[c & kMask];

  // Slot belongs to a different code point: do the full conversion.
  if (cached.code_point_ != c) return CalculateValue(c, n, result);

  // A cached offset of zero is reported as "no output" and leaves |result|
  // untouched.
  if (cached.offset_ == 0) return 0;

  result[0] = c + cached.offset_;
  return 1;
}


// Runs T::Convert() for |c| and, when the converter permits caching, stores
// the outcome in the cache slot selected by (c & kMask).
//
// Returns the converter's length when caching is disallowed. When caching is
// allowed the return value is 1 for a single-character result and 0 for
// every other length.
template <class T, int s>
int Mapping<T, s>::CalculateValue(uchar c, uchar n, uchar* result) {
  bool cacheable = true;
  const int length = T::Convert(c, n, result, &cacheable);

  if (!cacheable) return length;

  if (length != 1) {
    // Remember "no single-character mapping" as a zero offset.
    entries_[c & kMask] = CacheEntry(c, 0);
    return 0;
  }

  // Cache the mapping as an offset relative to the input character.
  entries_[c & kMask] = CacheEntry(c, result[0] - c);
  return 1;
}


#endif  // !V8_INTL_SUPPORT


// Returns true if the |length| code units starting at |code_units| contain a
// lead surrogate that is not immediately followed by a trail surrogate, or a
// trail surrogate that is not immediately preceded by a lead surrogate.
bool Utf16::HasUnpairedSurrogate(const uint16_t* code_units, size_t length) {
  size_t pos = 0;
  while (pos < length) {
    const int unit = code_units[pos];

    if (IsLeadSurrogate(unit)) {
      // A lead surrogate needs a trail surrogate right behind it; running
      // off the end of the input counts as unpaired.
      const size_t next = pos + 1;
      if (next == length) return true;
      if (!IsTrailSurrogate(code_units[next])) return true;
      // Step over both halves of the pair.
      pos = next + 1;
      continue;
    }

    // Trail surrogates that belong to a pair were consumed together with
    // their lead above, so any trail surrogate seen here stands alone.
    if (IsTrailSurrogate(unit)) return true;

    ++pos;
  }
  return false;
}


// Incremental UTF-8 decoder: consumes one byte from |*cursor| per call so
// that input can be decoded as it streams in. |state| and |buffer| carry the
// decoder state between calls.
//
// Returns the decoded character once a sequence is complete, kIncomplete
// while a multi-byte sequence is still in progress, and kBadChar when the
// byte is rejected.
//
// Once the stream has ended, ValueOfIncrementalFinish **must** be called so
// that a trailing incomplete sequence is handled.
uchar Utf8::ValueOfIncremental(const uint8_t** cursor, State* state,
                               Utf8IncrementalBuffer* buffer) {
  DCHECK_NOT_NULL(buffer);
  const State state_on_entry = *state;
  const bool was_between_sequences = (state_on_entry == State::kAccept);

  // Consume one byte.
  uint8_t byte = **cursor;
  ++*cursor;

  // Fast path: a one-byte character that is not part of a pending sequence.
  if (V8_LIKELY(byte <= kMaxOneByteChar && was_between_sequences)) {
    DCHECK_EQ(0u, *buffer);
    return static_cast<uchar>(byte);
  }

  // Otherwise this is either the lead byte of a 2/3/4-byte sequence or a
  // byte arriving in the middle of one; let the DFA advance.
  Utf8DfaDecoder::Decode(byte, state, buffer);

  if (*state == State::kAccept) {
    // Sequence complete: hand out the accumulated value and clear the
    // buffer for the next sequence.
    const uchar decoded = *buffer;
    *buffer = 0;
    return decoded;
  }

  if (*state != State::kReject) return kIncomplete;

  // Rejected byte: reset the decoder to its initial state.
  *state = State::kAccept;
  *buffer = 0;

  // If the decoder was between sequences, the byte was simply an invalid
  // lead byte and has already been skipped. If it was in the middle of a
  // sequence, the byte is handed back so that the next call reprocesses it
  // from the initial state.
  if (!was_between_sequences) --*cursor;
  return kBadChar;
}


// Writes the UTF-8 encoding of the one-byte character |c| to |str| and
// returns the number of bytes written: 1 if |c| does not exceed
// kMaxOneByteChar, otherwise 2. No terminator is written.
unsigned Utf8::EncodeOneByte(char* str, uint8_t c) {
  if (c <= kMaxOneByteChar) {
    str[0] = c;
    return 1;
  }

  // Clearing bit 6 before OR-ing in 0x80 yields a 10xxxxxx continuation byte
  // carrying the low six bits of |c|.
  constexpr int kClearBit6 = ~(1 << 6);
  str[0] = 0xC0 | (c >> 6);
  str[1] = 0x80 | (c & kClearBit6);
  return 2;
}


// Writes the UTF-8 encoding of the UTF-16 code unit |c| to |str| and returns
// the number of bytes by which the output grew.
//
// |previous| is the preceding code unit (or Utf16::kNoPreviousCharacter). If
// |previous| and |c| form a surrogate pair they are combined into a single
// code point, which is encoded starting kSizeOfUnmatchedSurrogate bytes
// *before* |str|; the return value is then reduced by that amount.
// NOTE(review): this presumes the caller already accounted for the lead
// surrogate in those bytes -- confirm against callers.
//
// With |replace_invalid| set, a lead or trail surrogate that is not part of
// such a pair is encoded as kBadChar instead.
unsigned Utf8::Encode(char* str, uchar c, int previous, bool replace_invalid) {
  // Clears bit 6 of a payload. Together with the 0x80 marker this produces a
  // 10xxxxxx continuation byte; any bits above bit 7 that survive the mask
  // are dropped when the value is stored into a char.
  constexpr int kClearBit6 = ~(1 << 6);

  if (c <= kMaxOneByteChar) {
    str[0] = c;
    return 1;
  }

  if (c <= kMaxTwoByteChar) {
    str[0] = 0xC0 | (c >> 6);
    str[1] = 0x80 | (c & kClearBit6);
    return 2;
  }

  if (c > kMaxThreeByteChar) {
    str[0] = 0xF0 | (c >> 18);
    str[1] = 0x80 | ((c >> 12) & kClearBit6);
    str[2] = 0x80 | ((c >> 6) & kClearBit6);
    str[3] = 0x80 | (c & kClearBit6);
    return 4;
  }

  // Three-byte range, which is where the surrogate code units live. The
  // "no previous character" sentinel must never look like a lead surrogate.
  DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));

  if (Utf16::IsSurrogatePair(previous, c)) {
    // Re-encode the combined code point over the slot of the lead surrogate
    // and report only the net growth.
    const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
    const unsigned combined_length =
        Encode(str - kUnmatchedSize, Utf16::CombineSurrogatePair(previous, c),
               Utf16::kNoPreviousCharacter, replace_invalid);
    return combined_length - kUnmatchedSize;
  }

  const bool is_orphan_surrogate =
      Utf16::IsLeadSurrogate(c) || Utf16::IsTrailSurrogate(c);
  if (replace_invalid && is_orphan_surrogate) c = kBadChar;

  str[0] = 0xE0 | (c >> 12);
  str[1] = 0x80 | ((c >> 6) & kClearBit6);
  str[2] = 0x80 | (c & kClearBit6);
  return 3;
}


// Decodes the character at the start of |bytes| (|length| bytes available).
//
// Empty input yields kBadChar and leaves |*cursor| untouched. A first byte
// that does not exceed kMaxOneByteChar is returned as-is and advances
// |*cursor| by one. Everything else is delegated to CalculateValue().
uchar Utf8::ValueOf(const uint8_t* bytes, size_t length, size_t* cursor) {
  if (length == 0) return kBadChar;

  const uint8_t lead = bytes[0];
  // Fast path: characters 0000..007F occupy a single byte.
  if (V8_LIKELY(lead <= kMaxOneByteChar)) {
    ++*cursor;
    return lead;
  }

  return CalculateValue(bytes, length, cursor);
}


// Number of UTF-8 bytes needed for the one-byte character |c|: 1 if |c| does
// not exceed kMaxOneByteChar, otherwise 2.
unsigned Utf8::LengthOneByte(uint8_t c) {
  return c <= kMaxOneByteChar ? 1 : 2;
}


// Number of UTF-8 bytes that |c| contributes to the output, given the
// preceding UTF-16 code unit |previous| (or Utf16::kNoPreviousCharacter).
//
// When |previous| and |c| form a surrogate pair the result is
// kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates rather than 3
// -- presumably the extra bytes needed once the lead surrogate has already
// been counted as an unmatched surrogate.
unsigned Utf8::Length(uchar c, int previous) {
  if (c <= kMaxOneByteChar) return 1;
  if (c <= kMaxTwoByteChar) return 2;
  if (c > kMaxThreeByteChar) return 4;

  // Three-byte range. The "no previous character" sentinel must never look
  // like a lead surrogate.
  DCHECK(!Utf16::IsLeadSurrogate(Utf16::kNoPreviousCharacter));
  if (Utf16::IsSurrogatePair(previous, c)) {
    return kSizeOfUnmatchedSurrogate - kBytesSavedByCombiningSurrogates;
  }
  return 3;
}


// Returns whether |c| is accepted as a valid character. Rejected are:
//   - 0xD800..0xDFFF (the UTF-16 surrogate range),
//   - 0xFDD0..0xFDEF,
//   - anything above 0x10FFFF,
//   - values whose low 16 bits are 0xFFFE or 0xFFFF,
//   - kBadChar itself.
bool Utf8::IsValidCharacter(uchar c) {
  if (c < 0xD800u) return true;
  if (c < 0xE000u) return false;  // 0xD800..0xDFFF
  if (c < 0xFDD0u) return true;
  if (c <= 0xFDEFu) return false;  // 0xFDD0..0xFDEF
  if (c > 0x10FFFFu) return false;
  // Masking off bit 0 catches both ...FFFE and ...FFFF in one comparison.
  if ((c & 0xFFFEu) == 0xFFFEu) return false;
  return c != kBadChar;
}


// Encodes |string| (one-byte or two-byte code units) as UTF-8 into |buffer|,
// which holds |capacity| bytes. Encoding stops early, at a character
// boundary, when the next character no longer fits; a surrogate pair is
// either written completely or not at all.
//
// If |write_null| is set, one byte of |capacity| is reserved for a trailing
// '\0', which is always written and counted in the result.
// |replace_invalid_utf8| is forwarded to the per-character encoder for
// two-byte input and ignored for one-byte input.
//
// Returns {bytes written, code units of |string| consumed}.
template <typename Char>
Utf8::EncodingResult Utf8::Encode(v8::base::Vector<const Char> string,
                                  char* buffer, size_t capacity,
                                  bool write_null, bool replace_invalid_utf8) {
  constexpr bool kSourceIsOneByte = sizeof(Char) == 1;

  // Only 16-bit code units can be invalid Unicode, so one-byte input never
  // needs replacement.
  if constexpr (kSourceIsOneByte) replace_invalid_utf8 = false;

  const Char* const source = string.begin();

  // Set aside the terminator byte up front. The CHECK trips if the
  // subtraction wrapped around (capacity == 0 with write_null set).
  size_t content_capacity = capacity - write_null;
  CHECK_LE(content_capacity, capacity);

  size_t write_index = 0;
  size_t read_index = 0;
  uint16_t previous_unit = Utf16::kNoPreviousCharacter;

  while (read_index < string.size()) {
    const Char unit = source[read_index];

    size_t needed;
    if constexpr (kSourceIsOneByte) {
      needed = Utf8::LengthOneByte(unit);
    } else {
      needed = Utf8::Length(unit, previous_unit);
    }

    if (content_capacity - write_index < needed) {
      // Out of room. If this unit is the trail half of a pair, the lead half
      // has already been accounted for: give its bytes back and un-consume
      // it so that the pair is dropped as a whole.
      if (Utf16::IsSurrogatePair(previous_unit, unit)) {
        DCHECK_GE(write_index, Utf8::kSizeOfUnmatchedSurrogate);
        write_index -= Utf8::kSizeOfUnmatchedSurrogate;
        // A pair implies at least one unit (the lead) was read before.
        DCHECK_NE(read_index, 0);
        --read_index;
      }
      break;
    }

    if constexpr (kSourceIsOneByte) {
      write_index += Utf8::EncodeOneByte(buffer + write_index, unit);
    } else {
      const bool starts_pair =
          (read_index + 1 < string.size()) &&
          Utf16::IsSurrogatePair(unit, source[read_index + 1]);
      if (starts_pair) {
        // Lead half of a pair: only claim the space here, nothing is written
        // yet. The next iteration passes this unit as |previous| so the
        // per-character encoder can handle the pair.
        write_index += Utf8::kSizeOfUnmatchedSurrogate;
      } else {
        write_index += Utf8::Encode(buffer + write_index, unit, previous_unit,
                                    replace_invalid_utf8);
      }
    }

    previous_unit = unit;
    ++read_index;
  }
  DCHECK_LE(write_index, capacity);

  if (write_null) {
    DCHECK_LT(write_index, capacity);
    buffer[write_index++] = '\0';
  }

  return {write_index, read_index};
}


}  // namespace unibrow


#endif  // V8_STRINGS_UNICODE_INL_H_


unibrow::Mapping::CalculateValue
int CalculateValue(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:47

unibrow::Mapping::get
int get(uchar c, uchar n, uchar *result)
Definition unicode-inl.h:32

unibrow::Predicate::CacheEntry
Definition unicode.h:39

unibrow::Predicate::CacheEntry::value
bool value() const
Definition unicode.h:52

unibrow::Predicate::CacheEntry::code_point
uchar code_point() const
Definition unicode.h:51

unibrow::Predicate::get
bool get(uchar c)
Definition unicode-inl.h:18

unibrow::Predicate::CalculateValue
bool CalculateValue(uchar c)
Definition unicode-inl.h:25

unibrow::Utf16::kNoPreviousCharacter
static const int kNoPreviousCharacter
Definition unicode.h:102

unibrow::Utf16::IsSurrogatePair
static bool IsSurrogatePair(int lead, int trail)
Definition unicode.h:103

unibrow::Utf16::CombineSurrogatePair
static int CombineSurrogatePair(uchar lead, uchar trail)
Definition unicode.h:113

unibrow::Utf16::IsTrailSurrogate
static bool IsTrailSurrogate(int code)
Definition unicode.h:109

unibrow::Utf16::HasUnpairedSurrogate
static bool HasUnpairedSurrogate(const uint16_t *code_units, size_t length)
Definition unicode-inl.h:64

unibrow::Utf16::IsLeadSurrogate
static bool IsLeadSurrogate(int code)
Definition unicode.h:106

unibrow::Utf8::ValueOf
static uchar ValueOf(const uint8_t *str, size_t length, size_t *cursor)
Definition unicode-inl.h:181

unibrow::Utf8::kMaxThreeByteChar
static const unsigned kMaxThreeByteChar
Definition unicode.h:181

unibrow::Utf8::kMaxOneByteChar
static const unsigned kMaxOneByteChar
Definition unicode.h:179

unibrow::Utf8::Utf8IncrementalBuffer
uint32_t Utf8IncrementalBuffer
Definition unicode.h:197

unibrow::Utf8::kBadChar
static const uchar kBadChar
Definition unicode.h:175

unibrow::Utf8::Length
static unsigned Length(uchar chr, int previous)
Definition unicode-inl.h:200

unibrow::Utf8::CalculateValue
static uchar CalculateValue(const uint8_t *str, size_t length, size_t *cursor)
Definition unicode.cc:202

unibrow::Utf8::kIncomplete
static const uchar kIncomplete
Definition unicode.h:177

unibrow::Utf8::EncodeOneByte
static unsigned EncodeOneByte(char *out, uint8_t c)
Definition unicode-inl.h:131

unibrow::Utf8::IsValidCharacter
static bool IsValidCharacter(uchar c)
Definition unicode-inl.h:216

unibrow::Utf8::LengthOneByte
static unsigned LengthOneByte(uint8_t chr)
Definition unicode-inl.h:192

unibrow::Utf8::kMaxTwoByteChar
static const unsigned kMaxTwoByteChar
Definition unicode.h:180

unibrow::Utf8::Encode
static unsigned Encode(char *out, uchar c, int previous, bool replace_invalid=false)
Definition unicode-inl.h:147

unibrow::Utf8::kBytesSavedByCombiningSurrogates
static const unsigned kBytesSavedByCombiningSurrogates
Definition unicode.h:186

unibrow::Utf8::kSizeOfUnmatchedSurrogate
static const unsigned kSizeOfUnmatchedSurrogate
Definition unicode.h:187

unibrow::Utf8::State
Utf8DfaDecoder::State State
Definition unicode.h:163

unibrow::Utf8::ValueOfIncremental
static uchar ValueOfIncremental(const uint8_t **cursor, State *state, Utf8IncrementalBuffer *buffer)
Definition unicode-inl.h:86

v8::base::Vector
Definition zone-list.h:15

previous
LineAndColumn previous
Definition earley-parser.cc:21

result
ZoneVector< RpoNumber > & result
Definition jump-threading.cc:21

entries_
std::vector< EntryBuilder > entries_
Definition liftoff-compiler.cc:304

state
LiftoffAssembler::CacheState state
Definition liftoff-compiler.cc:453

unibrow
Definition factory.h:25

unibrow::uchar
unsigned int uchar
Definition unicode.h:21

unibrow::uint16_t
unsigned short uint16_t
Definition unicode.cc:39

v8::internal
Definition api-arguments-inl.h:20

length
int length
Definition regexp-bytecode-peephole.cc:22

size
int size
Definition setup-heap-internal.cc:131

logging.h

DCHECK_LE
#define DCHECK_LE(v1, v2)
Definition logging.h:490

CHECK_LE
#define CHECK_LE(lhs, rhs)

DCHECK_NOT_NULL
#define DCHECK_NOT_NULL(val)
Definition logging.h:492

DCHECK_NE
#define DCHECK_NE(v1, v2)
Definition logging.h:486

DCHECK_GE
#define DCHECK_GE(v1, v2)
Definition logging.h:488

DCHECK
#define DCHECK(condition)
Definition logging.h:482

DCHECK_LT
#define DCHECK_LT(v1, v2)
Definition logging.h:489

DCHECK_EQ
#define DCHECK_EQ(v1, v2)
Definition logging.h:485

unibrow::Mapping::CacheEntry
Definition unicode.h:78

unibrow::Mapping::CacheEntry::offset_
signed offset_
Definition unicode.h:83

unibrow::Mapping::CacheEntry::code_point_
uchar code_point_
Definition unicode.h:82

unibrow::Utf8::EncodingResult
Definition unicode.h:216

unicode.h

utils.h

V8_LIKELY
#define V8_LIKELY(condition)
Definition v8config.h:661