33 static bool Is1ByteSequence(uint8 first_byte) {
35 return (first_byte & 0x80) == 0;
38 static bool Is2ByteSequence(uint8 first_byte) {
40 return (first_byte & 0xe0) == 0xc0;
43 static bool Is3ByteSequence(uint8 first_byte) {
45 return (first_byte & 0xf0) == 0xe0;
48 static bool Is4ByteSequence(uint8 first_byte) {
50 return (first_byte & 0xf8) == 0xf0;
53 static bool IsContinuationByte(uint8 byte) {
56 return (byte & 0xc0) == 0x80;
// Assembles the code point carried by a 2-byte sequence: the low 5 bits of
// byte1 (mask 0x1f) are shifted left by 6 and OR-ed with the low 6 bits of
// byte2 (mask 0x3f). Both operands are widened to uint32 before shifting.
// NOTE(review): the remainder of this function, including its return
// statement, is not present in this listing.
59 static uint32 Compute2ByteUnicode(uint8 byte1, uint8 byte2) {
60 const uint32 codepoint = (
static_cast<uint32
>(byte1 & 0x1f) << 6) |
61 static_cast<uint32
>(byte2 & 0x3f);
// Assembles the code point carried by a 3-byte sequence: the low 4 bits of
// byte1 (mask 0x0f) are shifted left by 12, the low 6 bits of byte2 (mask
// 0x3f) by 6, and the low 6 bits of byte3 (mask 0x3f) fill the bottom; the
// three pieces are OR-ed together as uint32 values.
// NOTE(review): the remainder of this function, including its return
// statement, is not present in this listing.
65 static uint32 Compute3ByteUnicode(uint8 byte1, uint8 byte2, uint8 byte3) {
66 const uint32 codepoint = (
static_cast<uint32
>(byte1 & 0x0f) << 12) |
67 (
static_cast<uint32
>(byte2 & 0x3f) << 6) |
68 static_cast<uint32
>(byte3 & 0x3f);
// Assembles the code point carried by a 4-byte sequence: the low 3 bits of
// byte1 (mask 0x07) are shifted left by 18, and the low 6 bits (mask 0x3f) of
// byte2, byte3 and byte4 are placed at shifts 12, 6 and 0 respectively; the
// four pieces are OR-ed together as uint32 values.
// NOTE(review): the remainder of this function, including its return
// statement, is not present in this listing.
72 static uint32 Compute4ByteUnicode(uint8 byte1, uint8 byte2,
73 uint8 byte3, uint8 byte4) {
74 const uint32 codepoint = (
static_cast<uint32
>(byte1 & 0x07) << 18) |
75 (
static_cast<uint32
>(byte2 & 0x3f) << 12) |
76 (
static_cast<uint32
>(byte3 & 0x3f) << 6) |
77 static_cast<uint32
>(byte4 & 0x3f);
// Member initializer list of the Utf8Iterator constructor (the constructor's
// header line is not present in this listing):
//  - string_ is initialized from the utf8_string argument;
//  - byte_count_ is the size of string_ in bytes;
//  - state_ starts as kInString when byte_count_ is non-zero and as
//    kEndOfString otherwise (i.e. for an empty string).
// NOTE(review): listing line 96 is missing here, so one initializer is not
// visible.
94 : string_(utf8_string),
95 byte_count_(string_.size()),
97 state_(byte_count_ ? kInString : kEndOfString) {}
// Body fragment of the function that decodes one character (the function
// header, the declaration/initial value of unicode_index, and listing lines
// 124 and 127-133 are not present in this listing).
//
// The first byte is fetched with GetNextByte() and classified by sequence
// length. For multi-byte sequences the following bytes are fetched, and
// unicode_index is assigned only when every one of them passes
// IsContinuationByte(); otherwise unicode_index keeps whatever value it had
// before this fragment.
//
// NOTE(review): in the 3- and 4-byte branches all following bytes are fetched
// before any of them is validated, so a sequence truncated by the end of the
// string still triggers every GetNextByte() call. Confirm GetNextByte() is
// safe to call after the last byte has been consumed.
105 const uint8 byte1 = GetNextByte();
106 if (Is1ByteSequence(byte1)) {
// Single-byte character: the byte value is used directly.
107 unicode_index = byte1;
108 }
else if (Is2ByteSequence(byte1)) {
109 const uint8 byte2 = GetNextByte();
110 if (IsContinuationByte(byte2))
111 unicode_index = Compute2ByteUnicode(byte1, byte2);
112 }
else if (Is3ByteSequence(byte1)) {
113 const uint8 byte2 = GetNextByte();
114 const uint8 byte3 = GetNextByte();
115 if (IsContinuationByte(byte2) && IsContinuationByte(byte3))
116 unicode_index = Compute3ByteUnicode(byte1, byte2, byte3);
117 }
else if (Is4ByteSequence(byte1)) {
118 const uint8 byte2 = GetNextByte();
119 const uint8 byte3 = GetNextByte();
120 const uint8 byte4 = GetNextByte();
121 if (IsContinuationByte(byte2) && IsContinuationByte(byte3) &&
122 IsContinuationByte(byte4))
123 unicode_index = Compute4ByteUnicode(byte1, byte2, byte3, byte4);
// Upper bound applied to the decoded value. The statement guarded by the
// comparison below is not visible in this listing.
125 static const uint32 kMaxValidIndex = 0x10ffff;
126 if (unicode_index > kMaxValidIndex)
134 return unicode_index;
// Reads the byte of string_ at cur_index_, then pre-increments cur_index_ and
// compares the new index against byte_count_ to detect that the last byte has
// just been consumed.
// NOTE(review): listing lines 147, 150 and 151 are missing, so the statement
// guarded by the comparison and the return statement are not visible. No
// bounds check on cur_index_ is visible before string_ is indexed; confirm
// that callers (or the missing line 147) guarantee cur_index_ < byte_count_.
146 uint8 Utf8Iterator::GetNextByte() {
148 const uint8 next_byte = string_[cur_index_];
149 if (++cur_index_ == byte_count_)
static const uint32 kInvalidCharIndex
An invalid Unicode character index.
size_t ComputeCharCount() const
Convenience function that computes and returns the number of Unicode characters in the string by iter...
The Utf8Iterator class iterates over characters in strings encoded with UTF-8, extracting the Unicode...
uint32 Next()
Returns the Unicode index (up to 21 bits) for the next character in the string, or kInvalidCharIndex ...
State GetState() const
Returns the state of the iterator.
Utf8Iterator(const std::string &utf8_string)
The constructor is passed an std::string in UTF-8 format.