Ion
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utf8iterator.cc
Go to the documentation of this file.
1 
18 #include "ion/base/utf8iterator.h"
19 
20 namespace ion {
21 namespace base {
22 
23 namespace {
24 
26 
31 
32 
33 static bool Is1ByteSequence(uint8 first_byte) {
35  return (first_byte & 0x80) == 0;
36 }
37 
38 static bool Is2ByteSequence(uint8 first_byte) {
40  return (first_byte & 0xe0) == 0xc0;
41 }
42 
43 static bool Is3ByteSequence(uint8 first_byte) {
45  return (first_byte & 0xf0) == 0xe0;
46 }
47 
48 static bool Is4ByteSequence(uint8 first_byte) {
50  return (first_byte & 0xf8) == 0xf0;
51 }
52 
53 static bool IsContinuationByte(uint8 byte) {
56  return (byte & 0xc0) == 0x80;
57 }
58 
59 static uint32 Compute2ByteUnicode(uint8 byte1, uint8 byte2) {
60  const uint32 codepoint = (static_cast<uint32>(byte1 & 0x1f) << 6) |
61  static_cast<uint32>(byte2 & 0x3f);
62  return codepoint <= 0x7f ? Utf8Iterator::kInvalidCharIndex : codepoint;
63 }
64 
65 static uint32 Compute3ByteUnicode(uint8 byte1, uint8 byte2, uint8 byte3) {
66  const uint32 codepoint = (static_cast<uint32>(byte1 & 0x0f) << 12) |
67  (static_cast<uint32>(byte2 & 0x3f) << 6) |
68  static_cast<uint32>(byte3 & 0x3f);
69  return codepoint <= 0x7ff ? Utf8Iterator::kInvalidCharIndex : codepoint;
70 }
71 
72 static uint32 Compute4ByteUnicode(uint8 byte1, uint8 byte2,
73  uint8 byte3, uint8 byte4) {
74  const uint32 codepoint = (static_cast<uint32>(byte1 & 0x07) << 18) |
75  (static_cast<uint32>(byte2 & 0x3f) << 12) |
76  (static_cast<uint32>(byte3 & 0x3f) << 6) |
77  static_cast<uint32>(byte4 & 0x3f);
78  return codepoint <= 0xffff ? Utf8Iterator::kInvalidCharIndex : codepoint;
79 }
80 
81 } // anonymous namespace
82 
83 const uint32 Utf8Iterator::kInvalidCharIndex = 0x110000;
84 
86 
91 
92 
93 Utf8Iterator::Utf8Iterator(const std::string& utf8_string)
94  : string_(utf8_string),
95  byte_count_(string_.size()),
96  cur_index_(0),
97  state_(byte_count_ ? kInString : kEndOfString) {}
98 
100  uint32 unicode_index = kInvalidCharIndex;
101  if (state_ == kInString) {
105  const uint8 byte1 = GetNextByte();
106  if (Is1ByteSequence(byte1)) {
107  unicode_index = byte1;
108  } else if (Is2ByteSequence(byte1)) {
109  const uint8 byte2 = GetNextByte();
110  if (IsContinuationByte(byte2))
111  unicode_index = Compute2ByteUnicode(byte1, byte2);
112  } else if (Is3ByteSequence(byte1)) {
113  const uint8 byte2 = GetNextByte();
114  const uint8 byte3 = GetNextByte();
115  if (IsContinuationByte(byte2) && IsContinuationByte(byte3))
116  unicode_index = Compute3ByteUnicode(byte1, byte2, byte3);
117  } else if (Is4ByteSequence(byte1)) {
118  const uint8 byte2 = GetNextByte();
119  const uint8 byte3 = GetNextByte();
120  const uint8 byte4 = GetNextByte();
121  if (IsContinuationByte(byte2) && IsContinuationByte(byte3) &&
122  IsContinuationByte(byte4))
123  unicode_index = Compute4ByteUnicode(byte1, byte2, byte3, byte4);
125  static const uint32 kMaxValidIndex = 0x10ffff;
126  if (unicode_index > kMaxValidIndex)
127  unicode_index = kInvalidCharIndex;
128  }
131  if (unicode_index == kInvalidCharIndex && state_ == kInString)
132  state_ = kInvalid;
133  }
134  return unicode_index;
135 }
136 
138  size_t count = 0;
139  Utf8Iterator it(string_);
140  while (it.Next() != kInvalidCharIndex)
141  ++count;
143  return it.GetState() == kEndOfString ? count : 0;
144 }
145 
146 uint8 Utf8Iterator::GetNextByte() {
147  if (state_ == kInString) {
148  const uint8 next_byte = string_[cur_index_];
149  if (++cur_index_ == byte_count_)
150  state_ = kEndOfString;
151  return next_byte;
152  }
153  state_ = kInvalid;
154  return 0;
155 }
156 
157 } // namespace base
158 } // namespace ion
static const uint32 kInvalidCharIndex
An invalid Unicode character index.
Definition: utf8iterator.h:59
size_t ComputeCharCount() const
Convenience function that computes and returns the number of Unicode characters in the string by iterating over it; returns 0 if the iteration does not end at the end of the string.
The Utf8Iterator class iterates over characters in strings encoded with UTF-8, extracting the Unicode index for each character.
Definition: utf8iterator.h:49
uint32 Next()
Returns the Unicode index (up to 21 bits) for the next character in the string, or kInvalidCharIndex if there are no characters remaining or the next bytes are not a valid UTF-8 sequence.
Definition: utf8iterator.cc:99
State GetState() const
Returns the state of the iterator.
Definition: utf8iterator.h:72
Utf8Iterator(const std::string &utf8_string)
The constructor is passed an std::string in UTF-8 format.
Definition: utf8iterator.cc:93