Android-cuttlefish cvd tool
utf8range.h
Go to the documentation of this file.
1/*
2 * Copyright 2020, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <stddef.h>
20#include <stdint.h>
21#include <type_traits>
22
23namespace teeui {
24
/**
 * Presents the character range [begin, end) as a sequence of UTF-8 encoded code points.
 *
 * Iteration trusts the header byte of every sequence. Call verify() before iterating over
 * data that is not known to be well formed enough to be traversed safely.
 *
 * @tparam CharIterator iterator that dereferences to `const char`, can be advanced with `+=`
 *         and whose difference is convertible to size_t (checked by the static_asserts at
 *         the end of the class).
 */
template <typename CharIterator> class UTF8Range {
  public:
    /** Creates a range covering [begin, end). */
    UTF8Range(CharIterator begin, CharIterator end) : begin_(begin), end_(end) {}
    /** Creates an empty range: end_ is a copy of the value-initialized begin_. */
    UTF8Range() : begin_{}, end_{begin_} {}
    UTF8Range(const UTF8Range&) = default;
    UTF8Range(UTF8Range&&) = default;
    UTF8Range& operator=(UTF8Range&&) = default;
    UTF8Range& operator=(const UTF8Range&) = default;

    /**
     * Returns the length in bytes of the sequence introduced by the header byte c.
     *
     * Bytes with a clear top bit yield 1. Any other byte yields its number of leading one
     * bits, so a stray continuation byte (10xxxxxx) also yields 1 and 0xff yields 8.
     *
     * @param c first byte of a sequence.
     * @return number of bytes that make up the sequence, in the range [1, 8].
     */
    static size_t byteCount(char c) {
        const auto byte = static_cast<unsigned char>(c);
        if ((byte & 0x80u) == 0) {
            return 1;
        }
        if (byte == 0xffu) {
            // Inverting 0xff leaves no set bit and __builtin_clz(0) is undefined, so the
            // eight leading ones are reported explicitly.
            return 8;
        }
        /*
         * CLZ - count leading zeroes.
         * __builtin_clz promotes the argument to unsigned int.
         * We invert the byte to turn leading ones into leading zeroes.
         * We subtract additional leading zeroes due to the type promotion from the result.
         */
        return __builtin_clz(static_cast<unsigned char>(~byte)) - (sizeof(unsigned int) * 8 - 8);
    }

    /**
     * Decodes the code point whose first byte is at begin.
     *
     * Reads byteCount(*begin) bytes without any bounds check; the caller must make sure
     * that many bytes are readable, e.g., by calling verify() on the enclosing range.
     *
     * @param begin position of the header byte of the sequence.
     * @return the payload bits of the header byte followed by the low six bits of every
     *         subsequent byte of the sequence.
     */
    static unsigned long codePoint(CharIterator begin) {
        const char lead = *begin;
        const size_t byte_count = byteCount(lead);
        unsigned long result = static_cast<unsigned char>(lead);
        if (byte_count == 1) {
            return result;
        }
        // Multi byte: strip the length marker from the header byte.
        result &= ~(0xffu << (8 - byte_count));
        for (size_t i = 1; i < byte_count; ++i) {
            ++begin;
            // Every following byte contributes its low six bits.
            result = (result << 6) | static_cast<unsigned long>(*begin & 0x3f);
        }
        return result;
    }

    /**
     * Forward iterator over the sequences of a UTF8Range. Dereferencing yields the
     * position of the current sequence's first byte rather than a decoded value; use
     * codePoint() to decode.
     */
    class Iter {
        CharIterator begin_;

      public:
        Iter() : begin_{} {}
        Iter(CharIterator begin) : begin_(begin) {}
        Iter(const Iter& rhs) = default;
        Iter& operator=(const Iter& rhs) = default;
        /** Returns the position of the first byte of the current sequence. */
        CharIterator operator*() const { return begin_; }
        /** Advances by the length announced by the current header byte. */
        Iter& operator++() {
            begin_ += byteCount(*begin_);
            return *this;
        }
        /** Advances like the prefix form and returns the position held before advancing. */
        Iter operator++(int) {
            Iter previous = *this;
            ++(*this);
            return previous;
        }
        bool operator==(const Iter& rhs) const { return begin_ == rhs.begin_; }
        bool operator!=(const Iter& rhs) const { return !(*this == rhs); }
        /** Decodes the code point at the current position. */
        unsigned long codePoint() const { return UTF8Range::codePoint(begin_); }
    };

    Iter begin() const { return Iter(begin_); }
    Iter end() const { return Iter(end_); }

    /*
     * Checks if the range is safe to use. If this returns false, iteration over this range is
     * undefined. It may infinite loop and read out of bounds.
     */
    bool verify() const {
        for (auto pos = begin_; pos != end_;) {
            // Are we out of sync? A sequence must not start with a continuation byte.
            if ((*pos & 0xc0) == 0x80) return false;
            const size_t byte_count = byteCount(*pos);
            // Did we run out of buffer? pos never passes end_, so the difference is not
            // negative and the conversion to size_t preserves its value.
            const size_t remaining = static_cast<size_t>(end_ - pos);
            if (remaining < byte_count) return false;
            // We could check if the non header bytes have the wrong header. While this would
            // be malformed UTF8, it does not impact control flow and is thus not security
            // critical.
            pos += byte_count;
        }
        return true;
    }

  private:
    CharIterator begin_;
    CharIterator end_;
    static_assert(std::is_same<std::remove_reference_t<decltype(*begin_)>, const char>::value,
                  "Iterator must dereference to const char");
    static_assert(
        std::is_convertible<std::remove_reference_t<decltype(end_ - begin_)>, size_t>::value,
        "Iterator arithmetic must evaluate to something that is convertible to size_t");
};
130
131} // namespace teeui
Definition: utf8range.h:75
CharIterator begin_
Definition: utf8range.h:76
bool operator!=(const Iter &rhs) const
Definition: utf8range.h:97
Iter(CharIterator begin)
Definition: utf8range.h:80
CharIterator operator*() const
Definition: utf8range.h:86
Iter operator++(int)
Definition: utf8range.h:91
Iter & operator=(const Iter &rhs)
Definition: utf8range.h:82
Iter & operator++()
Definition: utf8range.h:87
Iter(const Iter &rhs)
Definition: utf8range.h:81
bool operator==(const Iter &rhs) const
Definition: utf8range.h:96
unsigned long codePoint() const
Definition: utf8range.h:98
Iter()
Definition: utf8range.h:79
Definition: utf8range.h:30
UTF8Range(const UTF8Range &)=default
UTF8Range & operator=(UTF8Range &&)=default
bool verify()
Definition: utf8range.h:106
static unsigned long codePoint(CharIterator begin)
Definition: utf8range.h:57
UTF8Range & operator=(const UTF8Range &)=default
Iter end() const
Definition: utf8range.h:101
UTF8Range(UTF8Range &&)=default
UTF8Range()
Definition: utf8range.h:33
CharIterator end_
Definition: utf8range.h:123
static size_t byteCount(char c)
Definition: utf8range.h:44
CharIterator begin_
Definition: utf8range.h:122
Iter begin() const
Definition: utf8range.h:100
UTF8Range(CharIterator begin, CharIterator end)
Definition: utf8range.h:32
Definition: layout.h:28