1 /*
2  * Copyright 2020, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <stddef.h>
20 #include <stdint.h>
21 #include <type_traits>
22 
23 namespace teeui {
24 
/**
 * A non-owning view over a UTF-8 encoded character sequence, delimited by a pair of character
 * iterators, that can be traversed one code point at a time.
 *
 * Important notice: UTF8Range only works on verified UTF-8 encoded strings.
 * E.g. if the string successfully passed through our CBOR formatting (see cbor.h) it is safe to
 * use with UTF8Range. Alternatively, you can call verify() on a new range before iterating it.
 *
 * @tparam CharIterator iterator type that dereferences to const char and supports `+=` and
 *                      subtraction (see the static_asserts at the end of the class).
 */
template <typename CharIterator> class UTF8Range {
  public:
    /** Creates a range covering the bytes in [begin, end). */
    UTF8Range(CharIterator begin, CharIterator end) : begin_(begin), end_(end) {}
    /** Creates an empty range; both ends are the value-initialized iterator. */
    UTF8Range() : begin_{}, end_{begin_} {}
    UTF8Range(const UTF8Range&) = default;
    UTF8Range(UTF8Range&&) = default;
    UTF8Range& operator=(UTF8Range&&) = default;
    UTF8Range& operator=(const UTF8Range&) = default;

    /**
     * Decodes a header byte of a UTF8 sequence. In UTF8 encoding the number of leading ones
     * indicate the length of the UTF8 sequence. Following bytes start with b10 followed by six
     * payload bits. Sequences of length one start with a 0 followed by 7 payload bits.
     *
     * @param c the first byte of a sequence.
     * @return 1 if the most significant bit of c is clear, otherwise the number of leading one
     *         bits of c (which is 1 for a continuation byte and 8 for 0xFF).
     */
    static size_t byteCount(char c) {
        if ((c & 0x80) == 0) {
            return 1;
        }
        // Invert c to turn its leading ones into leading zeroes, keeping only the low 8 bits.
        const unsigned int inverted = static_cast<unsigned char>(~c);
        if (inverted == 0) {
            // c == 0xFF: all eight bits are leading ones. __builtin_clz(0) is undefined, so this
            // case must not reach the builtin.
            return 8;
        }
        /*
         * CLZ - count leading zeroes.
         * __builtin_clz operates on unsigned int, so the zeroes that sit above the low byte
         * are subtracted from the result.
         */
        return static_cast<size_t>(__builtin_clz(inverted)) - (sizeof(unsigned int) * 8 - 8);
    }

    /**
     * Decodes the code point whose sequence starts at begin.
     *
     * The sequence length is taken from the header byte via byteCount(); that many bytes are
     * read without any bounds check, so begin must point into a verified range.
     *
     * @param begin iterator to the header byte of the sequence.
     * @return the decoded code point value.
     */
    static unsigned long codePoint(CharIterator begin) {
        const unsigned long header = static_cast<uint8_t>(*begin);
        const size_t byte_count = byteCount(*begin);
        if (byte_count == 1) {
            return header;
        }
        // Multi byte: strip the length marker (the leading ones) from the header byte. The bit
        // following the leading ones is zero, so masking it in or out makes no difference.
        unsigned long result = header & (0xffUL >> byte_count);
        ++begin;
        for (size_t i = 1; i < byte_count; ++i) {
            // Every following byte contributes its six low bits.
            result <<= 6;
            result |= *begin & 0x3f;
            ++begin;
        }
        return result;
    }

    /**
     * Forward iterator over the sequences of a range. Dereferencing yields the character
     * iterator positioned at the header byte of the current sequence; codePoint() decodes it.
     */
    class Iter {
        CharIterator begin_;

      public:
        Iter() : begin_{} {}
        Iter(CharIterator begin) : begin_(begin) {}
        // Copy construction and assignment are implicitly generated (member-wise copy).

        /** Returns the underlying character iterator at the start of the current sequence. */
        CharIterator operator*() const { return begin_; }
        /** Advances by the byte length announced in the current header byte. */
        Iter& operator++() {
            begin_ += byteCount(*begin_);
            return *this;
        }
        Iter operator++(int) {
            Iter previous = *this;
            ++(*this);
            return previous;
        }
        bool operator==(const Iter& rhs) const { return begin_ == rhs.begin_; }
        bool operator!=(const Iter& rhs) const { return !(*this == rhs); }
        /** Decodes the code point of the current sequence. */
        unsigned long codePoint() const { return UTF8Range::codePoint(begin_); }
    };

    Iter begin() const { return Iter(begin_); }
    Iter end() const { return Iter(end_); }

    /*
     * Checks if the range is safe to use. If this returns false, iteration over this range is
     * undefined. It may infinite loop and read out of bounds.
     *
     * Only the properties that affect control flow are checked: every sequence must start on a
     * header byte and must fit into the remaining buffer. This is not a full UTF8 validation.
     */
    bool verify() const {
        for (auto pos = begin_; pos != end_;) {
            // are we out of sync? (a continuation byte where a header byte is expected)
            if ((*pos & 0xc0) == 0x80) return false;
            const size_t byte_count = byteCount(*pos);
            // did we run out of buffer?
            if (static_cast<size_t>(end_ - pos) < byte_count) return false;
            // we could check if the non header bytes have the wrong header. While this would
            // be malformed UTF8, it does not impact control flow and is thus not security
            // critical.
            pos += byte_count;
        }
        return true;
    }

  private:
    CharIterator begin_;
    CharIterator end_;
    static_assert(std::is_same<std::remove_reference_t<decltype(*begin_)>, const char>::value,
                  "Iterator must dereference to const char");
    static_assert(
        std::is_convertible<std::remove_reference_t<decltype(end_ - begin_)>, size_t>::value,
        "Iterator arithmetic must evaluate to something that is convertible to size_t");
};
130 
131 }  // namespace teeui
132