/*
 * Copyright 2020, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <stddef.h>
#include <stdint.h>
#include <type_traits>

namespace teeui {

/**
 * A non-owning view over a UTF8 encoded character sequence that can be iterated one code point
 * at a time.
 *
 * Important notice. The UTF8Range only works on verified UTF8 encoded strings.
 * E.g. if the string successfully passed through our CBOR formatting (see cbor.h) it is safe to
 * use with UTF8Range. Alternatively, you can call verify() on a new range.
 *
 * CharIterator must dereference to const char and support random access arithmetic
 * (operator+= and operator-), e.g. const char*.
 */
template <typename CharIterator> class UTF8Range {
  public:
    /** Creates a range covering [begin, end). */
    UTF8Range(CharIterator begin, CharIterator end) : begin_(begin), end_(end) {}
    /** Creates an empty range; begin and end are both value-initialized iterators. */
    UTF8Range() : begin_{}, end_{begin_} {}
    UTF8Range(const UTF8Range&) = default;
    UTF8Range(UTF8Range&&) = default;
    UTF8Range& operator=(UTF8Range&&) = default;
    UTF8Range& operator=(const UTF8Range&) = default;

    /**
     * Decodes a header byte of a UTF8 sequence. In UTF8 encoding the number of leading ones
     * indicate the length of the UTF8 sequence. Following bytes start with b10 followed by six
     * payload bits. Sequences of length one start with a 0 followed by 7 payload bits.
     *
     * @param c the byte to inspect.
     * @return 1 if the most significant bit of c is clear or if c is a continuation byte
     *         (b10xxxxxx); otherwise the number of leading one bits in c (2 through 8).
     */
    static size_t byteCount(char c) {
        const unsigned int byte = static_cast<unsigned char>(c);
        if ((byte & 0x80u) == 0) {
            return 1;
        }
        // Invert the byte to turn leading ones into leading zeroes.
        const unsigned int inverted = ~byte & 0xffu;
        if (inverted == 0) {
            // c == 0xFF: all eight bits are ones. __builtin_clz(0) is undefined, so this case
            // must not reach the builtin.
            return 8;
        }
        /*
         * CLZ - count leading zeroes.
         * __builtin_clz operates on unsigned int, so we subtract the additional leading zeroes
         * that stem from widening the byte to unsigned int.
         */
        return __builtin_clz(inverted) - (sizeof(unsigned int) * 8 - 8);
    }

    /**
     * Decodes the code point of the UTF8 sequence starting at begin.
     *
     * The sequence length is taken from the header byte (see byteCount), and that many bytes
     * are read. The caller must guarantee that they are readable, e.g. by a successful call to
     * verify(). Continuation bytes are not validated; only their low six bits are used.
     *
     * @param begin iterator pointing at the header byte of a sequence.
     * @return the decoded code point.
     */
    static unsigned long codePoint(CharIterator begin) {
        const char header = *begin;
        const unsigned long first = static_cast<unsigned char>(header);
        const size_t byte_count = byteCount(header);
        if (byte_count == 1) {
            return first;
        }
        // multi byte: strip the length marker from the header, keeping only its payload bits.
        unsigned long result = first & ~(0xffUL << (8 - byte_count));
        for (size_t i = 1; i < byte_count; ++i) {
            ++begin;
            result = (result << 6) | (static_cast<unsigned char>(*begin) & 0x3fu);
        }
        return result;
    }

    /**
     * Forward iterator that advances by one UTF8 sequence per increment. Dereferencing yields
     * the underlying CharIterator positioned at the header byte of the current sequence.
     */
    class Iter {
        CharIterator begin_;

      public:
        Iter() : begin_{} {}
        Iter(CharIterator begin) : begin_(begin) {}
        Iter(const Iter& rhs) = default;
        Iter& operator=(const Iter& rhs) = default;

        /** Returns the underlying iterator at the current sequence's header byte. */
        CharIterator operator*() const { return begin_; }
        /** Skips to the header byte of the next sequence. */
        Iter& operator++() {
            begin_ += byteCount(*begin_);
            return *this;
        }
        Iter operator++(int) {
            Iter previous = *this;
            ++(*this);
            return previous;
        }
        bool operator==(const Iter& rhs) const { return begin_ == rhs.begin_; }
        bool operator!=(const Iter& rhs) const { return !(*this == rhs); }
        /** Decodes the code point of the current sequence. */
        unsigned long codePoint() const { return UTF8Range::codePoint(begin_); }
    };

    Iter begin() const { return Iter(begin_); }
    Iter end() const { return Iter(end_); }

    /*
     * Checks if the range is safe to use. If this returns false, iteration over this range is
     * undefined. It may infinite loop and read out of bounds.
     *
     * This only checks that every sequence starts on a header byte and fits inside the range.
     * It does not check that the content is well-formed UTF8.
     */
    bool verify() const {
        for (auto pos = begin_; pos != end_;) {
            // are we out of sync?
            if ((*pos & 0xc0) == 0x80) return false;
            const size_t byte_count = byteCount(*pos);
            // did we run out of buffer? pos has not reached end_, so the distance is positive.
            if (static_cast<size_t>(end_ - pos) < byte_count) return false;
            // we could check if the non header bytes have the wrong header. While this would
            // be malformed UTF8, it does not impact control flow and is thus not security
            // critical.
            pos += byte_count;
        }
        return true;
    }

  private:
    CharIterator begin_;
    CharIterator end_;
    static_assert(std::is_same<std::remove_reference_t<decltype(*begin_)>, const char>::value,
                  "Iterator must dereference to const char");
    static_assert(
        std::is_convertible<std::remove_reference_t<decltype(end_ - begin_)>, size_t>::value,
        "Iterator arithmetic must evaluate to something that is convertible to size_t");
};

}  // namespace teeui