1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8_utils.h"
18
19 #include "utils/char_utils.h"
20
21 namespace latinime {
22 namespace dicttoolkit {
23
24 const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;
25 const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
26 const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
27 const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03};
28 const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
29
30 const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
31 const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
32 const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
33
getCodePoints(const std::string & utf8Str)34 /* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
35 std::vector<int> codePoints;
36 int remainingByteCountForCurrentCodePoint = 0;
37 int currentCodePointSequenceSize = 0;
38 int codePoint = 0;
39 for (const char c : utf8Str) {
40 if (remainingByteCountForCurrentCodePoint == 0) {
41 currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
42 if (currentCodePointSequenceSize <= 0) {
43 AKLOGE("%x is an invalid utf8 first byte value.", c);
44 return std::vector<int>();
45 }
46 remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
47 codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
48 } else {
49 codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
50 codePoint += maskTrailingByte(c);
51 }
52 remainingByteCountForCurrentCodePoint--;
53 if (remainingByteCountForCurrentCodePoint == 0) {
54 if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
55 AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
56 currentCodePointSequenceSize, codePoint);
57 return std::vector<int>();
58 }
59 codePoints.push_back(codePoint);
60 }
61 }
62 return codePoints;
63 }
64
getSequenceSizeByCheckingFirstByte(const uint8_t firstByte)65 /* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
66 for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
67 if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) {
68 return i;
69 }
70 }
71 // Not a valid utf8 char first byte.
72 return -1;
73 }
74
maskFirstByte(const uint8_t firstByte,const int sequenceSize)75 /* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
76 const int sequenceSize) {
77 return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
78 }
79
maskTrailingByte(const uint8_t secondOrLaterByte)80 /* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
81 return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
82 }
83
getUtf8String(const CodePointArrayView codePoints)84 /* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
85 std::string utf8String;
86 for (const int codePoint : codePoints) {
87 const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
88 if (sequenceSize <= 0) {
89 AKLOGE("Cannot encode code point (%d).", codePoint);
90 return std::string();
91 }
92 const int trailingByteCount = sequenceSize - 1;
93 // Output first byte.
94 const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
95 utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize]));
96 // Output second and later bytes.
97 for (int i = 1; i < sequenceSize; ++i) {
98 const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
99 const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
100 utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER));
101 }
102 }
103 return utf8String;
104 }
105
getSequenceSizeToEncodeCodePoint(const int codePoint)106 /* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
107 if (codePoint < 0) {
108 return -1;
109 }
110 for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
111 if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) {
112 return i;
113 }
114 }
115 return -1;
116 }
117
118 } // namespace dicttoolkit
119 } // namespace latinime
120