1 /*
2  * Copyright (C) 2010 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_CHAR_UTILS_H
18 #define LATINIME_CHAR_UTILS_H
19 
20 #include <cctype>
21 #include <cstring>
22 #include <vector>
23 
24 #include "defines.h"
25 
26 namespace latinime {
27 
28 class CharUtils {
29  public:
30     static const std::vector<int> EMPTY_STRING;
31 
isAsciiUpper(int c)32     static AK_FORCE_INLINE bool isAsciiUpper(int c) {
33         // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
34         // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
35         return (c >= 'A' && c <= 'Z');
36     }
37 
toLowerCase(const int c)38     static AK_FORCE_INLINE int toLowerCase(const int c) {
39         if (isAsciiUpper(c)) {
40             return toAsciiLower(c);
41         }
42         if (isAscii(c)) {
43             return c;
44         }
45         return latin_tolower(c);
46     }
47 
toBaseLowerCase(const int c)48     static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
49         return toLowerCase(toBaseCodePoint(c));
50     }
51 
isIntentionalOmissionCodePoint(const int codePoint)52     static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) {
53         // TODO: Do not hardcode here
54         return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
55     }
getCodePointCount(const int arraySize,const int * const codePoints)56     static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
57         int size = 0;
58         for (; size < arraySize; ++size) {
59             if (codePoints[size] == '\0') {
60                 break;
61             }
62         }
63         return size;
64     }
65 
toBaseCodePoint(int c)66     static AK_FORCE_INLINE int toBaseCodePoint(int c) {
67         if (c < BASE_CHARS_SIZE) {
68             return static_cast<int>(BASE_CHARS[c]);
69         }
70         return c;
71     }
72 
getSpaceCount(const int * const codePointBuffer,const int length)73     static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) {
74         int spaceCount = 0;
75         for (int i = 0; i < length; ++i) {
76             if (codePointBuffer[i] == KEYCODE_SPACE) {
77                 ++spaceCount;
78             }
79         }
80         return spaceCount;
81     }
82 
isInUnicodeSpace(const int codePoint)83     static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) {
84         return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
85     }
86 
87     // Returns updated code point count. Returns 0 when the code points cannot be marked as a
88     // Beginning-of-Sentence.
attachBeginningOfSentenceMarker(int * const codePoints,const int codePointCount,const int maxCodePoint)89     static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
90             const int codePointCount, const int maxCodePoint) {
91         if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
92             // Marker has already been attached.
93             return codePointCount;
94         }
95         if (codePointCount >= maxCodePoint) {
96             // the code points cannot be marked as a Beginning-of-Sentence.
97             return 0;
98         }
99         memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
100         codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
101         return codePointCount + 1;
102     }
103 
104     // Returns updated code point count.
removeBeginningOfSentenceMarker(int * const codePoints,const int codePointCount)105     static AK_FORCE_INLINE int removeBeginningOfSentenceMarker(int *const codePoints,
106             const int codePointCount) {
107         if (codePointCount <= 0 || codePoints[0] != CODE_POINT_BEGINNING_OF_SENTENCE) {
108             return codePointCount;
109         }
110         const int newCodePointCount = codePointCount - 1;
111         memmove(codePoints, codePoints + 1, sizeof(int) * newCodePointCount);
112         return newCodePointCount;
113     }
114 
115  private:
116     DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
117 
118     static const int MIN_UNICODE_CODE_POINT;
119     static const int MAX_UNICODE_CODE_POINT;
120 
121     /**
122      * Table mapping most combined Latin, Greek, and Cyrillic characters
123      * to their base characters.  If c is in range, BASE_CHARS[c] == c
124      * if c is not a combined character, or the base character if it
125      * is combined.
126      */
127     static const int BASE_CHARS_SIZE = 0x0500;
128     static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
129 
isAscii(int c)130     static AK_FORCE_INLINE bool isAscii(int c) {
131         return isascii(c) != 0;
132     }
133 
toAsciiLower(int c)134     static AK_FORCE_INLINE int toAsciiLower(int c) {
135         return c - 'A' + 'a';
136     }
137 
138     static int latin_tolower(const int c);
139 };
140 } // namespace latinime
141 #endif // LATINIME_CHAR_UTILS_H
142