1 /* 2 * Copyright (C) 2010 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_CHAR_UTILS_H 18 #define LATINIME_CHAR_UTILS_H 19 20 #include <cctype> 21 #include <cstring> 22 #include <vector> 23 24 #include "defines.h" 25 26 namespace latinime { 27 28 class CharUtils { 29 public: 30 static const std::vector<int> EMPTY_STRING; 31 isAsciiUpper(int c)32 static AK_FORCE_INLINE bool isAsciiUpper(int c) { 33 // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to 34 // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). 35 return (c >= 'A' && c <= 'Z'); 36 } 37 toLowerCase(const int c)38 static AK_FORCE_INLINE int toLowerCase(const int c) { 39 if (isAsciiUpper(c)) { 40 return toAsciiLower(c); 41 } 42 if (isAscii(c)) { 43 return c; 44 } 45 return latin_tolower(c); 46 } 47 toBaseLowerCase(const int c)48 static AK_FORCE_INLINE int toBaseLowerCase(const int c) { 49 return toLowerCase(toBaseCodePoint(c)); 50 } 51 isIntentionalOmissionCodePoint(const int codePoint)52 static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) { 53 // TODO: Do not hardcode here 54 return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; 55 } getCodePointCount(const int arraySize,const int * const codePoints)56 static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { 57 int size = 0; 58 for (; size < arraySize; ++size) { 59 if (codePoints[size] == '\0') { 60 break; 61 } 62 } 63 return size; 64 } 65 toBaseCodePoint(int c)66 static AK_FORCE_INLINE int toBaseCodePoint(int c) { 67 if (c < BASE_CHARS_SIZE) { 68 return static_cast<int>(BASE_CHARS[c]); 69 } 70 return c; 71 } 72 getSpaceCount(const int * const codePointBuffer,const int length)73 static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) { 74 int spaceCount = 0; 75 for (int i = 0; i < length; ++i) { 76 if (codePointBuffer[i] == KEYCODE_SPACE) { 77 ++spaceCount; 78 } 79 } 80 return spaceCount; 81 } 82 isInUnicodeSpace(const int codePoint)83 static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) { 84 return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; 85 } 86 87 // Returns updated code point count. Returns 0 when the code points cannot be marked as a 88 // Beginning-of-Sentence. attachBeginningOfSentenceMarker(int * const codePoints,const int codePointCount,const int maxCodePoint)89 static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, 90 const int codePointCount, const int maxCodePoint) { 91 if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) { 92 // Marker has already been attached. 93 return codePointCount; 94 } 95 if (codePointCount >= maxCodePoint) { 96 // the code points cannot be marked as a Beginning-of-Sentence. 97 return 0; 98 } 99 memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount); 100 codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE; 101 return codePointCount + 1; 102 } 103 104 // Returns updated code point count. removeBeginningOfSentenceMarker(int * const codePoints,const int codePointCount)105 static AK_FORCE_INLINE int removeBeginningOfSentenceMarker(int *const codePoints, 106 const int codePointCount) { 107 if (codePointCount <= 0 || codePoints[0] != CODE_POINT_BEGINNING_OF_SENTENCE) { 108 return codePointCount; 109 } 110 const int newCodePointCount = codePointCount - 1; 111 memmove(codePoints, codePoints + 1, sizeof(int) * newCodePointCount); 112 return newCodePointCount; 113 } 114 115 private: 116 DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); 117 118 static const int MIN_UNICODE_CODE_POINT; 119 static const int MAX_UNICODE_CODE_POINT; 120 121 /** 122 * Table mapping most combined Latin, Greek, and Cyrillic characters 123 * to their base characters. If c is in range, BASE_CHARS[c] == c 124 * if c is not a combined character, or the base character if it 125 * is combined. 126 */ 127 static const int BASE_CHARS_SIZE = 0x0500; 128 static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; 129 isAscii(int c)130 static AK_FORCE_INLINE bool isAscii(int c) { 131 return isascii(c) != 0; 132 } 133 toAsciiLower(int c)134 static AK_FORCE_INLINE int toAsciiLower(int c) { 135 return c - 'A' + 'a'; 136 } 137 138 static int latin_tolower(const int c); 139 }; 140 } // namespace latinime 141 #endif // LATINIME_CHAR_UTILS_H 142