1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
18
19 #include "defines.h"
20 #include "dictionary/interface/dictionary_bigrams_structure_policy.h"
21 #include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
22 #include "dictionary/utils/byte_array_utils.h"
23
24 namespace latinime {
25
26 typedef PatriciaTrieReadingUtils PtReadingUtils;
27
28 const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
29 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
30 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
31 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
32 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
33
34 // Flag for single/multiple char group
35 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
36 // Flag for terminal PtNodes
37 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
38 // Flag for shortcut targets presence
39 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
40 // Flag for bigram presence
41 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
42 // Flag for non-words (typically, shortcut only entries)
43 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
44 // Flag for possibly offensive words
45 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
46
getPtNodeArraySizeAndAdvancePosition(const uint8_t * const buffer,int * const pos)47 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
48 const uint8_t *const buffer, int *const pos) {
49 const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
50 if (firstByte < 0x80) {
51 return firstByte;
52 } else {
53 return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
54 buffer, pos);
55 }
56 }
57
getFlagsAndAdvancePosition(const uint8_t * const buffer,int * const pos)58 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
59 const uint8_t *const buffer, int *const pos) {
60 return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
61 }
62
getCodePointAndAdvancePosition(const uint8_t * const buffer,const int * const codePointTable,int * const pos)63 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
64 const int *const codePointTable, int *const pos) {
65 return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
66 }
67
68 // Returns the number of read characters.
getCharsAndAdvancePosition(const uint8_t * const buffer,const NodeFlags flags,const int maxLength,const int * const codePointTable,int * const outBuffer,int * const pos)69 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
70 const NodeFlags flags, const int maxLength, const int *const codePointTable,
71 int *const outBuffer, int *const pos) {
72 int length = 0;
73 if (hasMultipleChars(flags)) {
74 length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
75 outBuffer, pos);
76 } else {
77 const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
78 if (codePoint == NOT_A_CODE_POINT) {
79 // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
80 // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
81 // when the PtNode has a single code point.
82 length = 0;
83 AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
84 *pos - 1, codePoint, buffer[*pos - 1]);
85 ASSERT(false);
86 } else if (maxLength > 0) {
87 outBuffer[0] = codePoint;
88 length = 1;
89 }
90 }
91 return length;
92 }
93
94 // Returns the number of skipped characters.
skipCharacters(const uint8_t * const buffer,const NodeFlags flags,const int maxLength,const int * const codePointTable,int * const pos)95 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
96 const int maxLength, const int *const codePointTable, int *const pos) {
97 if (hasMultipleChars(flags)) {
98 return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
99 } else {
100 if (maxLength > 0) {
101 getCodePointAndAdvancePosition(buffer, codePointTable, pos);
102 return 1;
103 } else {
104 return 0;
105 }
106 }
107 }
108
readProbabilityAndAdvancePosition(const uint8_t * const buffer,int * const pos)109 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
110 int *const pos) {
111 return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
112 }
113
readChildrenPositionAndAdvancePosition(const uint8_t * const buffer,const NodeFlags flags,int * const pos)114 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
115 const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
116 const int base = *pos;
117 int offset = 0;
118 switch (MASK_CHILDREN_POSITION_TYPE & flags) {
119 case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
120 offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
121 break;
122 case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
123 offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
124 break;
125 case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
126 offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
127 break;
128 default:
129 // If we come here, it means we asked for the children of a word with
130 // no children.
131 return NOT_A_DICT_POS;
132 }
133 return base + offset;
134 }
135
readPtNodeInfo(const uint8_t * const dictBuf,const int ptNodePos,const DictionaryShortcutsStructurePolicy * const shortcutPolicy,const DictionaryBigramsStructurePolicy * const bigramPolicy,const int * const codePointTable,NodeFlags * const outFlags,int * const outCodePointCount,int * const outCodePoint,int * const outProbability,int * const outChildrenPos,int * const outShortcutPos,int * const outBigramPos,int * const outSiblingPos)136 /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
137 const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
138 const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
139 NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
140 int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
141 int *const outBigramPos, int *const outSiblingPos) {
142 int readingPos = ptNodePos;
143 const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
144 *outFlags = flags;
145 *outCodePointCount = getCharsAndAdvancePosition(
146 dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
147 *outProbability = isTerminal(flags) ?
148 readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
149 *outChildrenPos = hasChildrenInFlags(flags) ?
150 readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
151 *outShortcutPos = NOT_A_DICT_POS;
152 if (hasShortcutTargets(flags)) {
153 *outShortcutPos = readingPos;
154 shortcutPolicy->skipAllShortcuts(&readingPos);
155 }
156 *outBigramPos = NOT_A_DICT_POS;
157 if (hasBigrams(flags)) {
158 *outBigramPos = readingPos;
159 bigramPolicy->skipAllBigrams(&readingPos);
160 }
161 *outSiblingPos = readingPos;
162 }
163
164 } // namespace latinime
165