1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
18 
19 #include "defines.h"
20 #include "dictionary/interface/dictionary_bigrams_structure_policy.h"
21 #include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
22 #include "dictionary/utils/byte_array_utils.h"
23 
24 namespace latinime {
25 
26 typedef PatriciaTrieReadingUtils PtReadingUtils;
27 
28 const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0;
29 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00;
30 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40;
31 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80;
32 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0;
33 
34 // Flag for single/multiple char group
35 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20;
36 // Flag for terminal PtNodes
37 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10;
38 // Flag for shortcut targets presence
39 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08;
40 // Flag for bigram presence
41 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
42 // Flag for non-words (typically, shortcut only entries)
43 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
44 // Flag for possibly offensive words
45 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
46 
getPtNodeArraySizeAndAdvancePosition(const uint8_t * const buffer,int * const pos)47 /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
48         const uint8_t *const buffer, int *const pos) {
49     const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
50     if (firstByte < 0x80) {
51         return firstByte;
52     } else {
53         return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition(
54                 buffer, pos);
55     }
56 }
57 
getFlagsAndAdvancePosition(const uint8_t * const buffer,int * const pos)58 /* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition(
59         const uint8_t *const buffer, int *const pos) {
60     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
61 }
62 
getCodePointAndAdvancePosition(const uint8_t * const buffer,const int * const codePointTable,int * const pos)63 /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
64         const int *const codePointTable, int *const pos) {
65     return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
66 }
67 
68 // Returns the number of read characters.
getCharsAndAdvancePosition(const uint8_t * const buffer,const NodeFlags flags,const int maxLength,const int * const codePointTable,int * const outBuffer,int * const pos)69 /* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
70         const NodeFlags flags, const int maxLength, const int *const codePointTable,
71         int *const outBuffer, int *const pos) {
72     int length = 0;
73     if (hasMultipleChars(flags)) {
74         length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
75                 outBuffer, pos);
76     } else {
77         const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
78         if (codePoint == NOT_A_CODE_POINT) {
79             // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
80             // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
81             // when the PtNode has a single code point.
82             length = 0;
83             AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x",
84                     *pos - 1, codePoint, buffer[*pos - 1]);
85             ASSERT(false);
86         } else if (maxLength > 0) {
87             outBuffer[0] = codePoint;
88             length = 1;
89         }
90     }
91     return length;
92 }
93 
94 // Returns the number of skipped characters.
skipCharacters(const uint8_t * const buffer,const NodeFlags flags,const int maxLength,const int * const codePointTable,int * const pos)95 /* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
96         const int maxLength, const int *const codePointTable, int *const pos) {
97     if (hasMultipleChars(flags)) {
98         return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
99     } else {
100         if (maxLength > 0) {
101             getCodePointAndAdvancePosition(buffer, codePointTable, pos);
102             return 1;
103         } else {
104             return 0;
105         }
106     }
107 }
108 
readProbabilityAndAdvancePosition(const uint8_t * const buffer,int * const pos)109 /* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer,
110         int *const pos) {
111     return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
112 }
113 
readChildrenPositionAndAdvancePosition(const uint8_t * const buffer,const NodeFlags flags,int * const pos)114 /* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition(
115         const uint8_t *const buffer, const NodeFlags flags, int *const pos) {
116     const int base = *pos;
117     int offset = 0;
118     switch (MASK_CHILDREN_POSITION_TYPE & flags) {
119         case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE:
120             offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos);
121             break;
122         case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES:
123             offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos);
124             break;
125         case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES:
126             offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos);
127             break;
128         default:
129             // If we come here, it means we asked for the children of a word with
130             // no children.
131             return NOT_A_DICT_POS;
132     }
133     return base + offset;
134 }
135 
readPtNodeInfo(const uint8_t * const dictBuf,const int ptNodePos,const DictionaryShortcutsStructurePolicy * const shortcutPolicy,const DictionaryBigramsStructurePolicy * const bigramPolicy,const int * const codePointTable,NodeFlags * const outFlags,int * const outCodePointCount,int * const outCodePoint,int * const outProbability,int * const outChildrenPos,int * const outShortcutPos,int * const outBigramPos,int * const outSiblingPos)136 /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
137         const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
138         const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
139         NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
140         int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
141         int *const outBigramPos, int *const outSiblingPos) {
142     int readingPos = ptNodePos;
143     const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
144     *outFlags = flags;
145     *outCodePointCount = getCharsAndAdvancePosition(
146             dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
147     *outProbability = isTerminal(flags) ?
148             readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
149     *outChildrenPos = hasChildrenInFlags(flags) ?
150             readChildrenPositionAndAdvancePosition(dictBuf, flags, &readingPos) : NOT_A_DICT_POS;
151     *outShortcutPos = NOT_A_DICT_POS;
152     if (hasShortcutTargets(flags)) {
153         *outShortcutPos = readingPos;
154         shortcutPolicy->skipAllShortcuts(&readingPos);
155     }
156     *outBigramPos = NOT_A_DICT_POS;
157     if (hasBigrams(flags)) {
158         *outBigramPos = readingPos;
159         bigramPolicy->skipAllBigrams(&readingPos);
160     }
161     *outSiblingPos = readingPos;
162 }
163 
164 } // namespace latinime
165