1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
18 
19 #include "dictionary/structure/pt_common/pt_node_array_reader.h"
20 #include "utils/char_utils.h"
21 
22 namespace latinime {
23 
24 // To avoid infinite loop caused by invalid or malicious forward links.
25 const int DynamicPtReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
26 const int DynamicPtReadingHelper::MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000;
27 const size_t DynamicPtReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH;
28 
onVisitingPtNode(const PtNodeParams * const ptNodeParams)29 bool DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions::onVisitingPtNode(
30         const PtNodeParams *const ptNodeParams) {
31     if (ptNodeParams->isTerminal() && !ptNodeParams->isDeleted()) {
32         mTerminalPositions->push_back(ptNodeParams->getHeadPos());
33     }
34     return true;
35 }
36 
37 // Visits all PtNodes in post-order depth first manner.
38 // For example, visits c -> b -> y -> x -> a for the following dictionary:
39 // a _ b _ c
40 //   \ x _ y
traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener * const listener)41 bool DynamicPtReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner(
42         TraversingEventListener *const listener) {
43     bool alreadyVisitedChildren = false;
44     // Descend from the root to the root PtNode array.
45     if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
46         return false;
47     }
48     while (!isEnd()) {
49         const PtNodeParams ptNodeParams(getPtNodeParams());
50         if (!ptNodeParams.isValid()) {
51             break;
52         }
53         if (!alreadyVisitedChildren) {
54             if (ptNodeParams.hasChildren()) {
55                 // Move to the first child.
56                 if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
57                     return false;
58                 }
59                 pushReadingStateToStack();
60                 readChildNode(ptNodeParams);
61             } else {
62                 alreadyVisitedChildren = true;
63             }
64         } else {
65             if (!listener->onVisitingPtNode(&ptNodeParams)) {
66                 return false;
67             }
68             readNextSiblingNode(ptNodeParams);
69             if (isEnd()) {
70                 // All PtNodes in current linked PtNode arrays have been visited.
71                 // Return to the parent.
72                 if (!listener->onReadingPtNodeArrayTail()) {
73                     return false;
74                 }
75                 if (mReadingStateStack.size() <= 0) {
76                     break;
77                 }
78                 if (!listener->onAscend()) {
79                     return false;
80                 }
81                 popReadingStateFromStack();
82                 alreadyVisitedChildren = true;
83             } else {
84                 // Process sibling PtNode.
85                 alreadyVisitedChildren = false;
86             }
87         }
88     }
89     // Ascend from the root PtNode array to the root.
90     if (!listener->onAscend()) {
91         return false;
92     }
93     return !isError();
94 }
95 
96 // Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order
97 // that PtNodes are written in the dictionary buffer.
98 // For example, visits a -> b -> x -> c -> y for the following dictionary:
99 // a _ b _ c
100 //   \ x _ y
traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(TraversingEventListener * const listener)101 bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
102         TraversingEventListener *const listener) {
103     bool alreadyVisitedAllPtNodesInArray = false;
104     bool alreadyVisitedChildren = false;
105     // Descend from the root to the root PtNode array.
106     if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) {
107         return false;
108     }
109     if (isEnd()) {
110         // Empty dictionary. Needs to notify the listener of the tail of empty PtNode array.
111         if (!listener->onReadingPtNodeArrayTail()) {
112             return false;
113         }
114     }
115     pushReadingStateToStack();
116     while (!isEnd()) {
117         const PtNodeParams ptNodeParams(getPtNodeParams());
118         if (!ptNodeParams.isValid()) {
119             break;
120         }
121         if (alreadyVisitedAllPtNodesInArray) {
122             if (alreadyVisitedChildren) {
123                 // Move to next sibling PtNode's children.
124                 readNextSiblingNode(ptNodeParams);
125                 if (isEnd()) {
126                     // Return to the parent PTNode.
127                     if (!listener->onAscend()) {
128                         return false;
129                     }
130                     if (mReadingStateStack.size() <= 0) {
131                         break;
132                     }
133                     popReadingStateFromStack();
134                     alreadyVisitedChildren = true;
135                     alreadyVisitedAllPtNodesInArray = true;
136                 } else {
137                     alreadyVisitedChildren = false;
138                 }
139             } else {
140                 if (ptNodeParams.hasChildren()) {
141                     // Move to the first child.
142                     if (!listener->onDescend(ptNodeParams.getChildrenPos())) {
143                         return false;
144                     }
145                     pushReadingStateToStack();
146                     readChildNode(ptNodeParams);
147                     // Push state to return the head of PtNode array.
148                     pushReadingStateToStack();
149                     alreadyVisitedAllPtNodesInArray = false;
150                     alreadyVisitedChildren = false;
151                 } else {
152                     alreadyVisitedChildren = true;
153                 }
154             }
155         } else {
156             if (!listener->onVisitingPtNode(&ptNodeParams)) {
157                 return false;
158             }
159             readNextSiblingNode(ptNodeParams);
160             if (isEnd()) {
161                 if (!listener->onReadingPtNodeArrayTail()) {
162                     return false;
163                 }
164                 // Return to the head of current PtNode array.
165                 popReadingStateFromStack();
166                 alreadyVisitedAllPtNodesInArray = true;
167             }
168         }
169     }
170     popReadingStateFromStack();
171     // Ascend from the root PtNode array to the root.
172     if (!listener->onAscend()) {
173         return false;
174     }
175     return !isError();
176 }
177 
getCodePointsAndReturnCodePointCount(const int maxCodePointCount,int * const outCodePoints)178 int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount,
179         int *const outCodePoints) {
180     // This method traverses parent nodes from the terminal by following parent pointers; thus,
181     // node code points are stored in the buffer in the reverse order.
182     int reverseCodePoints[maxCodePointCount];
183     const PtNodeParams terminalPtNodeParams(getPtNodeParams());
184     // First, read the terminal node and get its probability.
185     if (!isValidTerminalNode(terminalPtNodeParams)) {
186         // Node at the ptNodePos is not a valid terminal node.
187         return 0;
188     }
189     // Then, following parent node link to the dictionary root and fetch node code points.
190     int totalCodePointCount = 0;
191     while (!isEnd()) {
192         const PtNodeParams ptNodeParams(getPtNodeParams());
193         totalCodePointCount = getTotalCodePointCount(ptNodeParams);
194         if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) {
195             // The ptNodePos is not a valid terminal node position in the dictionary.
196             return 0;
197         }
198         // Store node code points to buffer in the reverse order.
199         fetchMergedNodeCodePointsInReverseOrder(ptNodeParams, getPrevTotalCodePointCount(),
200                 reverseCodePoints);
201         // Follow parent node toward the root node.
202         readParentNode(ptNodeParams);
203     }
204     if (isError()) {
205         // The node position or the dictionary is invalid.
206         return 0;
207     }
208     // Reverse the stored code points to output them.
209     for (int i = 0; i < totalCodePointCount; ++i) {
210         outCodePoints[i] = reverseCodePoints[totalCodePointCount - i - 1];
211     }
212     return totalCodePointCount;
213 }
214 
getTerminalPtNodePositionOfWord(const int * const inWord,const size_t length,const bool forceLowerCaseSearch)215 int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord,
216         const size_t length, const bool forceLowerCaseSearch) {
217     int searchCodePoints[length];
218     for (size_t i = 0; i < length; ++i) {
219         searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
220     }
221     while (!isEnd()) {
222         const PtNodeParams ptNodeParams(getPtNodeParams());
223         const int matchedCodePointCount = getPrevTotalCodePointCount();
224         if (getTotalCodePointCount(ptNodeParams) > length
225                 || !isMatchedCodePoint(ptNodeParams, 0 /* index */,
226                         searchCodePoints[matchedCodePointCount])) {
227             // Current node has too many code points or its first code point is different from
228             // target code point. Skip this node and read the next sibling node.
229             readNextSiblingNode(ptNodeParams);
230             continue;
231         }
232         // Check following merged node code points.
233         const int nodeCodePointCount = ptNodeParams.getCodePointCount();
234         for (int j = 1; j < nodeCodePointCount; ++j) {
235             if (!isMatchedCodePoint(ptNodeParams, j, searchCodePoints[matchedCodePointCount + j])) {
236                 // Different code point is found. The given word is not included in the dictionary.
237                 return NOT_A_DICT_POS;
238             }
239         }
240         // All characters are matched.
241         if (length == getTotalCodePointCount(ptNodeParams)) {
242             if (!ptNodeParams.isTerminal()) {
243                 return NOT_A_DICT_POS;
244             }
245             // Terminal position is found.
246             return ptNodeParams.getHeadPos();
247         }
248         if (!ptNodeParams.hasChildren()) {
249             return NOT_A_DICT_POS;
250         }
251         // Advance to the children nodes.
252         readChildNode(ptNodeParams);
253     }
254     // If we already traversed the tree further than the word is long, there means
255     // there was no match (or we would have found it).
256     return NOT_A_DICT_POS;
257 }
258 
259 // Read node array size and process empty node arrays. Nodes and arrays are counted up in this
260 // method to avoid an infinite loop.
nextPtNodeArray()261 void DynamicPtReadingHelper::nextPtNodeArray() {
262     int ptNodeCountInArray = 0;
263     int firstPtNodePos = NOT_A_DICT_POS;
264     if (!mPtNodeArrayReader->readPtNodeArrayInfoAndReturnIfValid(
265             mReadingState.mPos, &ptNodeCountInArray, &firstPtNodePos)) {
266         mIsError = true;
267         mReadingState.mPos = NOT_A_DICT_POS;
268         return;
269     }
270     mReadingState.mPosOfThisPtNodeArrayHead = mReadingState.mPos;
271     mReadingState.mRemainingPtNodeCountInThisArray = ptNodeCountInArray;
272     mReadingState.mPos = firstPtNodePos;
273     // Count up nodes and node arrays to avoid infinite loop.
274     mReadingState.mTotalPtNodeIndexInThisArrayChain +=
275             mReadingState.mRemainingPtNodeCountInThisArray;
276     mReadingState.mPtNodeArrayIndexInThisArrayChain++;
277     if (mReadingState.mRemainingPtNodeCountInThisArray < 0
278             || mReadingState.mTotalPtNodeIndexInThisArrayChain
279                     > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP
280             || mReadingState.mPtNodeArrayIndexInThisArrayChain
281                     > MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) {
282         // Invalid dictionary.
283         AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d"
284                 "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d",
285                 mReadingState.mRemainingPtNodeCountInThisArray,
286                 mReadingState.mTotalPtNodeIndexInThisArrayChain,
287                 MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP,
288                 mReadingState.mPtNodeArrayIndexInThisArrayChain,
289                 MAX_PT_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP);
290         ASSERT(false);
291         mIsError = true;
292         mReadingState.mPos = NOT_A_DICT_POS;
293         return;
294     }
295     if (mReadingState.mRemainingPtNodeCountInThisArray == 0) {
296         // Empty node array. Try following forward link.
297         followForwardLink();
298     }
299 }
300 
301 // Follow the forward link and read the next node array if exists.
followForwardLink()302 void DynamicPtReadingHelper::followForwardLink() {
303     int nextPtNodeArrayPos = NOT_A_DICT_POS;
304     if (!mPtNodeArrayReader->readForwardLinkAndReturnIfValid(
305             mReadingState.mPos, &nextPtNodeArrayPos)) {
306         mIsError = true;
307         mReadingState.mPos = NOT_A_DICT_POS;
308         return;
309     }
310     mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos;
311     if (nextPtNodeArrayPos != NOT_A_DICT_POS) {
312         // Follow the forward link.
313         mReadingState.mPos = nextPtNodeArrayPos;
314         nextPtNodeArray();
315     } else {
316         // All node arrays have been read.
317         mReadingState.mPos = NOT_A_DICT_POS;
318     }
319 }
320 
321 } // namespace latinime
322