1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "dictionary/structure/v2/patricia_trie_policy.h"
18
19 #include "defines.h"
20 #include "suggest/core/dicnode/dic_node.h"
21 #include "suggest/core/dicnode/dic_node_vector.h"
22 #include "dictionary/interface/ngram_listener.h"
23 #include "dictionary/property/ngram_context.h"
24 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
25 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
26 #include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
27 #include "dictionary/utils/multi_bigram_map.h"
28 #include "dictionary/utils/probability_utils.h"
29 #include "utils/char_utils.h"
30
31 namespace latinime {
32
createAndGetAllChildDicNodes(const DicNode * const dicNode,DicNodeVector * const childDicNodes) const33 void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
34 DicNodeVector *const childDicNodes) const {
35 if (!dicNode->hasChildren()) {
36 return;
37 }
38 int nextPos = dicNode->getChildrenPtNodeArrayPos();
39 if (!isValidPos(nextPos)) {
40 AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
41 nextPos, mBuffer.size());
42 mIsCorrupted = true;
43 ASSERT(false);
44 return;
45 }
46 const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
47 mBuffer.data(), &nextPos);
48 for (int i = 0; i < childCount; i++) {
49 if (!isValidPos(nextPos)) {
50 AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
51 nextPos, mBuffer.size(), i, childCount);
52 mIsCorrupted = true;
53 ASSERT(false);
54 return;
55 }
56 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes);
57 }
58 }
59
getCodePointsAndReturnCodePointCount(const int wordId,const int maxCodePointCount,int * const outCodePoints) const60 int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
61 const int maxCodePointCount, int *const outCodePoints) const {
62 return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount,
63 outCodePoints, nullptr /* outUnigramProbability */);
64 }
65 // This retrieves code points and the probability of the word by its id.
66 // Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
67 // it is possible to check for this with advantageous complexity. For each PtNode array, we search
68 // for PtNodes with children and compare the children position with the position we look for.
69 // When we shoot the position we look for, it means the word we look for is in the children
70 // of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a
71 // PtNode array with the last PtNode's children position still less than what we are searching for,
72 // we must descend the last PtNode's children (for example, if the word we are searching for starts
73 // with a z, it's the last PtNode of the root array, so all children addresses will be smaller
74 // than the position we look for, and we have to descend the z PtNode).
75 /* Parameters :
76 * wordId: Id of the word we are searching for.
77 * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
78 * outUnigramProbability: a pointer to an int to write the probability into.
79 * Return value : the code point count, of 0 if the word was not found.
80 */
81 // TODO: Split this function to be more readable
getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId,const int maxCodePointCount,int * const outCodePoints,int * const outUnigramProbability) const82 int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
83 const int wordId, const int maxCodePointCount, int *const outCodePoints,
84 int *const outUnigramProbability) const {
85 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
86 int pos = getRootPosition();
87 int wordPos = 0;
88 const int *const codePointTable = mHeaderPolicy.getCodePointTable();
89 if (outUnigramProbability) {
90 *outUnigramProbability = NOT_A_PROBABILITY;
91 }
92 // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
93 // only traverse PtNodes that are actually a part of the terminal we are searching, so each
94 // time we enter this loop we are one depth level further than last time.
95 // The only reason we count PtNodes is because we want to reduce the probability of infinite
96 // looping in case there is a bug. Since we know there is an upper bound to the depth we are
97 // supposed to traverse, it does not hurt to count iterations.
98 for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) {
99 int lastCandidatePtNodePos = 0;
100 // Let's loop through PtNodes in this PtNode array searching for either the terminal
101 // or one of its ascendants.
102 if (!isValidPos(pos)) {
103 AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
104 pos, mBuffer.size());
105 mIsCorrupted = true;
106 ASSERT(false);
107 return 0;
108 }
109 for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
110 mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
111 const int startPos = pos;
112 if (!isValidPos(pos)) {
113 AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
114 mIsCorrupted = true;
115 ASSERT(false);
116 return 0;
117 }
118 const PatriciaTrieReadingUtils::NodeFlags flags =
119 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
120 const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
121 mBuffer.data(), codePointTable, &pos);
122 if (ptNodePos == startPos) {
123 // We found the position. Copy the rest of the code points in the buffer and return
124 // the length.
125 outCodePoints[wordPos] = character;
126 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
127 int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
128 mBuffer.data(), codePointTable, &pos);
129 // We count code points in order to avoid infinite loops if the file is broken
130 // or if there is some other bug
131 int charCount = maxCodePointCount;
132 while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
133 outCodePoints[++wordPos] = nextChar;
134 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
135 mBuffer.data(), codePointTable, &pos);
136 }
137 }
138 if (outUnigramProbability) {
139 *outUnigramProbability =
140 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
141 mBuffer.data(), &pos);
142 }
143 return ++wordPos;
144 }
145 // We need to skip past this PtNode, so skip any remaining code points after the
146 // first and possibly the probability.
147 if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
148 PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
149 codePointTable, &pos);
150 }
151 if (PatriciaTrieReadingUtils::isTerminal(flags)) {
152 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
153 }
154 // The fact that this PtNode has children is very important. Since we already know
155 // that this PtNode does not match, if it has no children we know it is irrelevant
156 // to what we are searching for.
157 const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags);
158 // We will write in `found' whether we have passed the children position we are
159 // searching for. For example if we search for "beer", the children of b are less
160 // than the address we are searching for and the children of c are greater. When we
161 // come here for c, we realize this is too big, and that we should descend b.
162 bool found;
163 if (hasChildren) {
164 int currentPos = pos;
165 // Here comes the tricky part. First, read the children position.
166 const int childrenPos = PatriciaTrieReadingUtils
167 ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
168 ¤tPos);
169 if (childrenPos > ptNodePos) {
170 // If the children pos is greater than the position, it means the previous
171 // PtNode, which position is stored in lastCandidatePtNodePos, was the right
172 // one.
173 found = true;
174 } else if (1 >= ptNodeCount) {
175 // However if we are on the LAST PtNode of this array, and we have NOT shot the
176 // position we should descend THIS PtNode. So we trick the
177 // lastCandidatePtNodePos so that we will descend this PtNode, not the previous
178 // one.
179 lastCandidatePtNodePos = startPos;
180 found = true;
181 } else {
182 // Else, we should continue looking.
183 found = false;
184 }
185 } else {
186 // Even if we don't have children here, we could still be on the last PtNode of
187 // this array. If this is the case, we should descend the last PtNode that had
188 // children, and their position is already in lastCandidatePtNodePos.
189 found = (1 >= ptNodeCount);
190 }
191
192 if (found) {
193 // Okay, we found the PtNode we should descend. Its position is in
194 // the lastCandidatePtNodePos variable, so we just re-read it.
195 if (0 != lastCandidatePtNodePos) {
196 const PatriciaTrieReadingUtils::NodeFlags lastFlags =
197 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
198 mBuffer.data(), &lastCandidatePtNodePos);
199 const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
200 mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
201 // We copy all the characters in this PtNode to the buffer
202 outCodePoints[wordPos] = lastChar;
203 if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
204 int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
205 mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
206 int charCount = maxCodePointCount;
207 while (-1 != nextChar && --charCount > 0) {
208 outCodePoints[++wordPos] = nextChar;
209 nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
210 mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
211 }
212 }
213 ++wordPos;
214 // Now we only need to branch to the children address. Skip the probability if
215 // it's there, read pos, and break to resume the search at pos.
216 if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
217 PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
218 &lastCandidatePtNodePos);
219 }
220 pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
221 mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
222 break;
223 } else {
224 // Here is a little tricky part: we come here if we found out that all children
225 // addresses in this PtNode are bigger than the address we are searching for.
226 // Should we conclude the word is not in the dictionary? No! It could still be
227 // one of the remaining PtNodes in this array, so we have to keep looking in
228 // this array until we find it (or we realize it's not there either, in which
229 // case it's actually not in the dictionary). Pass the end of this PtNode,
230 // ready to start the next one.
231 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
232 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
233 mBuffer.data(), flags, &pos);
234 }
235 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
236 mShortcutListPolicy.skipAllShortcuts(&pos);
237 }
238 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
239 if (!mBigramListPolicy.skipAllBigrams(&pos)) {
240 AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
241 pos);
242 mIsCorrupted = true;
243 ASSERT(false);
244 return 0;
245 }
246 }
247 }
248 } else {
249 // If we did not find it, we should record the last children address for the next
250 // iteration.
251 if (hasChildren) lastCandidatePtNodePos = startPos;
252 // Now skip the end of this PtNode (children pos and the attributes if any) so that
253 // our pos is after the end of this PtNode, at the start of the next one.
254 if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
255 PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
256 mBuffer.data(), flags, &pos);
257 }
258 if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
259 mShortcutListPolicy.skipAllShortcuts(&pos);
260 }
261 if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
262 if (!mBigramListPolicy.skipAllBigrams(&pos)) {
263 AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
264 mIsCorrupted = true;
265 ASSERT(false);
266 return 0;
267 }
268 }
269 }
270
271 }
272 }
273 // If we have looked through all the PtNodes and found no match, the ptNodePos is
274 // not the position of a terminal in this dictionary.
275 return 0;
276 }
277
278 // This function gets the position of the terminal PtNode of the exact matching word in the
279 // dictionary. If no match is found, it returns NOT_A_WORD_ID.
getWordId(const CodePointArrayView wordCodePoints,const bool forceLowerCaseSearch) const280 int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
281 const bool forceLowerCaseSearch) const {
282 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
283 readingHelper.initWithPtNodeArrayPos(getRootPosition());
284 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
285 wordCodePoints.size(), forceLowerCaseSearch);
286 if (readingHelper.isError()) {
287 mIsCorrupted = true;
288 AKLOGE("Dictionary reading error in getWordId().");
289 }
290 return getWordIdFromTerminalPtNodePos(ptNodePos);
291 }
292
getWordAttributesInContext(const WordIdArrayView prevWordIds,const int wordId,MultiBigramMap * const multiBigramMap) const293 const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
294 const WordIdArrayView prevWordIds, const int wordId,
295 MultiBigramMap *const multiBigramMap) const {
296 if (wordId == NOT_A_WORD_ID) {
297 return WordAttributes();
298 }
299 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
300 const PtNodeParams ptNodeParams =
301 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
302 if (multiBigramMap) {
303 const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
304 prevWordIds, wordId, ptNodeParams.getProbability());
305 return getWordAttributes(probability, ptNodeParams);
306 }
307 if (!prevWordIds.empty()) {
308 const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
309 if (bigramProbability != NOT_A_PROBABILITY) {
310 return getWordAttributes(bigramProbability, ptNodeParams);
311 }
312 }
313 return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
314 ptNodeParams);
315 }
316
getWordAttributes(const int probability,const PtNodeParams & ptNodeParams) const317 const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
318 const PtNodeParams &ptNodeParams) const {
319 return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
320 ptNodeParams.isPossiblyOffensive());
321 }
322
getProbability(const int unigramProbability,const int bigramProbability) const323 int PatriciaTriePolicy::getProbability(const int unigramProbability,
324 const int bigramProbability) const {
325 // Due to space constraints, the probability for bigrams is approximate - the lower the unigram
326 // probability, the worse the precision. The theoritical maximum error in resulting probability
327 // is 8 - although in the practice it's never bigger than 3 or 4 in very bad cases. This means
328 // that sometimes, we'll see some bigrams interverted here, but it can't get too bad.
329 if (unigramProbability == NOT_A_PROBABILITY) {
330 return NOT_A_PROBABILITY;
331 } else if (bigramProbability == NOT_A_PROBABILITY) {
332 return ProbabilityUtils::backoff(unigramProbability);
333 } else {
334 return ProbabilityUtils::computeProbabilityForBigram(unigramProbability,
335 bigramProbability);
336 }
337 }
338
getProbabilityOfWord(const WordIdArrayView prevWordIds,const int wordId) const339 int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
340 const int wordId) const {
341 if (wordId == NOT_A_WORD_ID) {
342 return NOT_A_PROBABILITY;
343 }
344 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
345 const PtNodeParams ptNodeParams =
346 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
347 if (ptNodeParams.isNotAWord()) {
348 // If this is not a word, it should behave as having no probability outside of the
349 // suggestion process (where it should be used for shortcuts).
350 return NOT_A_PROBABILITY;
351 }
352 if (!prevWordIds.empty()) {
353 const int bigramsPosition = getBigramsPositionOfPtNode(
354 getTerminalPtNodePosFromWordId(prevWordIds[0]));
355 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
356 while (bigramsIt.hasNext()) {
357 bigramsIt.next();
358 if (bigramsIt.getBigramPos() == ptNodePos
359 && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
360 return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
361 }
362 }
363 return NOT_A_PROBABILITY;
364 }
365 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
366 }
367
iterateNgramEntries(const WordIdArrayView prevWordIds,NgramListener * const listener) const368 void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
369 NgramListener *const listener) const {
370 if (prevWordIds.empty()) {
371 return;
372 }
373 const int bigramsPosition = getBigramsPositionOfPtNode(
374 getTerminalPtNodePosFromWordId(prevWordIds[0]));
375 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
376 while (bigramsIt.hasNext()) {
377 bigramsIt.next();
378 listener->onVisitEntry(bigramsIt.getProbability(),
379 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
380 }
381 }
382
getShortcutIterator(const int wordId) const383 BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
384 const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
385 return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos);
386 }
387
getShortcutPositionOfPtNode(const int ptNodePos) const388 int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
389 if (ptNodePos == NOT_A_DICT_POS) {
390 return NOT_A_DICT_POS;
391 }
392 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getShortcutPos();
393 }
394
getBigramsPositionOfPtNode(const int ptNodePos) const395 int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
396 if (ptNodePos == NOT_A_DICT_POS) {
397 return NOT_A_DICT_POS;
398 }
399 return mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos).getBigramsPos();
400 }
401
createAndGetLeavingChildNode(const DicNode * const dicNode,const int ptNodePos,DicNodeVector * childDicNodes) const402 int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode,
403 const int ptNodePos, DicNodeVector *childDicNodes) const {
404 PatriciaTrieReadingUtils::NodeFlags flags;
405 int mergedNodeCodePointCount = 0;
406 int mergedNodeCodePoints[MAX_WORD_LENGTH];
407 int probability = NOT_A_PROBABILITY;
408 int childrenPos = NOT_A_DICT_POS;
409 int shortcutPos = NOT_A_DICT_POS;
410 int bigramPos = NOT_A_DICT_POS;
411 int siblingPos = NOT_A_DICT_POS;
412 const int *const codePointTable = mHeaderPolicy.getCodePointTable();
413 PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
414 &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
415 mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
416 &siblingPos);
417 // Skip PtNodes don't start with Unicode code point because they represent non-word information.
418 if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
419 const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
420 childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
421 CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
422 }
423 return siblingPos;
424 }
425
getWordProperty(const CodePointArrayView wordCodePoints) const426 const WordProperty PatriciaTriePolicy::getWordProperty(
427 const CodePointArrayView wordCodePoints) const {
428 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
429 if (wordId == NOT_A_WORD_ID) {
430 AKLOGE("getWordProperty was called for invalid word.");
431 return WordProperty();
432 }
433 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
434 const PtNodeParams ptNodeParams =
435 mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
436 // Fetch bigram information.
437 std::vector<NgramProperty> ngrams;
438 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
439 int bigramWord1CodePoints[MAX_WORD_LENGTH];
440 BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
441 while (bigramsIt.hasNext()) {
442 // Fetch the next bigram information and forward the iterator.
443 bigramsIt.next();
444 // Skip the entry if the entry has been deleted. This never happens for ver2 dicts.
445 if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
446 int word1Probability = NOT_A_PROBABILITY;
447 const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
448 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
449 bigramWord1CodePoints, &word1Probability);
450 const int probability = getProbability(word1Probability, bigramsIt.getProbability());
451 ngrams.emplace_back(
452 NgramContext(wordCodePoints.data(), wordCodePoints.size(),
453 ptNodeParams.representsBeginningOfSentence()),
454 CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
455 probability, HistoricalInfo());
456 }
457 }
458 // Fetch shortcut information.
459 std::vector<UnigramProperty::ShortcutProperty> shortcuts;
460 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
461 if (shortcutPos != NOT_A_DICT_POS) {
462 int shortcutTargetCodePoints[MAX_WORD_LENGTH];
463 ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
464 bool hasNext = true;
465 while (hasNext) {
466 const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
467 ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
468 hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
469 const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
470 mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
471 const int shortcutProbability =
472 ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
473 shortcuts.emplace_back(
474 CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
475 shortcutProbability);
476 }
477 }
478 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
479 ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
480 ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
481 return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
482 }
483
getNextWordAndNextToken(const int token,int * const outCodePoints,int * const outCodePointCount)484 int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
485 int *const outCodePointCount) {
486 *outCodePointCount = 0;
487 if (token == 0) {
488 // Start iterating the dictionary.
489 mTerminalPtNodePositionsForIteratingWords.clear();
490 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
491 &mTerminalPtNodePositionsForIteratingWords);
492 DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
493 readingHelper.initWithPtNodeArrayPos(getRootPosition());
494 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
495 }
496 const int terminalPtNodePositionsVectorSize =
497 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
498 if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
499 AKLOGE("Given token %d is invalid.", token);
500 return 0;
501 }
502 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
503 *outCodePointCount = getCodePointsAndReturnCodePointCount(
504 getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
505 const int nextToken = token + 1;
506 if (nextToken >= terminalPtNodePositionsVectorSize) {
507 // All words have been iterated.
508 mTerminalPtNodePositionsForIteratingWords.clear();
509 return 0;
510 }
511 return nextToken;
512 }
513
getWordIdFromTerminalPtNodePos(const int ptNodePos) const514 int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
515 return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
516 }
517
getTerminalPtNodePosFromWordId(const int wordId) const518 int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
519 return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
520 }
521
isValidPos(const int pos) const522 bool PatriciaTriePolicy::isValidPos(const int pos) const {
523 return pos >= 0 && pos < static_cast<int>(mBuffer.size());
524 }
525
526 } // namespace latinime
527