1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
18 
19 #include <array>
20 #include <vector>
21 
22 #include "suggest/core/dicnode/dic_node.h"
23 #include "suggest/core/dicnode/dic_node_vector.h"
24 #include "dictionary/interface/ngram_listener.h"
25 #include "dictionary/property/ngram_context.h"
26 #include "dictionary/property/ngram_property.h"
27 #include "dictionary/property/unigram_property.h"
28 #include "dictionary/property/word_property.h"
29 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
30 #include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
31 #include "dictionary/utils/forgetting_curve_utils.h"
32 #include "dictionary/utils/multi_bigram_map.h"
33 #include "dictionary/utils/probability_utils.h"
34 #include "utils/ngram_utils.h"
35 
36 namespace latinime {
37 
38 // Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
39 // BinaryDictionaryDecayingTests.
40 const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
41 const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
42 const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
43 const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
44 const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
45 const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
46         Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
47 
createAndGetAllChildDicNodes(const DicNode * const dicNode,DicNodeVector * const childDicNodes) const48 void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
49         DicNodeVector *const childDicNodes) const {
50     if (!dicNode->hasChildren()) {
51         return;
52     }
53     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
54     readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
55     while (!readingHelper.isEnd()) {
56         const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
57         if (!ptNodeParams.isValid()) {
58             break;
59         }
60         const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
61         const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
62         childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
63                 wordId, ptNodeParams.getCodePointArrayView());
64         readingHelper.readNextSiblingNode(ptNodeParams);
65     }
66     if (readingHelper.isError()) {
67         mIsCorrupted = true;
68         AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
69     }
70 }
71 
getCodePointsAndReturnCodePointCount(const int wordId,const int maxCodePointCount,int * const outCodePoints) const72 int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
73         const int maxCodePointCount, int *const outCodePoints) const {
74     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
75     const int ptNodePos =
76             mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
77     readingHelper.initWithPtNodePos(ptNodePos);
78     const int codePointCount =  readingHelper.getCodePointsAndReturnCodePointCount(
79             maxCodePointCount, outCodePoints);
80     if (readingHelper.isError()) {
81         mIsCorrupted = true;
82         AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
83     }
84     return codePointCount;
85 }
86 
getWordId(const CodePointArrayView wordCodePoints,const bool forceLowerCaseSearch) const87 int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
88         const bool forceLowerCaseSearch) const {
89     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
90     readingHelper.initWithPtNodeArrayPos(getRootPosition());
91     const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
92             wordCodePoints.size(), forceLowerCaseSearch);
93     if (readingHelper.isError()) {
94         mIsCorrupted = true;
95         AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
96     }
97     if (ptNodePos == NOT_A_DICT_POS) {
98         return NOT_A_WORD_ID;
99     }
100     const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
101     if (ptNodeParams.isDeleted()) {
102         return NOT_A_WORD_ID;
103     }
104     return ptNodeParams.getTerminalId();
105 }
106 
getWordAttributesInContext(const WordIdArrayView prevWordIds,const int wordId,MultiBigramMap * const multiBigramMap) const107 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
108         const WordIdArrayView prevWordIds, const int wordId,
109         MultiBigramMap *const multiBigramMap) const {
110     if (wordId == NOT_A_WORD_ID) {
111         return WordAttributes();
112     }
113     return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
114             false /* mustMatchAllPrevWords */, mHeaderPolicy);
115 }
116 
getProbabilityOfWord(const WordIdArrayView prevWordIds,const int wordId) const117 int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
118         const int wordId) const {
119     if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
120         return NOT_A_PROBABILITY;
121     }
122     const WordAttributes wordAttributes =
123             mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
124                     true /* mustMatchAllPrevWords */, mHeaderPolicy);
125     if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) {
126         return NOT_A_PROBABILITY;
127     }
128     return wordAttributes.getProbability();
129 }
130 
getShortcutIterator(const int wordId) const131 BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
132         const int wordId) const {
133     const int shortcutPos = getShortcutPositionOfWord(wordId);
134     return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
135 }
136 
iterateNgramEntries(const WordIdArrayView prevWordIds,NgramListener * const listener) const137 void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
138         NgramListener *const listener) const {
139     if (prevWordIds.empty()) {
140         return;
141     }
142     const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
143     for (size_t i = 1; i <= prevWordIds.size(); ++i) {
144         for (const auto& entry : languageModelDictContent->getProbabilityEntries(
145                 prevWordIds.limit(i))) {
146             const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
147             if (!probabilityEntry.isValid()) {
148                 continue;
149             }
150             int probability = NOT_A_PROBABILITY;
151             if (probabilityEntry.hasHistoricalInfo()) {
152                 // TODO: Quit checking count here.
153                 // If count <= 1, the word can be an invaild word. The actual probability should
154                 // be checked using getWordAttributesInContext() in onVisitEntry().
155                 probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ?
156                         NOT_A_PROBABILITY : 0;
157             } else {
158                 probability = probabilityEntry.getProbability();
159             }
160             listener->onVisitEntry(probability, entry.getWordId());
161         }
162     }
163 }
164 
getShortcutPositionOfWord(const int wordId) const165 int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
166     if (wordId == NOT_A_WORD_ID) {
167         return NOT_A_DICT_POS;
168     }
169     const int ptNodePos =
170             mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
171     const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
172     if (ptNodeParams.isDeleted()) {
173         return NOT_A_DICT_POS;
174     }
175     return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
176             ptNodeParams.getTerminalId());
177 }
178 
addUnigramEntry(const CodePointArrayView wordCodePoints,const UnigramProperty * const unigramProperty)179 bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
180         const UnigramProperty *const unigramProperty) {
181     if (!mBuffers->isUpdatable()) {
182         AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
183         return false;
184     }
185     if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
186         AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
187                 mDictBuffer->getTailPosition());
188         return false;
189     }
190     if (wordCodePoints.size() > MAX_WORD_LENGTH) {
191         AKLOGE("The word is too long to insert to the dictionary, length: %zd",
192                 wordCodePoints.size());
193         return false;
194     }
195     for (const auto &shortcut : unigramProperty->getShortcuts()) {
196         if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
197             AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
198                     shortcut.getTargetCodePoints()->size());
199             return false;
200         }
201     }
202     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
203     readingHelper.initWithPtNodeArrayPos(getRootPosition());
204     bool addedNewUnigram = false;
205     int codePointsToAdd[MAX_WORD_LENGTH];
206     int codePointCountToAdd = wordCodePoints.size();
207     memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
208     if (unigramProperty->representsBeginningOfSentence()) {
209         codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
210                 codePointCountToAdd, MAX_WORD_LENGTH);
211     }
212     if (codePointCountToAdd <= 0) {
213         return false;
214     }
215     const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
216     if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
217             &addedNewUnigram)) {
218         if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
219             mEntryCounters.incrementNgramCount(NgramType::Unigram);
220         }
221         if (unigramProperty->getShortcuts().size() > 0) {
222             // Add shortcut target.
223             const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */);
224             if (wordId == NOT_A_WORD_ID) {
225                 AKLOGE("Cannot find word id to add shortcut target.");
226                 return false;
227             }
228             const int wordPos =
229                     mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
230             for (const auto &shortcut : unigramProperty->getShortcuts()) {
231                 if (!mUpdatingHelper.addShortcutTarget(wordPos,
232                         CodePointArrayView(*shortcut.getTargetCodePoints()),
233                         shortcut.getProbability())) {
234                     AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
235                             "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
236                             shortcut.getProbability());
237                     return false;
238                 }
239             }
240         }
241         return true;
242     } else {
243         return false;
244     }
245 }
246 
removeUnigramEntry(const CodePointArrayView wordCodePoints)247 bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
248     if (!mBuffers->isUpdatable()) {
249         AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
250         return false;
251     }
252     const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
253     if (wordId == NOT_A_WORD_ID) {
254         return false;
255     }
256     const int ptNodePos =
257             mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
258     const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
259     if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
260         AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
261         return false;
262     }
263     if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) {
264         return false;
265     }
266     if (!ptNodeParams.representsNonWordInfo()) {
267         mEntryCounters.decrementNgramCount(NgramType::Unigram);
268     }
269     return true;
270 }
271 
addNgramEntry(const NgramProperty * const ngramProperty)272 bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
273     if (!mBuffers->isUpdatable()) {
274         AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
275         return false;
276     }
277     if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
278         AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
279                 mDictBuffer->getTailPosition());
280         return false;
281     }
282     const NgramContext *const ngramContext = ngramProperty->getNgramContext();
283     if (!ngramContext->isValid()) {
284         AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
285         return false;
286     }
287     if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
288         AKLOGE("The word is too long to insert the ngram to the dictionary. "
289                 "length: %zd", ngramProperty->getTargetCodePoints()->size());
290         return false;
291     }
292     WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
293     const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
294             false /* tryLowerCaseSearch */);
295     if (prevWordIds.empty()) {
296         return false;
297     }
298     for (size_t i = 0; i < prevWordIds.size(); ++i) {
299         if (prevWordIds[i] != NOT_A_WORD_ID) {
300             continue;
301         }
302         if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) {
303             return false;
304         }
305         const UnigramProperty beginningOfSentenceUnigramProperty(
306                 true /* representsBeginningOfSentence */, true /* isNotAWord */,
307                 false /* isBlacklisted */, false /* isPossiblyOffensive */,
308                 MAX_PROBABILITY /* probability */, HistoricalInfo());
309         if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
310                 &beginningOfSentenceUnigramProperty)) {
311             AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
312             return false;
313         }
314         // Refresh word ids.
315         ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
316     }
317     const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()),
318             false /* forceLowerCaseSearch */);
319     if (wordId == NOT_A_WORD_ID) {
320         return false;
321     }
322     bool addedNewEntry = false;
323     if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) {
324         if (addedNewEntry) {
325             mEntryCounters.incrementNgramCount(
326                     NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
327         }
328         return true;
329     } else {
330         return false;
331     }
332 }
333 
removeNgramEntry(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints)334 bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
335         const CodePointArrayView wordCodePoints) {
336     if (!mBuffers->isUpdatable()) {
337         AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
338         return false;
339     }
340     if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
341         AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
342                 mDictBuffer->getTailPosition());
343         return false;
344     }
345     if (!ngramContext->isValid()) {
346         AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
347         return false;
348     }
349     if (wordCodePoints.size() > MAX_WORD_LENGTH) {
350         AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
351                 wordCodePoints.size());
352     }
353     WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
354     const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
355             false /* tryLowerCaseSerch */);
356     if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) {
357         return false;
358     }
359     const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
360     if (wordId == NOT_A_WORD_ID) {
361         return false;
362     }
363     if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) {
364         mEntryCounters.decrementNgramCount(
365                 NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
366         return true;
367     } else {
368         return false;
369     }
370 }
371 
updateEntriesForWordWithNgramContext(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints,const bool isValidWord,const HistoricalInfo historicalInfo)372 bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
373         const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
374         const bool isValidWord, const HistoricalInfo historicalInfo) {
375     if (!mBuffers->isUpdatable()) {
376         AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
377                 "dictionary.");
378         return false;
379     }
380     const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
381             false : isValidWord;
382     int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
383     if (wordId == NOT_A_WORD_ID) {
384         // The word is not in the dictionary.
385         const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
386                 false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
387                 NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
388                 0 /* count */));
389         if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
390             AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
391             return false;
392         }
393         if (!isValidWord) {
394             return true;
395         }
396         wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
397     }
398 
399     WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
400     const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
401             false /* tryLowerCaseSearch */);
402     if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
403         if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
404             const UnigramProperty beginningOfSentenceUnigramProperty(
405                     true /* representsBeginningOfSentence */,
406                     true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
407                     HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
408             if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
409                     &beginningOfSentenceUnigramProperty)) {
410                 AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
411                 return false;
412             }
413             // Refresh word ids.
414             ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
415         }
416         // Update entries for beginning of sentence.
417         if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
418                 prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo,
419                 mHeaderPolicy, &mEntryCounters)) {
420             return false;
421         }
422     }
423     if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
424             wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
425         return false;
426     }
427     return true;
428 }
429 
flush(const char * const filePath)430 bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
431     if (!mBuffers->isUpdatable()) {
432         AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
433         return false;
434     }
435     if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
436         AKLOGE("Cannot flush the dictionary to file.");
437         mIsCorrupted = true;
438         return false;
439     }
440     return true;
441 }
442 
flushWithGC(const char * const filePath)443 bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
444     if (!mBuffers->isUpdatable()) {
445         AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
446         return false;
447     }
448     if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
449         AKLOGE("Cannot flush the dictionary to file with GC.");
450         mIsCorrupted = true;
451         return false;
452     }
453     return true;
454 }
455 
needsToRunGC(const bool mindsBlockByGC) const456 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
457     if (!mBuffers->isUpdatable()) {
458         AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
459         return false;
460     }
461     if (mBuffers->isNearSizeLimit()) {
462         // Additional buffer size is near the limit.
463         return true;
464     } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
465             > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
466         // Total extended region size of the trie exceeds the limit.
467         return true;
468     } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
469             && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
470         // Needs to reduce dictionary size.
471         return true;
472     } else if (mHeaderPolicy->isDecayingDict()) {
473         return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
474                 mHeaderPolicy);
475     }
476     return false;
477 }
478 
getProperty(const char * const query,const int queryLength,char * const outResult,const int maxResultLength)479 void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
480         char *const outResult, const int maxResultLength) {
481     const int compareLength = queryLength + 1 /* terminator */;
482     if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
483         snprintf(outResult, maxResultLength, "%d",
484                 mEntryCounters.getNgramCount(NgramType::Unigram));
485     } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
486         snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
487     } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
488         snprintf(outResult, maxResultLength, "%d",
489                 mHeaderPolicy->isDecayingDict() ?
490                         ForgettingCurveUtils::getEntryCountHardLimit(
491                                 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
492                                         NgramType::Unigram)) :
493                         static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
494     } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
495         snprintf(outResult, maxResultLength, "%d",
496                 mHeaderPolicy->isDecayingDict() ?
497                         ForgettingCurveUtils::getEntryCountHardLimit(
498                                 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
499                                         NgramType::Bigram)) :
500                         static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
501     }
502 }
503 
getWordProperty(const CodePointArrayView wordCodePoints) const504 const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
505         const CodePointArrayView wordCodePoints) const {
506     const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
507     if (wordId == NOT_A_WORD_ID) {
508         AKLOGE("getWordProperty is called for invalid word.");
509         return WordProperty();
510     }
511     const LanguageModelDictContent *const languageModelDictContent =
512             mBuffers->getLanguageModelDictContent();
513     // Fetch ngram information.
514     std::vector<NgramProperty> ngrams;
515     int ngramTargetCodePoints[MAX_WORD_LENGTH];
516     int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
517     int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
518     bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
519     for (const auto& entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
520             mHeaderPolicy, wordId)) {
521         const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
522                 MAX_WORD_LENGTH, ngramTargetCodePoints);
523         const WordIdArrayView prevWordIds = entry.getPrevWordIds();
524         for (size_t i = 0; i < prevWordIds.size(); ++i) {
525             ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i],
526                        MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
527             ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry(
528                     prevWordIds[i]).representsBeginningOfSentence();
529             if (ngramPrevWordIsBeginningOfSentense[i]) {
530                 ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
531                         ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
532             }
533         }
534         const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
535                 ngramPrevWordIsBeginningOfSentense, prevWordIds.size());
536         const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
537         const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
538         // TODO: Output flags in WordAttributes.
539         ngrams.emplace_back(ngramContext,
540                 CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
541                 entry.getWordAttributes().getProbability(), *historicalInfo);
542     }
543     // Fetch shortcut information.
544     std::vector<UnigramProperty::ShortcutProperty> shortcuts;
545     int shortcutPos = getShortcutPositionOfWord(wordId);
546     if (shortcutPos != NOT_A_DICT_POS) {
547         int shortcutTarget[MAX_WORD_LENGTH];
548         const ShortcutDictContent *const shortcutDictContent =
549                 mBuffers->getShortcutDictContent();
550         bool hasNext = true;
551         while (hasNext) {
552             int shortcutTargetLength = 0;
553             int shortcutProbability = NOT_A_PROBABILITY;
554             shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
555                     &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
556             shortcuts.emplace_back(
557                     CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
558                     shortcutProbability);
559         }
560     }
561     const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
562             WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
563     const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
564     const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
565     const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
566             wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
567             wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
568             *historicalInfo, std::move(shortcuts));
569     return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
570 }
571 
getNextWordAndNextToken(const int token,int * const outCodePoints,int * const outCodePointCount)572 int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
573         int *const outCodePointCount) {
574     *outCodePointCount = 0;
575     if (token == 0) {
576         mTerminalPtNodePositionsForIteratingWords.clear();
577         DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
578                 &mTerminalPtNodePositionsForIteratingWords);
579         DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
580         readingHelper.initWithPtNodeArrayPos(getRootPosition());
581         readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
582     }
583     const int terminalPtNodePositionsVectorSize =
584             static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
585     if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
586         AKLOGE("Given token %d is invalid.", token);
587         return 0;
588     }
589     const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
590     const PtNodeParams ptNodeParams =
591             mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
592     *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(),
593             MAX_WORD_LENGTH, outCodePoints);
594     const int nextToken = token + 1;
595     if (nextToken >= terminalPtNodePositionsVectorSize) {
596         // All words have been iterated.
597         mTerminalPtNodePositionsForIteratingWords.clear();
598         return 0;
599     }
600     return nextToken;
601 }
602 
603 } // namespace latinime
604