1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
19  * Do not edit this file other than updating policy's interface.
20  *
21  * This file was generated from
22  *   dictionary/structure/v4/ver4_patricia_trie_policy.cpp
23  */
24 
25 #include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
26 
27 #include <vector>
28 
29 #include "suggest/core/dicnode/dic_node.h"
30 #include "suggest/core/dicnode/dic_node_vector.h"
31 #include "dictionary/interface/ngram_listener.h"
32 #include "dictionary/property/ngram_context.h"
33 #include "dictionary/property/ngram_property.h"
34 #include "dictionary/property/unigram_property.h"
35 #include "dictionary/property/word_property.h"
36 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
37 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
38 #include "dictionary/utils/forgetting_curve_utils.h"
39 #include "dictionary/utils/multi_bigram_map.h"
40 #include "dictionary/utils/probability_utils.h"
41 
42 namespace latinime {
43 namespace backward {
44 namespace v402 {
45 
46 // Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
47 // BinaryDictionaryDecayingTests.
48 const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
49 const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
50 const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
51 const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
52 const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
53 const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
54         Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
55 const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
56 
createAndGetAllChildDicNodes(const DicNode * const dicNode,DicNodeVector * const childDicNodes) const57 void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
58         DicNodeVector *const childDicNodes) const {
59     if (!dicNode->hasChildren()) {
60         return;
61     }
62     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
63     readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
64     while (!readingHelper.isEnd()) {
65         const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
66         if (!ptNodeParams.isValid()) {
67             break;
68         }
69         bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
70         if (isTerminal && mHeaderPolicy->isDecayingDict()) {
71             // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
72             // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
73             // valid terminal DicNode.
74             isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
75         }
76         readingHelper.readNextSiblingNode(ptNodeParams);
77         if (ptNodeParams.representsNonWordInfo()) {
78             // Skip PtNodes that represent non-word information.
79             continue;
80         }
81         const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
82         childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
83                 wordId, ptNodeParams.getCodePointArrayView());
84     }
85     if (readingHelper.isError()) {
86         mIsCorrupted = true;
87         AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
88     }
89 }
90 
getCodePointsAndReturnCodePointCount(const int wordId,const int maxCodePointCount,int * const outCodePoints) const91 int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
92         const int maxCodePointCount, int *const outCodePoints) const {
93     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
94     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
95     readingHelper.initWithPtNodePos(ptNodePos);
96     const int codePointCount =  readingHelper.getCodePointsAndReturnCodePointCount(
97             maxCodePointCount, outCodePoints);
98     if (readingHelper.isError()) {
99         mIsCorrupted = true;
100         AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
101     }
102     return codePointCount;
103 }
104 
getWordId(const CodePointArrayView wordCodePoints,const bool forceLowerCaseSearch) const105 int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
106         const bool forceLowerCaseSearch) const {
107     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
108     readingHelper.initWithPtNodeArrayPos(getRootPosition());
109     const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
110             wordCodePoints.size(), forceLowerCaseSearch);
111     if (readingHelper.isError()) {
112         mIsCorrupted = true;
113         AKLOGE("Dictionary reading error in getWordId().");
114     }
115     return getWordIdFromTerminalPtNodePos(ptNodePos);
116 }
117 
getWordAttributesInContext(const WordIdArrayView prevWordIds,const int wordId,MultiBigramMap * const multiBigramMap) const118 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
119         const WordIdArrayView prevWordIds, const int wordId,
120         MultiBigramMap *const multiBigramMap) const {
121     if (wordId == NOT_A_WORD_ID) {
122         return WordAttributes();
123     }
124     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
125     const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
126     if (multiBigramMap) {
127         const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
128                 prevWordIds, wordId, ptNodeParams.getProbability());
129         return getWordAttributes(probability, ptNodeParams);
130     }
131     if (!prevWordIds.empty()) {
132         const int probability = getProbabilityOfWord(prevWordIds, wordId);
133         if (probability != NOT_A_PROBABILITY) {
134             return getWordAttributes(probability, ptNodeParams);
135         }
136     }
137     return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
138             ptNodeParams);
139 }
140 
getWordAttributes(const int probability,const PtNodeParams & ptNodeParams) const141 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability,
142         const PtNodeParams &ptNodeParams) const {
143     return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
144             ptNodeParams.getProbability() == 0);
145 }
146 
getProbability(const int unigramProbability,const int bigramProbability) const147 int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
148         const int bigramProbability) const {
149     // In the v4 format, bigramProbability is a conditional probability.
150     const int bigramConditionalProbability = bigramProbability;
151     if (unigramProbability == NOT_A_PROBABILITY) {
152         return NOT_A_PROBABILITY;
153     }
154     if (bigramConditionalProbability == NOT_A_PROBABILITY) {
155         return ProbabilityUtils::backoff(unigramProbability);
156     }
157     return bigramConditionalProbability;
158 }
159 
getProbabilityOfWord(const WordIdArrayView prevWordIds,const int wordId) const160 int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
161         const int wordId) const {
162     if (wordId == NOT_A_WORD_ID) {
163         return NOT_A_PROBABILITY;
164     }
165     const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
166     const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
167     if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) {
168         return NOT_A_PROBABILITY;
169     }
170     if (prevWordIds.empty()) {
171         return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
172     }
173     if (prevWordIds[0] == NOT_A_WORD_ID) {
174         return NOT_A_PROBABILITY;
175     }
176     const PtNodeParams prevWordPtNodeParams =
177             mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
178     if (prevWordPtNodeParams.isDeleted()) {
179         return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
180     }
181     const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
182             prevWordPtNodeParams.getTerminalId());
183     BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
184     while (bigramsIt.hasNext()) {
185         bigramsIt.next();
186         if (bigramsIt.getBigramPos() == ptNodePos
187                 && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
188             const int bigramConditionalProbability = getBigramConditionalProbability(
189                     prevWordPtNodeParams.getProbability(),
190                     prevWordPtNodeParams.representsBeginningOfSentence(),
191                     bigramsIt.getProbability());
192             return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability);
193         }
194     }
195     return NOT_A_PROBABILITY;
196 }
197 
iterateNgramEntries(const WordIdArrayView prevWordIds,NgramListener * const listener) const198 void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
199         NgramListener *const listener) const {
200     if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) {
201         return;
202     }
203     const PtNodeParams prevWordPtNodeParams =
204             mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
205     if (prevWordPtNodeParams.isDeleted()) {
206         return;
207     }
208     const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
209             prevWordPtNodeParams.getTerminalId());
210     BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
211     while (bigramsIt.hasNext()) {
212         bigramsIt.next();
213         const int bigramConditionalProbability = getBigramConditionalProbability(
214                 prevWordPtNodeParams.getProbability(),
215                 prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability());
216         listener->onVisitEntry(bigramConditionalProbability,
217                 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
218     }
219 }
220 
getBigramConditionalProbability(const int prevWordUnigramProbability,const bool isInBeginningOfSentenceContext,const int bigramProbability) const221 int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability,
222         const bool isInBeginningOfSentenceContext, const int bigramProbability) const {
223     if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
224         if (isInBeginningOfSentenceContext) {
225             return bigramProbability;
226         }
227         // Calculate conditional probability.
228         return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability,
229                 MAX_PROBABILITY);
230     } else {
231         // bigramProbability is a conditional probability.
232         return bigramProbability;
233     }
234 }
235 
getShortcutIterator(const int wordId) const236 BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
237         const int wordId) const {
238     const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
239     return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
240 }
241 
getShortcutPositionOfPtNode(const int ptNodePos) const242 int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
243     if (ptNodePos == NOT_A_DICT_POS) {
244         return NOT_A_DICT_POS;
245     }
246     const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
247     if (ptNodeParams.isDeleted()) {
248         return NOT_A_DICT_POS;
249     }
250     return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
251             ptNodeParams.getTerminalId());
252 }
253 
getBigramsPositionOfPtNode(const int ptNodePos) const254 int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
255     if (ptNodePos == NOT_A_DICT_POS) {
256         return NOT_A_DICT_POS;
257     }
258     const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
259     if (ptNodeParams.isDeleted()) {
260         return NOT_A_DICT_POS;
261     }
262     return mBuffers->getBigramDictContent()->getBigramListHeadPos(
263             ptNodeParams.getTerminalId());
264 }
265 
addUnigramEntry(const CodePointArrayView wordCodePoints,const UnigramProperty * const unigramProperty)266 bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
267         const UnigramProperty *const unigramProperty) {
268     if (!mBuffers->isUpdatable()) {
269         AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
270         return false;
271     }
272     if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
273         AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
274                 mDictBuffer->getTailPosition());
275         return false;
276     }
277     if (wordCodePoints.size() > MAX_WORD_LENGTH) {
278         AKLOGE("The word is too long to insert to the dictionary, length: %zd",
279                 wordCodePoints.size());
280         return false;
281     }
282     for (const auto &shortcut : unigramProperty->getShortcuts()) {
283         if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
284             AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
285                     shortcut.getTargetCodePoints()->size());
286             return false;
287         }
288     }
289     DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
290     readingHelper.initWithPtNodeArrayPos(getRootPosition());
291     bool addedNewUnigram = false;
292     int codePointsToAdd[MAX_WORD_LENGTH];
293     int codePointCountToAdd = wordCodePoints.size();
294     memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
295     if (unigramProperty->representsBeginningOfSentence()) {
296         codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
297                 codePointCountToAdd, MAX_WORD_LENGTH);
298     }
299     if (codePointCountToAdd <= 0) {
300         return false;
301     }
302     const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
303     if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
304             &addedNewUnigram)) {
305         if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
306             mEntryCounters.incrementNgramCount(NgramType::Unigram);
307         }
308         if (unigramProperty->getShortcuts().size() > 0) {
309             // Add shortcut target.
310             const int wordPos = getTerminalPtNodePosFromWordId(
311                     getWordId(codePointArrayView, false /* forceLowerCaseSearch */));
312             if (wordPos == NOT_A_DICT_POS) {
313                 AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
314                 return false;
315             }
316             for (const auto &shortcut : unigramProperty->getShortcuts()) {
317                 if (!mUpdatingHelper.addShortcutTarget(wordPos,
318                         CodePointArrayView(*shortcut.getTargetCodePoints()),
319                         shortcut.getProbability())) {
320                     AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
321                             "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
322                             shortcut.getProbability());
323                     return false;
324                 }
325             }
326         }
327         return true;
328     } else {
329         return false;
330     }
331 }
332 
removeUnigramEntry(const CodePointArrayView wordCodePoints)333 bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
334     if (!mBuffers->isUpdatable()) {
335         AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
336         return false;
337     }
338     const int ptNodePos = getTerminalPtNodePosFromWordId(
339             getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
340     if (ptNodePos == NOT_A_DICT_POS) {
341         return false;
342     }
343     const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
344     return mNodeWriter.suppressUnigramEntry(&ptNodeParams);
345 }
346 
addNgramEntry(const NgramProperty * const ngramProperty)347 bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
348     if (!mBuffers->isUpdatable()) {
349         AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
350         return false;
351     }
352     if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
353         AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
354                 mDictBuffer->getTailPosition());
355         return false;
356     }
357     const NgramContext *const ngramContext = ngramProperty->getNgramContext();
358     if (!ngramContext->isValid()) {
359         AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
360         return false;
361     }
362     if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
363         AKLOGE("The word is too long to insert the ngram to the dictionary. "
364                 "length: %zd", ngramProperty->getTargetCodePoints()->size());
365         return false;
366     }
367     WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
368     const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
369             false /* tryLowerCaseSearch */);
370     if (prevWordIds.empty()) {
371         return false;
372     }
373     if (prevWordIds[0] == NOT_A_WORD_ID) {
374         if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
375             const UnigramProperty beginningOfSentenceUnigramProperty(
376                     true /* representsBeginningOfSentence */, true /* isNotAWord */,
377                     false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
378             if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
379                     &beginningOfSentenceUnigramProperty)) {
380                 AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
381                 return false;
382             }
383             // Refresh word ids.
384             ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
385         } else {
386             return false;
387         }
388     }
389     const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
390             CodePointArrayView(*ngramProperty->getTargetCodePoints()),
391                     false /* forceLowerCaseSearch */));
392     if (wordPos == NOT_A_DICT_POS) {
393         return false;
394     }
395     bool addedNewBigram = false;
396     const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
397     if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
398             wordPos, ngramProperty, &addedNewBigram)) {
399         if (addedNewBigram) {
400             mEntryCounters.incrementNgramCount(NgramType::Bigram);
401         }
402         return true;
403     } else {
404         return false;
405     }
406 }
407 
removeNgramEntry(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints)408 bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
409         const CodePointArrayView wordCodePoints) {
410     if (!mBuffers->isUpdatable()) {
411         AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
412         return false;
413     }
414     if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
415         AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
416                 mDictBuffer->getTailPosition());
417         return false;
418     }
419     if (!ngramContext->isValid()) {
420         AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
421         return false;
422     }
423     if (wordCodePoints.size() > MAX_WORD_LENGTH) {
424         AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
425                 wordCodePoints.size());
426     }
427     WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
428     const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
429             false /* tryLowerCaseSerch */);
430     if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
431         return false;
432     }
433     const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints,
434             false /* forceLowerCaseSearch */));
435     if (wordPos == NOT_A_DICT_POS) {
436         return false;
437     }
438     const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
439     if (mUpdatingHelper.removeNgramEntry(
440             PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) {
441         mEntryCounters.decrementNgramCount(NgramType::Bigram);
442         return true;
443     } else {
444         return false;
445     }
446 }
447 
448 
updateEntriesForWordWithNgramContext(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints,const bool isValidWord,const HistoricalInfo historicalInfo)449 bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
450         const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
451         const bool isValidWord, const HistoricalInfo historicalInfo) {
452     if (!mBuffers->isUpdatable()) {
453         AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
454                 "dictionary.");
455         return false;
456     }
457     const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
458     const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
459             false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
460     if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
461         AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
462         return false;
463     }
464     const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
465             ? NOT_A_PROBABILITY : probability;
466     const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram,
467             historicalInfo);
468     if (!addNgramEntry(&ngramProperty)) {
469         AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
470         return false;
471     }
472     return true;
473 }
474 
flush(const char * const filePath)475 bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
476     if (!mBuffers->isUpdatable()) {
477         AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
478         return false;
479     }
480     if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
481         AKLOGE("Cannot flush the dictionary to file.");
482         mIsCorrupted = true;
483         return false;
484     }
485     return true;
486 }
487 
flushWithGC(const char * const filePath)488 bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
489     if (!mBuffers->isUpdatable()) {
490         AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
491         return false;
492     }
493     if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
494         AKLOGE("Cannot flush the dictionary to file with GC.");
495         mIsCorrupted = true;
496         return false;
497     }
498     return true;
499 }
500 
needsToRunGC(const bool mindsBlockByGC) const501 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
502     if (!mBuffers->isUpdatable()) {
503         AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
504         return false;
505     }
506     if (mBuffers->isNearSizeLimit()) {
507         // Additional buffer size is near the limit.
508         return true;
509     } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
510             > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
511         // Total extended region size of the trie exceeds the limit.
512         return true;
513     } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
514             && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
515         // Needs to reduce dictionary size.
516         return true;
517     } else if (mHeaderPolicy->isDecayingDict()) {
518         return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
519                 mHeaderPolicy);
520     }
521     return false;
522 }
523 
getProperty(const char * const query,const int queryLength,char * const outResult,const int maxResultLength)524 void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
525         char *const outResult, const int maxResultLength) {
526     const int compareLength = queryLength + 1 /* terminator */;
527     if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
528         snprintf(outResult, maxResultLength, "%d",
529                 mEntryCounters.getNgramCount(NgramType::Unigram));
530     } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
531         snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
532     } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
533         snprintf(outResult, maxResultLength, "%d",
534                 mHeaderPolicy->isDecayingDict() ?
535                         ForgettingCurveUtils::getEntryCountHardLimit(
536                                 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
537                                         NgramType::Unigram)) :
538                         static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
539     } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
540         snprintf(outResult, maxResultLength, "%d",
541                 mHeaderPolicy->isDecayingDict() ?
542                         ForgettingCurveUtils::getEntryCountHardLimit(
543                                 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
544                                         NgramType::Bigram)) :
545                         static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
546     }
547 }
548 
getWordProperty(const CodePointArrayView wordCodePoints) const549 const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
550         const CodePointArrayView wordCodePoints) const {
551     const int ptNodePos = getTerminalPtNodePosFromWordId(
552             getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
553     if (ptNodePos == NOT_A_DICT_POS) {
554         AKLOGE("getWordProperty is called for invalid word.");
555         return WordProperty();
556     }
557     const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
558     const ProbabilityEntry probabilityEntry =
559             mBuffers->getProbabilityDictContent()->getProbabilityEntry(
560                     ptNodeParams.getTerminalId());
561     const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
562     // Fetch bigram information.
563     std::vector<NgramProperty> ngrams;
564     const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
565     if (bigramListPos != NOT_A_DICT_POS) {
566         int bigramWord1CodePoints[MAX_WORD_LENGTH];
567         const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent();
568         const TerminalPositionLookupTable *const terminalPositionLookupTable =
569                 mBuffers->getTerminalPositionLookupTable();
570         bool hasNext = true;
571         int readingPos = bigramListPos;
572         while (hasNext) {
573             const BigramEntry bigramEntry =
574                     bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
575             hasNext = bigramEntry.hasNext();
576             const int word1TerminalId = bigramEntry.getTargetTerminalId();
577             const int word1TerminalPtNodePos =
578                     terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId);
579             if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
580                 continue;
581             }
582             const int codePointCount = getCodePointsAndReturnCodePointCount(
583                     getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
584                     bigramWord1CodePoints);
585             const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
586             const int rawBigramProbability = bigramEntry.hasHistoricalInfo()
587                     ? ForgettingCurveUtils::decodeProbability(
588                             bigramEntry.getHistoricalInfo(), mHeaderPolicy)
589                     : bigramEntry.getProbability();
590             const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(),
591                     ptNodeParams.representsBeginningOfSentence(), rawBigramProbability);
592             ngrams.emplace_back(
593                     NgramContext(wordCodePoints.data(), wordCodePoints.size(),
594                             ptNodeParams.representsBeginningOfSentence()),
595                     CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
596                     probability, *historicalInfo);
597         }
598     }
599     // Fetch shortcut information.
600     std::vector<UnigramProperty::ShortcutProperty> shortcuts;
601     int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
602     if (shortcutPos != NOT_A_DICT_POS) {
603         int shortcutTarget[MAX_WORD_LENGTH];
604         const ShortcutDictContent *const shortcutDictContent =
605                 mBuffers->getShortcutDictContent();
606         bool hasNext = true;
607         while (hasNext) {
608             int shortcutTargetLength = 0;
609             int shortcutProbability = NOT_A_PROBABILITY;
610             shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
611                     &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
612             shortcuts.emplace_back(
613                     CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
614                     shortcutProbability);
615         }
616     }
617     const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
618             ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
619             ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
620     return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
621 }
622 
getNextWordAndNextToken(const int token,int * const outCodePoints,int * const outCodePointCount)623 int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
624         int *const outCodePointCount) {
625     *outCodePointCount = 0;
626     if (token == 0) {
627         mTerminalPtNodePositionsForIteratingWords.clear();
628         DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
629                 &mTerminalPtNodePositionsForIteratingWords);
630         DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
631         readingHelper.initWithPtNodeArrayPos(getRootPosition());
632         readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
633     }
634     const int terminalPtNodePositionsVectorSize =
635             static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
636     if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
637         AKLOGE("Given token %d is invalid.", token);
638         return 0;
639     }
640     const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
641     *outCodePointCount = getCodePointsAndReturnCodePointCount(
642             getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
643     const int nextToken = token + 1;
644     if (nextToken >= terminalPtNodePositionsVectorSize) {
645         // All words have been iterated.
646         mTerminalPtNodePositionsForIteratingWords.clear();
647         return 0;
648     }
649     return nextToken;
650 }
651 
getWordIdFromTerminalPtNodePos(const int ptNodePos) const652 int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
653     return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
654 }
655 
getTerminalPtNodePosFromWordId(const int wordId) const656 int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
657     return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
658 }
659 
660 } // namespace v402
661 } // namespace backward
662 } // namespace latinime
663