1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
18
19 #include <array>
20 #include <vector>
21
22 #include "suggest/core/dicnode/dic_node.h"
23 #include "suggest/core/dicnode/dic_node_vector.h"
24 #include "dictionary/interface/ngram_listener.h"
25 #include "dictionary/property/ngram_context.h"
26 #include "dictionary/property/ngram_property.h"
27 #include "dictionary/property/unigram_property.h"
28 #include "dictionary/property/word_property.h"
29 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
30 #include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
31 #include "dictionary/utils/forgetting_curve_utils.h"
32 #include "dictionary/utils/multi_bigram_map.h"
33 #include "dictionary/utils/probability_utils.h"
34 #include "utils/ngram_utils.h"
35
36 namespace latinime {
37
// Query keys accepted by getProperty(). Note that there are corresponding definitions in Java
// side in BinaryDictionaryTests and BinaryDictionaryDecayingTests.
const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
// Dynamic update operations are refused once the dictionary tail position reaches
// MAX_DICTIONARY_SIZE minus this safety margin (see MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS).
const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
        Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
47
createAndGetAllChildDicNodes(const DicNode * const dicNode,DicNodeVector * const childDicNodes) const48 void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
49 DicNodeVector *const childDicNodes) const {
50 if (!dicNode->hasChildren()) {
51 return;
52 }
53 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
54 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
55 while (!readingHelper.isEnd()) {
56 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
57 if (!ptNodeParams.isValid()) {
58 break;
59 }
60 const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
61 const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID;
62 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
63 wordId, ptNodeParams.getCodePointArrayView());
64 readingHelper.readNextSiblingNode(ptNodeParams);
65 }
66 if (readingHelper.isError()) {
67 mIsCorrupted = true;
68 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
69 }
70 }
71
getCodePointsAndReturnCodePointCount(const int wordId,const int maxCodePointCount,int * const outCodePoints) const72 int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
73 const int maxCodePointCount, int *const outCodePoints) const {
74 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
75 const int ptNodePos =
76 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
77 readingHelper.initWithPtNodePos(ptNodePos);
78 const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount(
79 maxCodePointCount, outCodePoints);
80 if (readingHelper.isError()) {
81 mIsCorrupted = true;
82 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
83 }
84 return codePointCount;
85 }
86
getWordId(const CodePointArrayView wordCodePoints,const bool forceLowerCaseSearch) const87 int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
88 const bool forceLowerCaseSearch) const {
89 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
90 readingHelper.initWithPtNodeArrayPos(getRootPosition());
91 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
92 wordCodePoints.size(), forceLowerCaseSearch);
93 if (readingHelper.isError()) {
94 mIsCorrupted = true;
95 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
96 }
97 if (ptNodePos == NOT_A_DICT_POS) {
98 return NOT_A_WORD_ID;
99 }
100 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
101 if (ptNodeParams.isDeleted()) {
102 return NOT_A_WORD_ID;
103 }
104 return ptNodeParams.getTerminalId();
105 }
106
getWordAttributesInContext(const WordIdArrayView prevWordIds,const int wordId,MultiBigramMap * const multiBigramMap) const107 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
108 const WordIdArrayView prevWordIds, const int wordId,
109 MultiBigramMap *const multiBigramMap) const {
110 if (wordId == NOT_A_WORD_ID) {
111 return WordAttributes();
112 }
113 return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
114 false /* mustMatchAllPrevWords */, mHeaderPolicy);
115 }
116
getProbabilityOfWord(const WordIdArrayView prevWordIds,const int wordId) const117 int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
118 const int wordId) const {
119 if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) {
120 return NOT_A_PROBABILITY;
121 }
122 const WordAttributes wordAttributes =
123 mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId,
124 true /* mustMatchAllPrevWords */, mHeaderPolicy);
125 if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) {
126 return NOT_A_PROBABILITY;
127 }
128 return wordAttributes.getProbability();
129 }
130
getShortcutIterator(const int wordId) const131 BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
132 const int wordId) const {
133 const int shortcutPos = getShortcutPositionOfWord(wordId);
134 return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
135 }
136
iterateNgramEntries(const WordIdArrayView prevWordIds,NgramListener * const listener) const137 void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
138 NgramListener *const listener) const {
139 if (prevWordIds.empty()) {
140 return;
141 }
142 const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
143 for (size_t i = 1; i <= prevWordIds.size(); ++i) {
144 for (const auto& entry : languageModelDictContent->getProbabilityEntries(
145 prevWordIds.limit(i))) {
146 const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
147 if (!probabilityEntry.isValid()) {
148 continue;
149 }
150 int probability = NOT_A_PROBABILITY;
151 if (probabilityEntry.hasHistoricalInfo()) {
152 // TODO: Quit checking count here.
153 // If count <= 1, the word can be an invaild word. The actual probability should
154 // be checked using getWordAttributesInContext() in onVisitEntry().
155 probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ?
156 NOT_A_PROBABILITY : 0;
157 } else {
158 probability = probabilityEntry.getProbability();
159 }
160 listener->onVisitEntry(probability, entry.getWordId());
161 }
162 }
163 }
164
getShortcutPositionOfWord(const int wordId) const165 int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
166 if (wordId == NOT_A_WORD_ID) {
167 return NOT_A_DICT_POS;
168 }
169 const int ptNodePos =
170 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
171 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
172 if (ptNodeParams.isDeleted()) {
173 return NOT_A_DICT_POS;
174 }
175 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
176 ptNodeParams.getTerminalId());
177 }
178
addUnigramEntry(const CodePointArrayView wordCodePoints,const UnigramProperty * const unigramProperty)179 bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
180 const UnigramProperty *const unigramProperty) {
181 if (!mBuffers->isUpdatable()) {
182 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
183 return false;
184 }
185 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
186 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
187 mDictBuffer->getTailPosition());
188 return false;
189 }
190 if (wordCodePoints.size() > MAX_WORD_LENGTH) {
191 AKLOGE("The word is too long to insert to the dictionary, length: %zd",
192 wordCodePoints.size());
193 return false;
194 }
195 for (const auto &shortcut : unigramProperty->getShortcuts()) {
196 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
197 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
198 shortcut.getTargetCodePoints()->size());
199 return false;
200 }
201 }
202 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
203 readingHelper.initWithPtNodeArrayPos(getRootPosition());
204 bool addedNewUnigram = false;
205 int codePointsToAdd[MAX_WORD_LENGTH];
206 int codePointCountToAdd = wordCodePoints.size();
207 memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
208 if (unigramProperty->representsBeginningOfSentence()) {
209 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
210 codePointCountToAdd, MAX_WORD_LENGTH);
211 }
212 if (codePointCountToAdd <= 0) {
213 return false;
214 }
215 const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
216 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
217 &addedNewUnigram)) {
218 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
219 mEntryCounters.incrementNgramCount(NgramType::Unigram);
220 }
221 if (unigramProperty->getShortcuts().size() > 0) {
222 // Add shortcut target.
223 const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */);
224 if (wordId == NOT_A_WORD_ID) {
225 AKLOGE("Cannot find word id to add shortcut target.");
226 return false;
227 }
228 const int wordPos =
229 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
230 for (const auto &shortcut : unigramProperty->getShortcuts()) {
231 if (!mUpdatingHelper.addShortcutTarget(wordPos,
232 CodePointArrayView(*shortcut.getTargetCodePoints()),
233 shortcut.getProbability())) {
234 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
235 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
236 shortcut.getProbability());
237 return false;
238 }
239 }
240 }
241 return true;
242 } else {
243 return false;
244 }
245 }
246
removeUnigramEntry(const CodePointArrayView wordCodePoints)247 bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
248 if (!mBuffers->isUpdatable()) {
249 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
250 return false;
251 }
252 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
253 if (wordId == NOT_A_WORD_ID) {
254 return false;
255 }
256 const int ptNodePos =
257 mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
258 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
259 if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
260 AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
261 return false;
262 }
263 if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) {
264 return false;
265 }
266 if (!ptNodeParams.representsNonWordInfo()) {
267 mEntryCounters.decrementNgramCount(NgramType::Unigram);
268 }
269 return true;
270 }
271
addNgramEntry(const NgramProperty * const ngramProperty)272 bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
273 if (!mBuffers->isUpdatable()) {
274 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
275 return false;
276 }
277 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
278 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
279 mDictBuffer->getTailPosition());
280 return false;
281 }
282 const NgramContext *const ngramContext = ngramProperty->getNgramContext();
283 if (!ngramContext->isValid()) {
284 AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
285 return false;
286 }
287 if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
288 AKLOGE("The word is too long to insert the ngram to the dictionary. "
289 "length: %zd", ngramProperty->getTargetCodePoints()->size());
290 return false;
291 }
292 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
293 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
294 false /* tryLowerCaseSearch */);
295 if (prevWordIds.empty()) {
296 return false;
297 }
298 for (size_t i = 0; i < prevWordIds.size(); ++i) {
299 if (prevWordIds[i] != NOT_A_WORD_ID) {
300 continue;
301 }
302 if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) {
303 return false;
304 }
305 const UnigramProperty beginningOfSentenceUnigramProperty(
306 true /* representsBeginningOfSentence */, true /* isNotAWord */,
307 false /* isBlacklisted */, false /* isPossiblyOffensive */,
308 MAX_PROBABILITY /* probability */, HistoricalInfo());
309 if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
310 &beginningOfSentenceUnigramProperty)) {
311 AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
312 return false;
313 }
314 // Refresh word ids.
315 ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
316 }
317 const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()),
318 false /* forceLowerCaseSearch */);
319 if (wordId == NOT_A_WORD_ID) {
320 return false;
321 }
322 bool addedNewEntry = false;
323 if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) {
324 if (addedNewEntry) {
325 mEntryCounters.incrementNgramCount(
326 NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
327 }
328 return true;
329 } else {
330 return false;
331 }
332 }
333
removeNgramEntry(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints)334 bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
335 const CodePointArrayView wordCodePoints) {
336 if (!mBuffers->isUpdatable()) {
337 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
338 return false;
339 }
340 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
341 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
342 mDictBuffer->getTailPosition());
343 return false;
344 }
345 if (!ngramContext->isValid()) {
346 AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
347 return false;
348 }
349 if (wordCodePoints.size() > MAX_WORD_LENGTH) {
350 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
351 wordCodePoints.size());
352 }
353 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
354 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
355 false /* tryLowerCaseSerch */);
356 if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) {
357 return false;
358 }
359 const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
360 if (wordId == NOT_A_WORD_ID) {
361 return false;
362 }
363 if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) {
364 mEntryCounters.decrementNgramCount(
365 NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
366 return true;
367 } else {
368 return false;
369 }
370 }
371
// Updates the unigram entry for wordCodePoints and the n-gram entries implied by
// ngramContext (typically on user input). Missing unigram and beginning-of-sentence
// entries are created on demand. Returns false on any failed write.
bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
        const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
        const bool isValidWord, const HistoricalInfo historicalInfo) {
    if (!mBuffers->isUpdatable()) {
        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
                "dictionary.");
        return false;
    }
    // A word following a beginning-of-sentence context is not promoted to a valid
    // unigram by this call; only its n-gram entry is treated as valid below.
    const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
            false : isValidWord;
    int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        // The word is not in the dictionary; add a placeholder unigram first.
        const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
                false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
                NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
                0 /* count */));
        if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
            AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
            return false;
        }
        if (!isValidWord) {
            return true;
        }
        // Re-resolve the id of the freshly added word.
        wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
    }

    WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
    const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
            false /* tryLowerCaseSearch */);
    if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
        if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
            // NOTE(review): this UnigramProperty call passes one fewer bool than the
            // sibling call sites in this file (no explicit isBlacklisted argument) —
            // presumably a distinct overload; confirm against unigram_property.h.
            const UnigramProperty beginningOfSentenceUnigramProperty(
                    true /* representsBeginningOfSentence */,
                    true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
                    HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
            if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
                    &beginningOfSentenceUnigramProperty)) {
                AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
                return false;
            }
            // Refresh word ids.
            ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
        }
        // Update entries for beginning of sentence.
        if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
                prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isValid */, historicalInfo,
                mHeaderPolicy, &mEntryCounters)) {
            return false;
        }
    }
    if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
            wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
        return false;
    }
    return true;
}
429
flush(const char * const filePath)430 bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
431 if (!mBuffers->isUpdatable()) {
432 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
433 return false;
434 }
435 if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
436 AKLOGE("Cannot flush the dictionary to file.");
437 mIsCorrupted = true;
438 return false;
439 }
440 return true;
441 }
442
flushWithGC(const char * const filePath)443 bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
444 if (!mBuffers->isUpdatable()) {
445 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
446 return false;
447 }
448 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
449 AKLOGE("Cannot flush the dictionary to file with GC.");
450 mIsCorrupted = true;
451 return false;
452 }
453 return true;
454 }
455
needsToRunGC(const bool mindsBlockByGC) const456 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
457 if (!mBuffers->isUpdatable()) {
458 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
459 return false;
460 }
461 if (mBuffers->isNearSizeLimit()) {
462 // Additional buffer size is near the limit.
463 return true;
464 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
465 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
466 // Total extended region size of the trie exceeds the limit.
467 return true;
468 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
469 && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
470 // Needs to reduce dictionary size.
471 return true;
472 } else if (mHeaderPolicy->isDecayingDict()) {
473 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
474 mHeaderPolicy);
475 }
476 return false;
477 }
478
getProperty(const char * const query,const int queryLength,char * const outResult,const int maxResultLength)479 void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
480 char *const outResult, const int maxResultLength) {
481 const int compareLength = queryLength + 1 /* terminator */;
482 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
483 snprintf(outResult, maxResultLength, "%d",
484 mEntryCounters.getNgramCount(NgramType::Unigram));
485 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
486 snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
487 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
488 snprintf(outResult, maxResultLength, "%d",
489 mHeaderPolicy->isDecayingDict() ?
490 ForgettingCurveUtils::getEntryCountHardLimit(
491 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
492 NgramType::Unigram)) :
493 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
494 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
495 snprintf(outResult, maxResultLength, "%d",
496 mHeaderPolicy->isDecayingDict() ?
497 ForgettingCurveUtils::getEntryCountHardLimit(
498 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
499 NgramType::Bigram)) :
500 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
501 }
502 }
503
// Builds a complete WordProperty for the given word: its unigram attributes and
// historical info, every related n-gram entry, and its shortcut targets.
// Returns a default-constructed WordProperty when the word is not present.
const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
        const CodePointArrayView wordCodePoints) const {
    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
    if (wordId == NOT_A_WORD_ID) {
        AKLOGE("getWordProperty is called for invalid word.");
        return WordProperty();
    }
    const LanguageModelDictContent *const languageModelDictContent =
            mBuffers->getLanguageModelDictContent();
    // Fetch ngram information.
    std::vector<NgramProperty> ngrams;
    int ngramTargetCodePoints[MAX_WORD_LENGTH];
    int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
    int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
    for (const auto& entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
            mHeaderPolicy, wordId)) {
        const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
                MAX_WORD_LENGTH, ngramTargetCodePoints);
        const WordIdArrayView prevWordIds = entry.getPrevWordIds();
        // Resolve each previous word id back to code points for the NgramContext.
        for (size_t i = 0; i < prevWordIds.size(); ++i) {
            ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i],
                    MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
            ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry(
                    prevWordIds[i]).representsBeginningOfSentence();
            if (ngramPrevWordIsBeginningOfSentense[i]) {
                // BoS words carry an internal marker code point; strip it for output.
                ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
                        ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
            }
        }
        const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
                ngramPrevWordIsBeginningOfSentense, prevWordIds.size());
        const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
        const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
        // TODO: Output flags in WordAttributes.
        ngrams.emplace_back(ngramContext,
                CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
                entry.getWordAttributes().getProbability(), *historicalInfo);
    }
    // Fetch shortcut information.
    std::vector<UnigramProperty::ShortcutProperty> shortcuts;
    int shortcutPos = getShortcutPositionOfWord(wordId);
    if (shortcutPos != NOT_A_DICT_POS) {
        int shortcutTarget[MAX_WORD_LENGTH];
        const ShortcutDictContent *const shortcutDictContent =
                mBuffers->getShortcutDictContent();
        bool hasNext = true;
        // Walk the shortcut list; each read advances shortcutPos and updates hasNext.
        while (hasNext) {
            int shortcutTargetLength = 0;
            int shortcutProbability = NOT_A_PROBABILITY;
            shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
                    &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
            shortcuts.emplace_back(
                    CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
                    shortcutProbability);
        }
    }
    // Unigram attributes are fetched with an empty context (no previous words).
    const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
            WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
    const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
    const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
    const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
            wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
            wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
            *historicalInfo, std::move(shortcuts));
    return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
}
571
getNextWordAndNextToken(const int token,int * const outCodePoints,int * const outCodePointCount)572 int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
573 int *const outCodePointCount) {
574 *outCodePointCount = 0;
575 if (token == 0) {
576 mTerminalPtNodePositionsForIteratingWords.clear();
577 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
578 &mTerminalPtNodePositionsForIteratingWords);
579 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
580 readingHelper.initWithPtNodeArrayPos(getRootPosition());
581 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
582 }
583 const int terminalPtNodePositionsVectorSize =
584 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
585 if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
586 AKLOGE("Given token %d is invalid.", token);
587 return 0;
588 }
589 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
590 const PtNodeParams ptNodeParams =
591 mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
592 *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(),
593 MAX_WORD_LENGTH, outCodePoints);
594 const int nextToken = token + 1;
595 if (nextToken >= terminalPtNodePositionsVectorSize) {
596 // All words have been iterated.
597 mTerminalPtNodePositionsForIteratingWords.clear();
598 return 0;
599 }
600 return nextToken;
601 }
602
603 } // namespace latinime
604