1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /*
18 * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!!
19 * Do not edit this file other than updating policy's interface.
20 *
21 * This file was generated from
22 * dictionary/structure/v4/ver4_patricia_trie_policy.cpp
23 */
24
25 #include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
26
27 #include <vector>
28
29 #include "suggest/core/dicnode/dic_node.h"
30 #include "suggest/core/dicnode/dic_node_vector.h"
31 #include "dictionary/interface/ngram_listener.h"
32 #include "dictionary/property/ngram_context.h"
33 #include "dictionary/property/ngram_property.h"
34 #include "dictionary/property/unigram_property.h"
35 #include "dictionary/property/word_property.h"
36 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
37 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
38 #include "dictionary/utils/forgetting_curve_utils.h"
39 #include "dictionary/utils/multi_bigram_map.h"
40 #include "dictionary/utils/probability_utils.h"
41
42 namespace latinime {
43 namespace backward {
44 namespace v402 {
45
46 // Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
47 // BinaryDictionaryDecayingTests.
48 const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
49 const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
50 const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
51 const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
52 const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
53 const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
54 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
55 const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
56
createAndGetAllChildDicNodes(const DicNode * const dicNode,DicNodeVector * const childDicNodes) const57 void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
58 DicNodeVector *const childDicNodes) const {
59 if (!dicNode->hasChildren()) {
60 return;
61 }
62 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
63 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
64 while (!readingHelper.isEnd()) {
65 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
66 if (!ptNodeParams.isValid()) {
67 break;
68 }
69 bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
70 if (isTerminal && mHeaderPolicy->isDecayingDict()) {
71 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
72 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
73 // valid terminal DicNode.
74 isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
75 }
76 readingHelper.readNextSiblingNode(ptNodeParams);
77 if (ptNodeParams.representsNonWordInfo()) {
78 // Skip PtNodes that represent non-word information.
79 continue;
80 }
81 const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
82 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
83 wordId, ptNodeParams.getCodePointArrayView());
84 }
85 if (readingHelper.isError()) {
86 mIsCorrupted = true;
87 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
88 }
89 }
90
getCodePointsAndReturnCodePointCount(const int wordId,const int maxCodePointCount,int * const outCodePoints) const91 int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
92 const int maxCodePointCount, int *const outCodePoints) const {
93 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
94 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
95 readingHelper.initWithPtNodePos(ptNodePos);
96 const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount(
97 maxCodePointCount, outCodePoints);
98 if (readingHelper.isError()) {
99 mIsCorrupted = true;
100 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
101 }
102 return codePointCount;
103 }
104
getWordId(const CodePointArrayView wordCodePoints,const bool forceLowerCaseSearch) const105 int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
106 const bool forceLowerCaseSearch) const {
107 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
108 readingHelper.initWithPtNodeArrayPos(getRootPosition());
109 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
110 wordCodePoints.size(), forceLowerCaseSearch);
111 if (readingHelper.isError()) {
112 mIsCorrupted = true;
113 AKLOGE("Dictionary reading error in getWordId().");
114 }
115 return getWordIdFromTerminalPtNodePos(ptNodePos);
116 }
117
getWordAttributesInContext(const WordIdArrayView prevWordIds,const int wordId,MultiBigramMap * const multiBigramMap) const118 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
119 const WordIdArrayView prevWordIds, const int wordId,
120 MultiBigramMap *const multiBigramMap) const {
121 if (wordId == NOT_A_WORD_ID) {
122 return WordAttributes();
123 }
124 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
125 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
126 if (multiBigramMap) {
127 const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
128 prevWordIds, wordId, ptNodeParams.getProbability());
129 return getWordAttributes(probability, ptNodeParams);
130 }
131 if (!prevWordIds.empty()) {
132 const int probability = getProbabilityOfWord(prevWordIds, wordId);
133 if (probability != NOT_A_PROBABILITY) {
134 return getWordAttributes(probability, ptNodeParams);
135 }
136 }
137 return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
138 ptNodeParams);
139 }
140
getWordAttributes(const int probability,const PtNodeParams & ptNodeParams) const141 const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability,
142 const PtNodeParams &ptNodeParams) const {
143 return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
144 ptNodeParams.getProbability() == 0);
145 }
146
getProbability(const int unigramProbability,const int bigramProbability) const147 int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
148 const int bigramProbability) const {
149 // In the v4 format, bigramProbability is a conditional probability.
150 const int bigramConditionalProbability = bigramProbability;
151 if (unigramProbability == NOT_A_PROBABILITY) {
152 return NOT_A_PROBABILITY;
153 }
154 if (bigramConditionalProbability == NOT_A_PROBABILITY) {
155 return ProbabilityUtils::backoff(unigramProbability);
156 }
157 return bigramConditionalProbability;
158 }
159
getProbabilityOfWord(const WordIdArrayView prevWordIds,const int wordId) const160 int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
161 const int wordId) const {
162 if (wordId == NOT_A_WORD_ID) {
163 return NOT_A_PROBABILITY;
164 }
165 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
166 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
167 if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) {
168 return NOT_A_PROBABILITY;
169 }
170 if (prevWordIds.empty()) {
171 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
172 }
173 if (prevWordIds[0] == NOT_A_WORD_ID) {
174 return NOT_A_PROBABILITY;
175 }
176 const PtNodeParams prevWordPtNodeParams =
177 mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
178 if (prevWordPtNodeParams.isDeleted()) {
179 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
180 }
181 const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
182 prevWordPtNodeParams.getTerminalId());
183 BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
184 while (bigramsIt.hasNext()) {
185 bigramsIt.next();
186 if (bigramsIt.getBigramPos() == ptNodePos
187 && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
188 const int bigramConditionalProbability = getBigramConditionalProbability(
189 prevWordPtNodeParams.getProbability(),
190 prevWordPtNodeParams.representsBeginningOfSentence(),
191 bigramsIt.getProbability());
192 return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability);
193 }
194 }
195 return NOT_A_PROBABILITY;
196 }
197
iterateNgramEntries(const WordIdArrayView prevWordIds,NgramListener * const listener) const198 void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
199 NgramListener *const listener) const {
200 if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) {
201 return;
202 }
203 const PtNodeParams prevWordPtNodeParams =
204 mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]);
205 if (prevWordPtNodeParams.isDeleted()) {
206 return;
207 }
208 const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
209 prevWordPtNodeParams.getTerminalId());
210 BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
211 while (bigramsIt.hasNext()) {
212 bigramsIt.next();
213 const int bigramConditionalProbability = getBigramConditionalProbability(
214 prevWordPtNodeParams.getProbability(),
215 prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability());
216 listener->onVisitEntry(bigramConditionalProbability,
217 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
218 }
219 }
220
getBigramConditionalProbability(const int prevWordUnigramProbability,const bool isInBeginningOfSentenceContext,const int bigramProbability) const221 int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability,
222 const bool isInBeginningOfSentenceContext, const int bigramProbability) const {
223 if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
224 if (isInBeginningOfSentenceContext) {
225 return bigramProbability;
226 }
227 // Calculate conditional probability.
228 return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability,
229 MAX_PROBABILITY);
230 } else {
231 // bigramProbability is a conditional probability.
232 return bigramProbability;
233 }
234 }
235
getShortcutIterator(const int wordId) const236 BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
237 const int wordId) const {
238 const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
239 return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos);
240 }
241
getShortcutPositionOfPtNode(const int ptNodePos) const242 int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
243 if (ptNodePos == NOT_A_DICT_POS) {
244 return NOT_A_DICT_POS;
245 }
246 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
247 if (ptNodeParams.isDeleted()) {
248 return NOT_A_DICT_POS;
249 }
250 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
251 ptNodeParams.getTerminalId());
252 }
253
getBigramsPositionOfPtNode(const int ptNodePos) const254 int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
255 if (ptNodePos == NOT_A_DICT_POS) {
256 return NOT_A_DICT_POS;
257 }
258 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
259 if (ptNodeParams.isDeleted()) {
260 return NOT_A_DICT_POS;
261 }
262 return mBuffers->getBigramDictContent()->getBigramListHeadPos(
263 ptNodeParams.getTerminalId());
264 }
265
addUnigramEntry(const CodePointArrayView wordCodePoints,const UnigramProperty * const unigramProperty)266 bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
267 const UnigramProperty *const unigramProperty) {
268 if (!mBuffers->isUpdatable()) {
269 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
270 return false;
271 }
272 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
273 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
274 mDictBuffer->getTailPosition());
275 return false;
276 }
277 if (wordCodePoints.size() > MAX_WORD_LENGTH) {
278 AKLOGE("The word is too long to insert to the dictionary, length: %zd",
279 wordCodePoints.size());
280 return false;
281 }
282 for (const auto &shortcut : unigramProperty->getShortcuts()) {
283 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
284 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
285 shortcut.getTargetCodePoints()->size());
286 return false;
287 }
288 }
289 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
290 readingHelper.initWithPtNodeArrayPos(getRootPosition());
291 bool addedNewUnigram = false;
292 int codePointsToAdd[MAX_WORD_LENGTH];
293 int codePointCountToAdd = wordCodePoints.size();
294 memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
295 if (unigramProperty->representsBeginningOfSentence()) {
296 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
297 codePointCountToAdd, MAX_WORD_LENGTH);
298 }
299 if (codePointCountToAdd <= 0) {
300 return false;
301 }
302 const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
303 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
304 &addedNewUnigram)) {
305 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
306 mEntryCounters.incrementNgramCount(NgramType::Unigram);
307 }
308 if (unigramProperty->getShortcuts().size() > 0) {
309 // Add shortcut target.
310 const int wordPos = getTerminalPtNodePosFromWordId(
311 getWordId(codePointArrayView, false /* forceLowerCaseSearch */));
312 if (wordPos == NOT_A_DICT_POS) {
313 AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
314 return false;
315 }
316 for (const auto &shortcut : unigramProperty->getShortcuts()) {
317 if (!mUpdatingHelper.addShortcutTarget(wordPos,
318 CodePointArrayView(*shortcut.getTargetCodePoints()),
319 shortcut.getProbability())) {
320 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
321 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
322 shortcut.getProbability());
323 return false;
324 }
325 }
326 }
327 return true;
328 } else {
329 return false;
330 }
331 }
332
removeUnigramEntry(const CodePointArrayView wordCodePoints)333 bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
334 if (!mBuffers->isUpdatable()) {
335 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
336 return false;
337 }
338 const int ptNodePos = getTerminalPtNodePosFromWordId(
339 getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
340 if (ptNodePos == NOT_A_DICT_POS) {
341 return false;
342 }
343 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
344 return mNodeWriter.suppressUnigramEntry(&ptNodeParams);
345 }
346
addNgramEntry(const NgramProperty * const ngramProperty)347 bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
348 if (!mBuffers->isUpdatable()) {
349 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
350 return false;
351 }
352 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
353 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
354 mDictBuffer->getTailPosition());
355 return false;
356 }
357 const NgramContext *const ngramContext = ngramProperty->getNgramContext();
358 if (!ngramContext->isValid()) {
359 AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
360 return false;
361 }
362 if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
363 AKLOGE("The word is too long to insert the ngram to the dictionary. "
364 "length: %zd", ngramProperty->getTargetCodePoints()->size());
365 return false;
366 }
367 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
368 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
369 false /* tryLowerCaseSearch */);
370 if (prevWordIds.empty()) {
371 return false;
372 }
373 if (prevWordIds[0] == NOT_A_WORD_ID) {
374 if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
375 const UnigramProperty beginningOfSentenceUnigramProperty(
376 true /* representsBeginningOfSentence */, true /* isNotAWord */,
377 false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
378 if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
379 &beginningOfSentenceUnigramProperty)) {
380 AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
381 return false;
382 }
383 // Refresh word ids.
384 ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
385 } else {
386 return false;
387 }
388 }
389 const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
390 CodePointArrayView(*ngramProperty->getTargetCodePoints()),
391 false /* forceLowerCaseSearch */));
392 if (wordPos == NOT_A_DICT_POS) {
393 return false;
394 }
395 bool addedNewBigram = false;
396 const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
397 if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
398 wordPos, ngramProperty, &addedNewBigram)) {
399 if (addedNewBigram) {
400 mEntryCounters.incrementNgramCount(NgramType::Bigram);
401 }
402 return true;
403 } else {
404 return false;
405 }
406 }
407
removeNgramEntry(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints)408 bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
409 const CodePointArrayView wordCodePoints) {
410 if (!mBuffers->isUpdatable()) {
411 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
412 return false;
413 }
414 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
415 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
416 mDictBuffer->getTailPosition());
417 return false;
418 }
419 if (!ngramContext->isValid()) {
420 AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
421 return false;
422 }
423 if (wordCodePoints.size() > MAX_WORD_LENGTH) {
424 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
425 wordCodePoints.size());
426 }
427 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
428 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
429 false /* tryLowerCaseSerch */);
430 if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
431 return false;
432 }
433 const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints,
434 false /* forceLowerCaseSearch */));
435 if (wordPos == NOT_A_DICT_POS) {
436 return false;
437 }
438 const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
439 if (mUpdatingHelper.removeNgramEntry(
440 PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) {
441 mEntryCounters.decrementNgramCount(NgramType::Bigram);
442 return true;
443 } else {
444 return false;
445 }
446 }
447
448
updateEntriesForWordWithNgramContext(const NgramContext * const ngramContext,const CodePointArrayView wordCodePoints,const bool isValidWord,const HistoricalInfo historicalInfo)449 bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
450 const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
451 const bool isValidWord, const HistoricalInfo historicalInfo) {
452 if (!mBuffers->isUpdatable()) {
453 AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
454 "dictionary.");
455 return false;
456 }
457 const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
458 const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
459 false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
460 if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
461 AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
462 return false;
463 }
464 const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
465 ? NOT_A_PROBABILITY : probability;
466 const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram,
467 historicalInfo);
468 if (!addNgramEntry(&ngramProperty)) {
469 AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext().");
470 return false;
471 }
472 return true;
473 }
474
flush(const char * const filePath)475 bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
476 if (!mBuffers->isUpdatable()) {
477 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
478 return false;
479 }
480 if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
481 AKLOGE("Cannot flush the dictionary to file.");
482 mIsCorrupted = true;
483 return false;
484 }
485 return true;
486 }
487
flushWithGC(const char * const filePath)488 bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
489 if (!mBuffers->isUpdatable()) {
490 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
491 return false;
492 }
493 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
494 AKLOGE("Cannot flush the dictionary to file with GC.");
495 mIsCorrupted = true;
496 return false;
497 }
498 return true;
499 }
500
needsToRunGC(const bool mindsBlockByGC) const501 bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
502 if (!mBuffers->isUpdatable()) {
503 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
504 return false;
505 }
506 if (mBuffers->isNearSizeLimit()) {
507 // Additional buffer size is near the limit.
508 return true;
509 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
510 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
511 // Total extended region size of the trie exceeds the limit.
512 return true;
513 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
514 && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
515 // Needs to reduce dictionary size.
516 return true;
517 } else if (mHeaderPolicy->isDecayingDict()) {
518 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
519 mHeaderPolicy);
520 }
521 return false;
522 }
523
getProperty(const char * const query,const int queryLength,char * const outResult,const int maxResultLength)524 void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
525 char *const outResult, const int maxResultLength) {
526 const int compareLength = queryLength + 1 /* terminator */;
527 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
528 snprintf(outResult, maxResultLength, "%d",
529 mEntryCounters.getNgramCount(NgramType::Unigram));
530 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
531 snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
532 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
533 snprintf(outResult, maxResultLength, "%d",
534 mHeaderPolicy->isDecayingDict() ?
535 ForgettingCurveUtils::getEntryCountHardLimit(
536 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
537 NgramType::Unigram)) :
538 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
539 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
540 snprintf(outResult, maxResultLength, "%d",
541 mHeaderPolicy->isDecayingDict() ?
542 ForgettingCurveUtils::getEntryCountHardLimit(
543 mHeaderPolicy->getMaxNgramCounts().getNgramCount(
544 NgramType::Bigram)) :
545 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
546 }
547 }
548
getWordProperty(const CodePointArrayView wordCodePoints) const549 const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
550 const CodePointArrayView wordCodePoints) const {
551 const int ptNodePos = getTerminalPtNodePosFromWordId(
552 getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
553 if (ptNodePos == NOT_A_DICT_POS) {
554 AKLOGE("getWordProperty is called for invalid word.");
555 return WordProperty();
556 }
557 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
558 const ProbabilityEntry probabilityEntry =
559 mBuffers->getProbabilityDictContent()->getProbabilityEntry(
560 ptNodeParams.getTerminalId());
561 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
562 // Fetch bigram information.
563 std::vector<NgramProperty> ngrams;
564 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
565 if (bigramListPos != NOT_A_DICT_POS) {
566 int bigramWord1CodePoints[MAX_WORD_LENGTH];
567 const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent();
568 const TerminalPositionLookupTable *const terminalPositionLookupTable =
569 mBuffers->getTerminalPositionLookupTable();
570 bool hasNext = true;
571 int readingPos = bigramListPos;
572 while (hasNext) {
573 const BigramEntry bigramEntry =
574 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
575 hasNext = bigramEntry.hasNext();
576 const int word1TerminalId = bigramEntry.getTargetTerminalId();
577 const int word1TerminalPtNodePos =
578 terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId);
579 if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
580 continue;
581 }
582 const int codePointCount = getCodePointsAndReturnCodePointCount(
583 getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
584 bigramWord1CodePoints);
585 const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
586 const int rawBigramProbability = bigramEntry.hasHistoricalInfo()
587 ? ForgettingCurveUtils::decodeProbability(
588 bigramEntry.getHistoricalInfo(), mHeaderPolicy)
589 : bigramEntry.getProbability();
590 const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(),
591 ptNodeParams.representsBeginningOfSentence(), rawBigramProbability);
592 ngrams.emplace_back(
593 NgramContext(wordCodePoints.data(), wordCodePoints.size(),
594 ptNodeParams.representsBeginningOfSentence()),
595 CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
596 probability, *historicalInfo);
597 }
598 }
599 // Fetch shortcut information.
600 std::vector<UnigramProperty::ShortcutProperty> shortcuts;
601 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
602 if (shortcutPos != NOT_A_DICT_POS) {
603 int shortcutTarget[MAX_WORD_LENGTH];
604 const ShortcutDictContent *const shortcutDictContent =
605 mBuffers->getShortcutDictContent();
606 bool hasNext = true;
607 while (hasNext) {
608 int shortcutTargetLength = 0;
609 int shortcutProbability = NOT_A_PROBABILITY;
610 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
611 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
612 shortcuts.emplace_back(
613 CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
614 shortcutProbability);
615 }
616 }
617 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
618 ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
619 ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
620 return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
621 }
622
getNextWordAndNextToken(const int token,int * const outCodePoints,int * const outCodePointCount)623 int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
624 int *const outCodePointCount) {
625 *outCodePointCount = 0;
626 if (token == 0) {
627 mTerminalPtNodePositionsForIteratingWords.clear();
628 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
629 &mTerminalPtNodePositionsForIteratingWords);
630 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
631 readingHelper.initWithPtNodeArrayPos(getRootPosition());
632 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
633 }
634 const int terminalPtNodePositionsVectorSize =
635 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
636 if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
637 AKLOGE("Given token %d is invalid.", token);
638 return 0;
639 }
640 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
641 *outCodePointCount = getCodePointsAndReturnCodePointCount(
642 getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
643 const int nextToken = token + 1;
644 if (nextToken >= terminalPtNodePositionsVectorSize) {
645 // All words have been iterated.
646 mTerminalPtNodePositionsForIteratingWords.clear();
647 return 0;
648 }
649 return nextToken;
650 }
651
getWordIdFromTerminalPtNodePos(const int ptNodePos) const652 int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
653 return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
654 }
655
getTerminalPtNodePosFromWordId(const int wordId) const656 int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
657 return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
658 }
659
660 } // namespace v402
661 } // namespace backward
662 } // namespace latinime
663