1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
18 
19 #include "dictionary/header/header_policy.h"
20 #include "dictionary/property/unigram_property.h"
21 #include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
22 #include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
23 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
24 #include "dictionary/structure/v4/content/probability_entry.h"
25 #include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
26 #include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
27 #include "dictionary/structure/v4/ver4_dict_buffers.h"
28 #include "dictionary/utils/buffer_with_extendable_buffer.h"
29 #include "dictionary/utils/forgetting_curve_utils.h"
30 
31 namespace latinime {
32 
33 const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3;
34 
markPtNodeAsDeleted(const PtNodeParams * const toBeUpdatedPtNodeParams)35 bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
36         const PtNodeParams *const toBeUpdatedPtNodeParams) {
37     int pos = toBeUpdatedPtNodeParams->getHeadPos();
38     const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
39     const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
40     if (usesAdditionalBuffer) {
41         pos -= mTrieBuffer->getOriginalBufferSize();
42     }
43     // Read original flags
44     const PatriciaTrieReadingUtils::NodeFlags originalFlags =
45             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
46     const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
47             DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
48                     true /* isDeleted */, false /* willBecomeNonTerminal */);
49     int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
50     // Update flags.
51     if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
52             &writingPos)) {
53         return false;
54     }
55     if (toBeUpdatedPtNodeParams->isTerminal()) {
56         // The PtNode is a terminal. Delete entry from the terminal position lookup table.
57         return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
58                 toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */);
59     } else {
60         return true;
61     }
62 }
63 
64 // TODO: Quit using bigramLinkedNodePos.
markPtNodeAsMoved(const PtNodeParams * const toBeUpdatedPtNodeParams,const int movedPos,const int bigramLinkedNodePos)65 bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
66         const PtNodeParams *const toBeUpdatedPtNodeParams,
67         const int movedPos, const int bigramLinkedNodePos) {
68     int pos = toBeUpdatedPtNodeParams->getHeadPos();
69     const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
70     const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
71     if (usesAdditionalBuffer) {
72         pos -= mTrieBuffer->getOriginalBufferSize();
73     }
74     // Read original flags
75     const PatriciaTrieReadingUtils::NodeFlags originalFlags =
76             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
77     const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
78             DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */,
79                     false /* isDeleted */, false /* willBecomeNonTerminal */);
80     int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
81     // Update flags.
82     if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
83             &writingPos)) {
84         return false;
85     }
86     // Update moved position, which is stored in the parent offset field.
87     if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
88             mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
89         return false;
90     }
91     if (toBeUpdatedPtNodeParams->hasChildren()) {
92         // Update children's parent position.
93         mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos());
94         while (!mReadingHelper.isEnd()) {
95             const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams());
96             int parentOffsetFieldPos = childPtNodeParams.getHeadPos()
97                     + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
98             if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(
99                     mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(),
100                     &parentOffsetFieldPos)) {
101                 // Parent offset cannot be written because of a bug or a broken dictionary; thus,
102                 // we give up to update dictionary.
103                 return false;
104             }
105             mReadingHelper.readNextSiblingNode(childPtNodeParams);
106         }
107     }
108     return true;
109 }
110 
markPtNodeAsWillBecomeNonTerminal(const PtNodeParams * const toBeUpdatedPtNodeParams)111 bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal(
112         const PtNodeParams *const toBeUpdatedPtNodeParams) {
113     int pos = toBeUpdatedPtNodeParams->getHeadPos();
114     const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos);
115     const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer);
116     if (usesAdditionalBuffer) {
117         pos -= mTrieBuffer->getOriginalBufferSize();
118     }
119     // Read original flags
120     const PatriciaTrieReadingUtils::NodeFlags originalFlags =
121             PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos);
122     const PatriciaTrieReadingUtils::NodeFlags updatedFlags =
123             DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */,
124                     false /* isDeleted */, true /* willBecomeNonTerminal */);
125     if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
126             toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) {
127         AKLOGE("Cannot update terminal position lookup table. terminal id: %d",
128                 toBeUpdatedPtNodeParams->getTerminalId());
129         return false;
130     }
131     // Update flags.
132     int writingPos = toBeUpdatedPtNodeParams->getHeadPos();
133     return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags,
134             &writingPos);
135 }
136 
updatePtNodeUnigramProperty(const PtNodeParams * const toBeUpdatedPtNodeParams,const UnigramProperty * const unigramProperty)137 bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
138         const PtNodeParams *const toBeUpdatedPtNodeParams,
139         const UnigramProperty *const unigramProperty) {
140     // Update probability and historical information.
141     // TODO: Update other information in the unigram property.
142     if (!toBeUpdatedPtNodeParams->isTerminal()) {
143         return false;
144     }
145     const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
146     return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
147             toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty);
148 }
149 
updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(const PtNodeParams * const toBeUpdatedPtNodeParams,bool * const outNeedsToKeepPtNode)150 bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
151         const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) {
152     if (!toBeUpdatedPtNodeParams->isTerminal()) {
153         AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode.");
154         return false;
155     }
156     const ProbabilityEntry originalProbabilityEntry =
157             mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
158                     toBeUpdatedPtNodeParams->getTerminalId());
159     if (originalProbabilityEntry.isValid()) {
160         *outNeedsToKeepPtNode = true;
161         return true;
162     }
163     if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
164         AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
165         return false;
166     }
167     *outNeedsToKeepPtNode = false;
168     return true;
169 }
170 
updateChildrenPosition(const PtNodeParams * const toBeUpdatedPtNodeParams,const int newChildrenPosition)171 bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition(
172         const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) {
173     int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos();
174     return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
175             newChildrenPosition, &childrenPosFieldPos);
176 }
177 
updateTerminalId(const PtNodeParams * const toBeUpdatedPtNodeParams,const int newTerminalId)178 bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams,
179         const int newTerminalId) {
180     return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE,
181             toBeUpdatedPtNodeParams->getTerminalIdFieldPos());
182 }
183 
writePtNodeAndAdvancePosition(const PtNodeParams * const ptNodeParams,int * const ptNodeWritingPos)184 bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
185         const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) {
186     return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */,
187             ptNodeWritingPos);
188 }
189 
writeNewTerminalPtNodeAndAdvancePosition(const PtNodeParams * const ptNodeParams,const UnigramProperty * const unigramProperty,int * const ptNodeWritingPos)190 bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
191         const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
192         int *const ptNodeWritingPos) {
193     int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
194     if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId,
195             ptNodeWritingPos)) {
196         return false;
197     }
198     // Write probability.
199     ProbabilityEntry newProbabilityEntry;
200     const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
201     return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
202             terminalId, &probabilityEntryOfUnigramProperty);
203 }
204 
205 // TODO: Support counting ngram entries.
addNgramEntry(const WordIdArrayView prevWordIds,const int wordId,const NgramProperty * const ngramProperty,bool * const outAddedNewBigram)206 bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
207         const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) {
208     LanguageModelDictContent *const languageModelDictContent =
209             mBuffers->getMutableLanguageModelDictContent();
210     const ProbabilityEntry probabilityEntry =
211             languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId);
212     const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty);
213     if (!languageModelDictContent->setNgramProbabilityEntry(
214             prevWordIds, wordId, &probabilityEntryOfNgramProperty)) {
215         AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d",
216                 prevWordIds[0], prevWordIds.size(), wordId);
217         return false;
218     }
219     if (!probabilityEntry.isValid() && outAddedNewBigram) {
220         *outAddedNewBigram = true;
221     }
222     return true;
223 }
224 
removeNgramEntry(const WordIdArrayView prevWordIds,const int wordId)225 bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
226         const int wordId) {
227     LanguageModelDictContent *const languageModelDictContent =
228             mBuffers->getMutableLanguageModelDictContent();
229     return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId);
230 }
231 
232 // TODO: Remove when we stop supporting v402 format.
updateAllBigramEntriesAndDeleteUselessEntries(const PtNodeParams * const sourcePtNodeParams,int * const outBigramEntryCount)233 bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
234             const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
235     // Do nothing.
236     return true;
237 }
238 
updateAllPositionFields(const PtNodeParams * const toBeUpdatedPtNodeParams,const DictPositionRelocationMap * const dictPositionRelocationMap,int * const outBigramEntryCount)239 bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
240         const PtNodeParams *const toBeUpdatedPtNodeParams,
241         const DictPositionRelocationMap *const dictPositionRelocationMap,
242         int *const outBigramEntryCount) {
243     int parentPos = toBeUpdatedPtNodeParams->getParentPos();
244     if (parentPos != NOT_A_DICT_POS) {
245         PtNodeWriter::PtNodePositionRelocationMap::const_iterator it =
246                 dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos);
247         if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) {
248             parentPos = it->second;
249         }
250     }
251     int writingPos = toBeUpdatedPtNodeParams->getHeadPos()
252             + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE;
253     // Write updated parent offset.
254     if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
255             parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) {
256         return false;
257     }
258 
259     // Updates children position.
260     int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos();
261     if (childrenPos != NOT_A_DICT_POS) {
262         PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it =
263                 dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos);
264         if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) {
265             childrenPos = it->second;
266         }
267     }
268     if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
269         return false;
270     }
271     return true;
272 }
273 
addShortcutTarget(const PtNodeParams * const ptNodeParams,const int * const targetCodePoints,const int targetCodePointCount,const int shortcutProbability)274 bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams,
275         const int *const targetCodePoints, const int targetCodePointCount,
276         const int shortcutProbability) {
277     if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
278             targetCodePoints, targetCodePointCount, shortcutProbability)) {
279         AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
280         return false;
281     }
282     return true;
283 }
284 
writePtNodeAndGetTerminalIdAndAdvancePosition(const PtNodeParams * const ptNodeParams,int * const outTerminalId,int * const ptNodeWritingPos)285 bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
286         const PtNodeParams *const ptNodeParams, int *const outTerminalId,
287         int *const ptNodeWritingPos) {
288     const int nodePos = *ptNodeWritingPos;
289     // Write placeholder flags. The Node flags are updated with appropriate flags at the last step of the
290     // PtNode writing.
291     if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer,
292             0 /* nodeFlags */, ptNodeWritingPos)) {
293         return false;
294     }
295     // Calculate a parent offset and write the offset.
296     if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer,
297             ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) {
298         return false;
299     }
300     // Write code points
301     if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer,
302             ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) {
303         return false;
304     }
305     int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
306     if (!ptNodeParams->willBecomeNonTerminal()) {
307         if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) {
308             terminalId = ptNodeParams->getTerminalId();
309         } else if (ptNodeParams->isTerminal()) {
310             // Write terminal information using a new terminal id.
311             // Get a new unused terminal id.
312             terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId();
313         }
314     }
315     const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
316     if (isTerminal) {
317         // Update the lookup table.
318         if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition(
319                 terminalId, nodePos)) {
320             return false;
321         }
322         // Write terminal Id.
323         if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId,
324                 Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) {
325             return false;
326         }
327         if (outTerminalId) {
328             *outTerminalId = terminalId;
329         }
330     }
331     // Write children position
332     if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer,
333             ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
334         return false;
335     }
336     return updatePtNodeFlags(nodePos, isTerminal,
337             ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
338 }
339 
updatePtNodeFlags(const int ptNodePos,const bool isTerminal,const bool hasMultipleChars)340 bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal,
341         const bool hasMultipleChars) {
342     // Create node flags and write them.
343     PatriciaTrieReadingUtils::NodeFlags nodeFlags =
344             PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
345                     false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */,
346                     false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
347     if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
348         AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
349         return false;
350     }
351     return true;
352 }
353 
354 } // namespace latinime
355