1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
18 
19 #include "dictionary/property/unigram_property.h"
20 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
21 #include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
22 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
23 #include "dictionary/structure/pt_common/pt_node_reader.h"
24 #include "dictionary/structure/pt_common/pt_node_writer.h"
25 #include "dictionary/utils/buffer_with_extendable_buffer.h"
26 
27 namespace latinime {
28 
29 const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
30 
addUnigramWord(DynamicPtReadingHelper * const readingHelper,const CodePointArrayView wordCodePoints,const UnigramProperty * const unigramProperty,bool * const outAddedNewUnigram)31 bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper,
32         const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
33         bool *const outAddedNewUnigram) {
34     int parentPos = NOT_A_DICT_POS;
35     while (!readingHelper->isEnd()) {
36         const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
37         if (!ptNodeParams.isValid()) {
38             break;
39         }
40         const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
41         if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
42                 wordCodePoints[matchedCodePointCount])) {
43             // The first code point is different from target code point. Skip this node and read
44             // the next sibling node.
45             readingHelper->readNextSiblingNode(ptNodeParams);
46             continue;
47         }
48         // Check following merged node code points.
49         const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size();
50         for (size_t j = 1; j < nodeCodePointCount; ++j) {
51             const size_t nextIndex = matchedCodePointCount + j;
52             if (nextIndex >= wordCodePoints.size()
53                     || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
54                             wordCodePoints[matchedCodePointCount + j])) {
55                 *outAddedNewUnigram = true;
56                 return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
57                         wordCodePoints.skip(matchedCodePointCount));
58             }
59         }
60         // All characters are matched.
61         if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) {
62             return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
63         }
64         if (!ptNodeParams.hasChildren()) {
65             *outAddedNewUnigram = true;
66             return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
67                     wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams)));
68         }
69         // Advance to the children nodes.
70         parentPos = ptNodeParams.getHeadPos();
71         readingHelper->readChildNode(ptNodeParams);
72     }
73     if (readingHelper->isError()) {
74         // The dictionary is invalid.
75         return false;
76     }
77     int pos = readingHelper->getPosOfLastForwardLinkField();
78     *outAddedNewUnigram = true;
79     return createAndInsertNodeIntoPtNodeArray(parentPos,
80             wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty,
81             &pos);
82 }
83 
addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,const int wordPos,const NgramProperty * const ngramProperty,bool * const outAddedNewEntry)84 bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
85         const int wordPos, const NgramProperty *const ngramProperty,
86         bool *const outAddedNewEntry) {
87     if (prevWordsPtNodePos.empty()) {
88         return false;
89     }
90     ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
91     int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
92     for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
93         prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
94                 prevWordsPtNodePos[i]).getTerminalId();
95     }
96     const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
97     const int wordId =
98             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
99     return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
100 }
101 
removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,const int wordPos)102 bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
103         const int wordPos) {
104     if (prevWordsPtNodePos.empty()) {
105         return false;
106     }
107     ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
108     int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
109     for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
110         prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
111                 prevWordsPtNodePos[i]).getTerminalId();
112     }
113     const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
114     const int wordId =
115             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
116     return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
117 }
118 
addShortcutTarget(const int wordPos,const CodePointArrayView targetCodePoints,const int shortcutProbability)119 bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
120         const CodePointArrayView targetCodePoints, const int shortcutProbability) {
121     const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
122     return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(),
123             targetCodePoints.size(), shortcutProbability);
124 }
125 
createAndInsertNodeIntoPtNodeArray(const int parentPos,const CodePointArrayView ptNodeCodePoints,const UnigramProperty * const unigramProperty,int * const forwardLinkFieldPos)126 bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
127         const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
128         int *const forwardLinkFieldPos) {
129     const int newPtNodeArrayPos = mBuffer->getTailPosition();
130     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
131             newPtNodeArrayPos, forwardLinkFieldPos)) {
132         return false;
133     }
134     return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty);
135 }
136 
setPtNodeProbability(const PtNodeParams * const originalPtNodeParams,const UnigramProperty * const unigramProperty,bool * const outAddedNewUnigram)137 bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
138         const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
139     if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
140         // Overwrites the probability.
141         *outAddedNewUnigram = false;
142         return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
143     } else {
144         // Make the node terminal and write the probability.
145         *outAddedNewUnigram = true;
146         const int movedPos = mBuffer->getTailPosition();
147         int writingPos = movedPos;
148         const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
149                 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
150                 true /* isTerminal */, originalPtNodeParams->getParentPos(),
151                 originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
152         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
153                 unigramProperty, &writingPos)) {
154             return false;
155         }
156         if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
157             return false;
158         }
159     }
160     return true;
161 }
162 
createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams * const parentPtNodeParams,const UnigramProperty * const unigramProperty,const CodePointArrayView codePoints)163 bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
164         const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
165         const CodePointArrayView codePoints) {
166     const int newPtNodeArrayPos = mBuffer->getTailPosition();
167     if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
168         return false;
169     }
170     return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
171             unigramProperty);
172 }
173 
createNewPtNodeArrayWithAChildPtNode(const int parentPtNodePos,const CodePointArrayView ptNodeCodePoints,const UnigramProperty * const unigramProperty)174 bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
175         const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints,
176         const UnigramProperty *const unigramProperty) {
177     int writingPos = mBuffer->getTailPosition();
178     if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
179             1 /* arraySize */, &writingPos)) {
180         return false;
181     }
182     const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
183             unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
184             true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
185             unigramProperty->getProbability()));
186     if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
187             unigramProperty, &writingPos)) {
188         return false;
189     }
190     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
191             NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
192         return false;
193     }
194     return true;
195 }
196 
197 // Returns whether the dictionary updating was succeeded or not.
reallocatePtNodeAndAddNewPtNodes(const PtNodeParams * const reallocatingPtNodeParams,const size_t overlappingCodePointCount,const UnigramProperty * const unigramProperty,const CodePointArrayView newPtNodeCodePoints)198 bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
199         const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount,
200         const UnigramProperty *const unigramProperty,
201         const CodePointArrayView newPtNodeCodePoints) {
202     // When addsExtraChild is true, split the reallocating PtNode and add new child.
203     // Reallocating PtNode: abcde, newNode: abcxy.
204     // abc (1st, not terminal) __ de (2nd)
205     //                         \_ xy (extra child, terminal)
206     // Otherwise, this method makes 1st part terminal and write information in unigramProperty.
207     // Reallocating PtNode: abcde, newNode: abc.
208     // abc (1st, terminal) __ de (2nd)
209     const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount;
210     const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
211     int writingPos = firstPartOfReallocatedPtNodePos;
212     // Write the 1st part of the reallocating node. The children position will be updated later
213     // with actual children position.
214     const CodePointArrayView firstPtNodeCodePoints =
215             reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
216     if (addsExtraChild) {
217         const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
218                 false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
219                 reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
220                 NOT_A_PROBABILITY));
221         if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
222             return false;
223         }
224     } else {
225         const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
226                 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
227                 true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
228                 firstPtNodeCodePoints, unigramProperty->getProbability()));
229         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
230                 unigramProperty, &writingPos)) {
231             return false;
232         }
233     }
234     const int actualChildrenPos = writingPos;
235     // Create new children PtNode array.
236     const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
237     if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
238             newPtNodeCount, &writingPos)) {
239         return false;
240     }
241     // Write the 2nd part of the reallocating node.
242     const int secondPartOfReallocatedPtNodePos = writingPos;
243     const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
244             reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
245             reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
246             reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
247             reallocatingPtNodeParams->getProbability()));
248     if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
249         return false;
250     }
251     if (addsExtraChild) {
252         const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
253                 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
254                 true /* isTerminal */, firstPartOfReallocatedPtNodePos,
255                 newPtNodeCodePoints.skip(overlappingCodePointCount),
256                 unigramProperty->getProbability()));
257         if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
258                 unigramProperty, &writingPos)) {
259             return false;
260         }
261     }
262     if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
263             NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
264         return false;
265     }
266     // Update original reallocating PtNode as moved.
267     if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
268             secondPartOfReallocatedPtNodePos)) {
269         return false;
270     }
271     // Load node info. Information of the 1st part will be fetched.
272     const PtNodeParams ptNodeParams(
273             mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
274     // Update children position.
275     return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
276 }
277 
getUpdatedPtNodeParams(const PtNodeParams * const originalPtNodeParams,const bool isNotAWord,const bool isPossiblyOffensive,const bool isTerminal,const int parentPos,const CodePointArrayView codePoints,const int probability) const278 const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
279         const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
280         const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
281         const CodePointArrayView codePoints, const int probability) const {
282     const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
283             isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
284             false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
285             CHILDREN_POSITION_FIELD_SIZE);
286     return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
287 }
288 
getPtNodeParamsForNewPtNode(const bool isNotAWord,const bool isPossiblyOffensive,const bool isTerminal,const int parentPos,const CodePointArrayView codePoints,const int probability) const289 const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
290         const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
291         const CodePointArrayView codePoints, const int probability) const {
292     const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
293             isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
294             false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
295             CHILDREN_POSITION_FIELD_SIZE);
296     return PtNodeParams(flags, parentPos, codePoints, probability);
297 }
298 
299 } // namespace latinime
300