1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
18
19 #include "dictionary/property/unigram_property.h"
20 #include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
21 #include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
22 #include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
23 #include "dictionary/structure/pt_common/pt_node_reader.h"
24 #include "dictionary/structure/pt_common/pt_node_writer.h"
25 #include "dictionary/utils/buffer_with_extendable_buffer.h"
26
27 namespace latinime {
28
29 const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
30
addUnigramWord(DynamicPtReadingHelper * const readingHelper,const CodePointArrayView wordCodePoints,const UnigramProperty * const unigramProperty,bool * const outAddedNewUnigram)31 bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper,
32 const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
33 bool *const outAddedNewUnigram) {
34 int parentPos = NOT_A_DICT_POS;
35 while (!readingHelper->isEnd()) {
36 const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
37 if (!ptNodeParams.isValid()) {
38 break;
39 }
40 const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
41 if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
42 wordCodePoints[matchedCodePointCount])) {
43 // The first code point is different from target code point. Skip this node and read
44 // the next sibling node.
45 readingHelper->readNextSiblingNode(ptNodeParams);
46 continue;
47 }
48 // Check following merged node code points.
49 const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size();
50 for (size_t j = 1; j < nodeCodePointCount; ++j) {
51 const size_t nextIndex = matchedCodePointCount + j;
52 if (nextIndex >= wordCodePoints.size()
53 || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
54 wordCodePoints[matchedCodePointCount + j])) {
55 *outAddedNewUnigram = true;
56 return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
57 wordCodePoints.skip(matchedCodePointCount));
58 }
59 }
60 // All characters are matched.
61 if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) {
62 return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
63 }
64 if (!ptNodeParams.hasChildren()) {
65 *outAddedNewUnigram = true;
66 return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
67 wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams)));
68 }
69 // Advance to the children nodes.
70 parentPos = ptNodeParams.getHeadPos();
71 readingHelper->readChildNode(ptNodeParams);
72 }
73 if (readingHelper->isError()) {
74 // The dictionary is invalid.
75 return false;
76 }
77 int pos = readingHelper->getPosOfLastForwardLinkField();
78 *outAddedNewUnigram = true;
79 return createAndInsertNodeIntoPtNodeArray(parentPos,
80 wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty,
81 &pos);
82 }
83
addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,const int wordPos,const NgramProperty * const ngramProperty,bool * const outAddedNewEntry)84 bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
85 const int wordPos, const NgramProperty *const ngramProperty,
86 bool *const outAddedNewEntry) {
87 if (prevWordsPtNodePos.empty()) {
88 return false;
89 }
90 ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
91 int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
92 for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
93 prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
94 prevWordsPtNodePos[i]).getTerminalId();
95 }
96 const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
97 const int wordId =
98 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
99 return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
100 }
101
removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,const int wordPos)102 bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
103 const int wordPos) {
104 if (prevWordsPtNodePos.empty()) {
105 return false;
106 }
107 ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM);
108 int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
109 for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) {
110 prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(
111 prevWordsPtNodePos[i]).getTerminalId();
112 }
113 const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
114 const int wordId =
115 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
116 return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId);
117 }
118
addShortcutTarget(const int wordPos,const CodePointArrayView targetCodePoints,const int shortcutProbability)119 bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
120 const CodePointArrayView targetCodePoints, const int shortcutProbability) {
121 const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
122 return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(),
123 targetCodePoints.size(), shortcutProbability);
124 }
125
createAndInsertNodeIntoPtNodeArray(const int parentPos,const CodePointArrayView ptNodeCodePoints,const UnigramProperty * const unigramProperty,int * const forwardLinkFieldPos)126 bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
127 const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
128 int *const forwardLinkFieldPos) {
129 const int newPtNodeArrayPos = mBuffer->getTailPosition();
130 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
131 newPtNodeArrayPos, forwardLinkFieldPos)) {
132 return false;
133 }
134 return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty);
135 }
136
setPtNodeProbability(const PtNodeParams * const originalPtNodeParams,const UnigramProperty * const unigramProperty,bool * const outAddedNewUnigram)137 bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
138 const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
139 if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) {
140 // Overwrites the probability.
141 *outAddedNewUnigram = false;
142 return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty);
143 } else {
144 // Make the node terminal and write the probability.
145 *outAddedNewUnigram = true;
146 const int movedPos = mBuffer->getTailPosition();
147 int writingPos = movedPos;
148 const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
149 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
150 true /* isTerminal */, originalPtNodeParams->getParentPos(),
151 originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
152 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
153 unigramProperty, &writingPos)) {
154 return false;
155 }
156 if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) {
157 return false;
158 }
159 }
160 return true;
161 }
162
createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams * const parentPtNodeParams,const UnigramProperty * const unigramProperty,const CodePointArrayView codePoints)163 bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
164 const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
165 const CodePointArrayView codePoints) {
166 const int newPtNodeArrayPos = mBuffer->getTailPosition();
167 if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
168 return false;
169 }
170 return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
171 unigramProperty);
172 }
173
createNewPtNodeArrayWithAChildPtNode(const int parentPtNodePos,const CodePointArrayView ptNodeCodePoints,const UnigramProperty * const unigramProperty)174 bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
175 const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints,
176 const UnigramProperty *const unigramProperty) {
177 int writingPos = mBuffer->getTailPosition();
178 if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
179 1 /* arraySize */, &writingPos)) {
180 return false;
181 }
182 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
183 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
184 true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
185 unigramProperty->getProbability()));
186 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
187 unigramProperty, &writingPos)) {
188 return false;
189 }
190 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
191 NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
192 return false;
193 }
194 return true;
195 }
196
197 // Returns whether the dictionary updating was succeeded or not.
reallocatePtNodeAndAddNewPtNodes(const PtNodeParams * const reallocatingPtNodeParams,const size_t overlappingCodePointCount,const UnigramProperty * const unigramProperty,const CodePointArrayView newPtNodeCodePoints)198 bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
199 const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount,
200 const UnigramProperty *const unigramProperty,
201 const CodePointArrayView newPtNodeCodePoints) {
202 // When addsExtraChild is true, split the reallocating PtNode and add new child.
203 // Reallocating PtNode: abcde, newNode: abcxy.
204 // abc (1st, not terminal) __ de (2nd)
205 // \_ xy (extra child, terminal)
206 // Otherwise, this method makes 1st part terminal and write information in unigramProperty.
207 // Reallocating PtNode: abcde, newNode: abc.
208 // abc (1st, terminal) __ de (2nd)
209 const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount;
210 const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
211 int writingPos = firstPartOfReallocatedPtNodePos;
212 // Write the 1st part of the reallocating node. The children position will be updated later
213 // with actual children position.
214 const CodePointArrayView firstPtNodeCodePoints =
215 reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
216 if (addsExtraChild) {
217 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
218 false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
219 reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
220 NOT_A_PROBABILITY));
221 if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
222 return false;
223 }
224 } else {
225 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
226 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
227 true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
228 firstPtNodeCodePoints, unigramProperty->getProbability()));
229 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
230 unigramProperty, &writingPos)) {
231 return false;
232 }
233 }
234 const int actualChildrenPos = writingPos;
235 // Create new children PtNode array.
236 const size_t newPtNodeCount = addsExtraChild ? 2 : 1;
237 if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
238 newPtNodeCount, &writingPos)) {
239 return false;
240 }
241 // Write the 2nd part of the reallocating node.
242 const int secondPartOfReallocatedPtNodePos = writingPos;
243 const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
244 reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
245 reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
246 reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
247 reallocatingPtNodeParams->getProbability()));
248 if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
249 return false;
250 }
251 if (addsExtraChild) {
252 const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
253 unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
254 true /* isTerminal */, firstPartOfReallocatedPtNodePos,
255 newPtNodeCodePoints.skip(overlappingCodePointCount),
256 unigramProperty->getProbability()));
257 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
258 unigramProperty, &writingPos)) {
259 return false;
260 }
261 }
262 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
263 NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) {
264 return false;
265 }
266 // Update original reallocating PtNode as moved.
267 if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos,
268 secondPartOfReallocatedPtNodePos)) {
269 return false;
270 }
271 // Load node info. Information of the 1st part will be fetched.
272 const PtNodeParams ptNodeParams(
273 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos));
274 // Update children position.
275 return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos);
276 }
277
getUpdatedPtNodeParams(const PtNodeParams * const originalPtNodeParams,const bool isNotAWord,const bool isPossiblyOffensive,const bool isTerminal,const int parentPos,const CodePointArrayView codePoints,const int probability) const278 const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
279 const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
280 const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
281 const CodePointArrayView codePoints, const int probability) const {
282 const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
283 isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
284 false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
285 CHILDREN_POSITION_FIELD_SIZE);
286 return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
287 }
288
getPtNodeParamsForNewPtNode(const bool isNotAWord,const bool isPossiblyOffensive,const bool isTerminal,const int parentPos,const CodePointArrayView codePoints,const int probability) const289 const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
290 const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
291 const CodePointArrayView codePoints, const int probability) const {
292 const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
293 isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
294 false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
295 CHILDREN_POSITION_FIELD_SIZE);
296 return PtNodeParams(flags, parentPos, codePoints, probability);
297 }
298
299 } // namespace latinime
300