1 /*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /*
18 * !!!!! DO NOT EDIT THIS FILE !!!!!
19 *
20 * This file was generated from
21 * dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
22 */
23
24 #include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
25
26 #include <cstring>
27 #include <queue>
28
29 #include "dictionary/header/header_policy.h"
30 #include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
31 #include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
32 #include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
33 #include "dictionary/structure/backward/v402/ver4_dict_constants.h"
34 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
35 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
36 #include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
37 #include "dictionary/utils/buffer_with_extendable_buffer.h"
38 #include "dictionary/utils/file_utils.h"
39 #include "dictionary/utils/forgetting_curve_utils.h"
40
41 namespace latinime {
42 namespace backward {
43 namespace v402 {
44
writeToDictFile(const char * const dictDirPath,const EntryCounts & entryCounts) const45 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
46 const EntryCounts &entryCounts) const {
47 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
48 BufferWithExtendableBuffer headerBuffer(
49 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
50 const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
51 + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
52 if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
53 entryCounts, extendedRegionSize, &headerBuffer)) {
54 AKLOGE("Cannot write header structure to buffer. "
55 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
56 "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
57 entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
58 return false;
59 }
60 return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
61 }
62
writeToDictFileWithGC(const int rootPtNodeArrayPos,const char * const dictDirPath)63 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
64 const char *const dictDirPath) {
65 const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
66 Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
67 Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
68 Ver4DictConstants::MAX_DICTIONARY_SIZE));
69 int unigramCount = 0;
70 int bigramCount = 0;
71 if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
72 return false;
73 }
74 BufferWithExtendableBuffer headerBuffer(
75 BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
76 MutableEntryCounters entryCounters;
77 entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
78 entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
79 if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
80 entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
81 return false;
82 }
83 return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
84 }
85
runGC(const int rootPtNodeArrayPos,const HeaderPolicy * const headerPolicy,Ver4DictBuffers * const buffersToWrite,int * const outUnigramCount,int * const outBigramCount)86 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
87 const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
88 int *const outUnigramCount, int *const outBigramCount) {
89 Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
90 mBuffers->getProbabilityDictContent(), headerPolicy);
91 Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
92 Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
93 mBuffers->getTerminalPositionLookupTable(), headerPolicy);
94 Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
95 mBuffers->getTerminalPositionLookupTable());
96 Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
97 mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
98 &shortcutPolicy);
99
100 DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
101 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
102 DynamicPtGcEventListeners
103 ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
104 traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
105 &ptNodeWriter);
106 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
107 &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
108 return false;
109 }
110 const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
111 .getValidUnigramCount();
112 const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
113 if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
114 if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
115 AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
116 maxUnigramCount);
117 return false;
118 }
119 }
120
121 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
122 DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
123 traversePolicyToUpdateBigramProbability(&ptNodeWriter);
124 if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
125 &traversePolicyToUpdateBigramProbability)) {
126 return false;
127 }
128 const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
129 const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
130 if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
131 if (!truncateBigrams(maxBigramCount)) {
132 AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
133 return false;
134 }
135 }
136
137 // Mapping from positions in mBuffer to positions in bufferToWrite.
138 PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
139 readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
140 Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
141 buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
142 &shortcutPolicy);
143 DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
144 traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
145 buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
146 if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
147 &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
148 return false;
149 }
150
151 // Create policy instances for the GCed dictionary.
152 Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
153 buffersToWrite->getProbabilityDictContent(), headerPolicy);
154 Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
155 Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
156 buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
157 Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
158 buffersToWrite->getTerminalPositionLookupTable());
159 Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
160 buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
161 &newShortcutPolicy);
162 // Re-assign terminal IDs for valid terminal PtNodes.
163 TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
164 if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
165 &terminalIdMap)) {
166 return false;
167 }
168 // Run GC for probability dict content.
169 if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
170 mBuffers->getProbabilityDictContent())) {
171 return false;
172 }
173 // Run GC for bigram dict content.
174 if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
175 mBuffers->getBigramDictContent(), outBigramCount)) {
176 return false;
177 }
178 // Run GC for shortcut dict content.
179 if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
180 mBuffers->getShortcutDictContent())) {
181 return false;
182 }
183 DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
184 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
185 DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
186 traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
187 if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
188 &traversePolicyToUpdateAllPositionFields)) {
189 return false;
190 }
191 newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
192 TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
193 traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
194 if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
195 &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
196 return false;
197 }
198 *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
199 return true;
200 }
201
truncateUnigrams(const Ver4PatriciaTrieNodeReader * const ptNodeReader,Ver4PatriciaTrieNodeWriter * const ptNodeWriter,const int maxUnigramCount)202 bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
203 const Ver4PatriciaTrieNodeReader *const ptNodeReader,
204 Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
205 const TerminalPositionLookupTable *const terminalPosLookupTable =
206 mBuffers->getTerminalPositionLookupTable();
207 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
208 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
209 priorityQueue;
210 for (int i = 0; i < nextTerminalId; ++i) {
211 const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
212 if (terminalPos == NOT_A_DICT_POS) {
213 continue;
214 }
215 const ProbabilityEntry probabilityEntry =
216 mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
217 const int probability = probabilityEntry.hasHistoricalInfo() ?
218 ForgettingCurveUtils::decodeProbability(
219 probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
220 probabilityEntry.getProbability();
221 priorityQueue.push(DictProbability(terminalPos, probability,
222 probabilityEntry.getHistoricalInfo()->getTimestamp()));
223 }
224
225 // Delete unigrams.
226 while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
227 const int ptNodePos = priorityQueue.top().getDictPos();
228 priorityQueue.pop();
229 const PtNodeParams ptNodeParams =
230 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
231 if (ptNodeParams.representsNonWordInfo()) {
232 continue;
233 }
234 if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
235 AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
236 return false;
237 }
238 }
239 return true;
240 }
241
truncateBigrams(const int maxBigramCount)242 bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
243 const TerminalPositionLookupTable *const terminalPosLookupTable =
244 mBuffers->getTerminalPositionLookupTable();
245 const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
246 std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
247 priorityQueue;
248 BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
249 for (int i = 0; i < nextTerminalId; ++i) {
250 const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
251 if (bigramListPos == NOT_A_DICT_POS) {
252 continue;
253 }
254 bool hasNext = true;
255 int readingPos = bigramListPos;
256 while (hasNext) {
257 const int entryPos = readingPos;
258 const BigramEntry bigramEntry =
259 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
260 hasNext = bigramEntry.hasNext();
261 if (!bigramEntry.isValid()) {
262 continue;
263 }
264 const int probability = bigramEntry.hasHistoricalInfo() ?
265 ForgettingCurveUtils::decodeProbability(
266 bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
267 bigramEntry.getProbability();
268 priorityQueue.push(DictProbability(entryPos, probability,
269 bigramEntry.getHistoricalInfo()->getTimestamp()));
270 }
271 }
272
273 // Delete bigrams.
274 while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
275 const int entryPos = priorityQueue.top().getDictPos();
276 const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
277 const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
278 if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
279 AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
280 return false;
281 }
282 priorityQueue.pop();
283 }
284 return true;
285 }
286
287 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
onVisitingPtNode(const PtNodeParams * const ptNodeParams)288 ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
289 if (!ptNodeParams->isTerminal()) {
290 return true;
291 }
292 TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
293 mTerminalIdMap->find(ptNodeParams->getTerminalId());
294 if (it == mTerminalIdMap->end()) {
295 AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
296 ptNodeParams->getTerminalId(), mTerminalIdMap->size());
297 return false;
298 }
299 if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
300 AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
301 }
302 return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
303 }
304
305 } // namespace v402
306 } // namespace backward
307 } // namespace latinime
308