1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /*
18  * !!!!! DO NOT EDIT THIS FILE !!!!!
19  *
20  * This file was generated from
21  *   dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
22  */
23 
24 #include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
25 
26 #include <cstring>
27 #include <queue>
28 
29 #include "dictionary/header/header_policy.h"
30 #include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
31 #include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
32 #include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
33 #include "dictionary/structure/backward/v402/ver4_dict_constants.h"
34 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
35 #include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
36 #include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
37 #include "dictionary/utils/buffer_with_extendable_buffer.h"
38 #include "dictionary/utils/file_utils.h"
39 #include "dictionary/utils/forgetting_curve_utils.h"
40 
41 namespace latinime {
42 namespace backward {
43 namespace v402 {
44 
writeToDictFile(const char * const dictDirPath,const EntryCounts & entryCounts) const45 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
46         const EntryCounts &entryCounts) const {
47     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
48     BufferWithExtendableBuffer headerBuffer(
49             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
50     const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
51             + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
52     if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
53             entryCounts, extendedRegionSize, &headerBuffer)) {
54         AKLOGE("Cannot write header structure to buffer. "
55                 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
56                 "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
57                 entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
58         return false;
59     }
60     return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
61 }
62 
writeToDictFileWithGC(const int rootPtNodeArrayPos,const char * const dictDirPath)63 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
64         const char *const dictDirPath) {
65     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
66     Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
67             Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
68                     Ver4DictConstants::MAX_DICTIONARY_SIZE));
69     int unigramCount = 0;
70     int bigramCount = 0;
71     if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
72         return false;
73     }
74     BufferWithExtendableBuffer headerBuffer(
75             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
76     MutableEntryCounters entryCounters;
77     entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
78     entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
79     if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
80             entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
81         return false;
82     }
83     return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
84 }
85 
runGC(const int rootPtNodeArrayPos,const HeaderPolicy * const headerPolicy,Ver4DictBuffers * const buffersToWrite,int * const outUnigramCount,int * const outBigramCount)86 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
87         const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
88         int *const outUnigramCount, int *const outBigramCount) {
89     Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
90             mBuffers->getProbabilityDictContent(), headerPolicy);
91     Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
92     Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
93             mBuffers->getTerminalPositionLookupTable(), headerPolicy);
94     Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
95             mBuffers->getTerminalPositionLookupTable());
96     Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
97             mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
98             &shortcutPolicy);
99 
100     DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
101     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
102     DynamicPtGcEventListeners
103             ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
104                     traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
105                             &ptNodeWriter);
106     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
107             &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
108         return false;
109     }
110     const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
111             .getValidUnigramCount();
112     const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
113     if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
114         if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
115             AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
116                     maxUnigramCount);
117             return false;
118         }
119     }
120 
121     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
122     DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
123             traversePolicyToUpdateBigramProbability(&ptNodeWriter);
124     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
125             &traversePolicyToUpdateBigramProbability)) {
126         return false;
127     }
128     const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
129     const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
130     if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
131         if (!truncateBigrams(maxBigramCount)) {
132             AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
133             return false;
134         }
135     }
136 
137     // Mapping from positions in mBuffer to positions in bufferToWrite.
138     PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
139     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
140     Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
141             buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
142             &shortcutPolicy);
143     DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
144             traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
145                     buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
146     if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
147             &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
148         return false;
149     }
150 
151     // Create policy instances for the GCed dictionary.
152     Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
153             buffersToWrite->getProbabilityDictContent(), headerPolicy);
154     Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
155     Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
156             buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
157     Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
158             buffersToWrite->getTerminalPositionLookupTable());
159     Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
160             buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
161             &newShortcutPolicy);
162     // Re-assign terminal IDs for valid terminal PtNodes.
163     TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
164     if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
165             &terminalIdMap)) {
166         return false;
167     }
168     // Run GC for probability dict content.
169     if (!buffersToWrite->getMutableProbabilityDictContent()->runGC(&terminalIdMap,
170             mBuffers->getProbabilityDictContent())) {
171         return false;
172     }
173     // Run GC for bigram dict content.
174     if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
175             mBuffers->getBigramDictContent(), outBigramCount)) {
176         return false;
177     }
178     // Run GC for shortcut dict content.
179     if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
180             mBuffers->getShortcutDictContent())) {
181         return false;
182     }
183     DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
184     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
185     DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
186             traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
187     if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
188             &traversePolicyToUpdateAllPositionFields)) {
189         return false;
190     }
191     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
192     TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
193             traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
194     if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
195             &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
196         return false;
197     }
198     *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
199     return true;
200 }
201 
truncateUnigrams(const Ver4PatriciaTrieNodeReader * const ptNodeReader,Ver4PatriciaTrieNodeWriter * const ptNodeWriter,const int maxUnigramCount)202 bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
203         const Ver4PatriciaTrieNodeReader *const ptNodeReader,
204         Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
205     const TerminalPositionLookupTable *const terminalPosLookupTable =
206             mBuffers->getTerminalPositionLookupTable();
207     const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
208     std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
209             priorityQueue;
210     for (int i = 0; i < nextTerminalId; ++i) {
211         const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
212         if (terminalPos == NOT_A_DICT_POS) {
213             continue;
214         }
215         const ProbabilityEntry probabilityEntry =
216                 mBuffers->getProbabilityDictContent()->getProbabilityEntry(i);
217         const int probability = probabilityEntry.hasHistoricalInfo() ?
218                 ForgettingCurveUtils::decodeProbability(
219                         probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
220                 probabilityEntry.getProbability();
221         priorityQueue.push(DictProbability(terminalPos, probability,
222                 probabilityEntry.getHistoricalInfo()->getTimestamp()));
223     }
224 
225     // Delete unigrams.
226     while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
227         const int ptNodePos = priorityQueue.top().getDictPos();
228         priorityQueue.pop();
229         const PtNodeParams ptNodeParams =
230                 ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
231         if (ptNodeParams.representsNonWordInfo()) {
232             continue;
233         }
234         if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
235             AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
236             return false;
237         }
238     }
239     return true;
240 }
241 
truncateBigrams(const int maxBigramCount)242 bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
243     const TerminalPositionLookupTable *const terminalPosLookupTable =
244             mBuffers->getTerminalPositionLookupTable();
245     const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
246     std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
247             priorityQueue;
248     BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
249     for (int i = 0; i < nextTerminalId; ++i) {
250         const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
251         if (bigramListPos == NOT_A_DICT_POS) {
252             continue;
253         }
254         bool hasNext = true;
255         int readingPos = bigramListPos;
256         while (hasNext) {
257             const int entryPos = readingPos;
258             const BigramEntry bigramEntry =
259                     bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
260             hasNext = bigramEntry.hasNext();
261             if (!bigramEntry.isValid()) {
262                 continue;
263             }
264             const int probability = bigramEntry.hasHistoricalInfo() ?
265                     ForgettingCurveUtils::decodeProbability(
266                             bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
267                     bigramEntry.getProbability();
268             priorityQueue.push(DictProbability(entryPos, probability,
269                     bigramEntry.getHistoricalInfo()->getTimestamp()));
270         }
271     }
272 
273     // Delete bigrams.
274     while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
275         const int entryPos = priorityQueue.top().getDictPos();
276         const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
277         const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
278         if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
279             AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
280             return false;
281         }
282         priorityQueue.pop();
283     }
284     return true;
285 }
286 
287 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
onVisitingPtNode(const PtNodeParams * const ptNodeParams)288         ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
289     if (!ptNodeParams->isTerminal()) {
290         return true;
291     }
292     TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
293             mTerminalIdMap->find(ptNodeParams->getTerminalId());
294     if (it == mTerminalIdMap->end()) {
295         AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
296                 ptNodeParams->getTerminalId(), mTerminalIdMap->size());
297         return false;
298     }
299     if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
300         AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
301     }
302     return mPtNodeWriter->updatePtNodeHasBigramsAndShortcutTargetsFlags(ptNodeParams);
303 }
304 
305 } // namespace v402
306 } // namespace backward
307 } // namespace latinime
308