1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
18 
19 #include <cstring>
20 #include <queue>
21 
22 #include "dictionary/header/header_policy.h"
23 #include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
24 #include "dictionary/structure/v4/ver4_dict_buffers.h"
25 #include "dictionary/structure/v4/ver4_dict_constants.h"
26 #include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
27 #include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
28 #include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
29 #include "dictionary/utils/buffer_with_extendable_buffer.h"
30 #include "dictionary/utils/file_utils.h"
31 #include "dictionary/utils/forgetting_curve_utils.h"
32 #include "utils/ngram_utils.h"
33 
34 namespace latinime {
35 
writeToDictFile(const char * const dictDirPath,const EntryCounts & entryCounts) const36 bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
37         const EntryCounts &entryCounts) const {
38     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
39     BufferWithExtendableBuffer headerBuffer(
40             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
41     const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
42             + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
43     if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
44             entryCounts, extendedRegionSize, &headerBuffer)) {
45         AKLOGE("Cannot write header structure to buffer. "
46                 "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d,"
47                 "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
48                 entryCounts.getNgramCount(NgramType::Bigram),
49                 entryCounts.getNgramCount(NgramType::Trigram),
50                 extendedRegionSize);
51         return false;
52     }
53     return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
54 }
55 
writeToDictFileWithGC(const int rootPtNodeArrayPos,const char * const dictDirPath)56 bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
57         const char *const dictDirPath) {
58     const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
59     Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
60             Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
61                     Ver4DictConstants::MAX_DICTIONARY_SIZE));
62     MutableEntryCounters entryCounters;
63     if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &entryCounters)) {
64         return false;
65     }
66     BufferWithExtendableBuffer headerBuffer(
67             BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
68     if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
69             entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
70         return false;
71     }
72     return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
73 }
74 
runGC(const int rootPtNodeArrayPos,const HeaderPolicy * const headerPolicy,Ver4DictBuffers * const buffersToWrite,MutableEntryCounters * const outEntryCounters)75 bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
76         const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
77         MutableEntryCounters *const outEntryCounters) {
78     Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer());
79     Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
80     Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
81             mBuffers->getTerminalPositionLookupTable());
82     Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
83             mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
84 
85     if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC(
86             headerPolicy, outEntryCounters)) {
87         AKLOGE("Failed to update probabilities in language model dict content.");
88         return false;
89     }
90     if (headerPolicy->isDecayingDict()) {
91         const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts();
92         if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries(
93                 outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy,
94                 outEntryCounters)) {
95             AKLOGE("Failed to truncate entries in language model dict content.");
96             return false;
97         }
98     }
99 
100     DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
101     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
102     DynamicPtGcEventListeners
103             ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
104                     traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
105                             &ptNodeWriter);
106     if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
107             &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
108         return false;
109     }
110 
111     // Mapping from positions in mBuffer to positions in bufferToWrite.
112     PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
113     readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
114     Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
115             buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
116     DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
117             traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
118                     buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
119     if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
120             &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
121         return false;
122     }
123 
124     // Create policy instances for the GCed dictionary.
125     Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer());
126     Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
127     Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
128             buffersToWrite->getTerminalPositionLookupTable());
129     Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
130             buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader,
131             &newShortcutPolicy);
132     // Re-assign terminal IDs for valid terminal PtNodes.
133     TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
134     if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
135             &terminalIdMap)) {
136         return false;
137     }
138     // Run GC for language model dict content.
139     if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
140             mBuffers->getLanguageModelDictContent())) {
141         return false;
142     }
143     // Run GC for shortcut dict content.
144     if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
145             mBuffers->getShortcutDictContent())) {
146         return false;
147     }
148     DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
149     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
150     DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
151             traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
152     if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
153             &traversePolicyToUpdateAllPositionFields)) {
154         return false;
155     }
156     newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
157     TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
158             traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
159     if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
160             &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
161         return false;
162     }
163     return true;
164 }
165 
166 bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
onVisitingPtNode(const PtNodeParams * const ptNodeParams)167         ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
168     if (!ptNodeParams->isTerminal()) {
169         return true;
170     }
171     TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
172             mTerminalIdMap->find(ptNodeParams->getTerminalId());
173     if (it == mTerminalIdMap->end()) {
174         AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
175                 ptNodeParams->getTerminalId(), mTerminalIdMap->size());
176         return false;
177     }
178     if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
179         AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
180         return false;
181     }
182     return true;
183 }
184 
185 } // namespace latinime
186