1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
21 import com.android.inputmethod.latin.makedict.BinaryDictEncoderUtils.CodePointTable;
22 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
23 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
24 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
25 
26 import java.io.File;
27 import java.io.FileNotFoundException;
28 import java.io.FileOutputStream;
29 import java.io.IOException;
30 import java.io.OutputStream;
31 import java.util.ArrayList;
32 import java.util.Collections;
33 import java.util.Comparator;
34 import java.util.HashMap;
35 import java.util.Iterator;
36 import java.util.Map.Entry;
37 
38 /**
39  * An implementation of DictEncoder for version 2 binary dictionary.
40  */
41 @UsedForTesting
42 public class Ver2DictEncoder implements DictEncoder {
43 
44     private final File mDictFile;
45     private OutputStream mOutStream;
46     private byte[] mBuffer;
47     private int mPosition;
48     private final int mCodePointTableMode;
49     public static final int CODE_POINT_TABLE_OFF = 0;
50     public static final int CODE_POINT_TABLE_ON = 1;
51 
52     @UsedForTesting
Ver2DictEncoder(final File dictFile, final int codePointTableMode)53     public Ver2DictEncoder(final File dictFile, final int codePointTableMode) {
54         mDictFile = dictFile;
55         mOutStream = null;
56         mBuffer = null;
57         mCodePointTableMode = codePointTableMode;
58     }
59 
60     // This constructor is used only by BinaryDictOffdeviceUtilsTests.
61     // If you want to use this in the production code, you should consider keeping consistency of
62     // the interface of Ver3DictDecoder by using factory.
63     @UsedForTesting
Ver2DictEncoder(final OutputStream outStream)64     public Ver2DictEncoder(final OutputStream outStream) {
65         mDictFile = null;
66         mOutStream = outStream;
67         mCodePointTableMode = CODE_POINT_TABLE_OFF;
68     }
69 
openStream()70     private void openStream() throws FileNotFoundException {
71         mOutStream = new FileOutputStream(mDictFile);
72     }
73 
close()74     private void close() throws IOException {
75         if (mOutStream != null) {
76             mOutStream.close();
77             mOutStream = null;
78         }
79     }
80 
81     // Package for testing
makeCodePointTable(final FusionDictionary dict)82     static CodePointTable makeCodePointTable(final FusionDictionary dict) {
83         final HashMap<Integer, Integer> codePointOccurrenceCounts = new HashMap<>();
84         for (final WordProperty word : dict) {
85             // Store per code point occurrence
86             final String wordString = word.mWord;
87             for (int i = 0; i < wordString.length(); ++i) {
88                 final int codePoint = Character.codePointAt(wordString, i);
89                 if (codePointOccurrenceCounts.containsKey(codePoint)) {
90                     codePointOccurrenceCounts.put(codePoint,
91                             codePointOccurrenceCounts.get(codePoint) + 1);
92                 } else {
93                     codePointOccurrenceCounts.put(codePoint, 1);
94                 }
95             }
96         }
97         final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray =
98                 new ArrayList<>(codePointOccurrenceCounts.entrySet());
99         // Descending order sort by occurrence (value side)
100         Collections.sort(codePointOccurrenceArray, new Comparator<Entry<Integer, Integer>>() {
101             @Override
102             public int compare(final Entry<Integer, Integer> a, final Entry<Integer, Integer> b) {
103                 if (a.getValue() != b.getValue()) {
104                     return b.getValue().compareTo(a.getValue());
105                 }
106                 return b.getKey().compareTo(a.getKey());
107             }
108         });
109         int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE;
110         // Temporary map for writing of nodes
111         final HashMap<Integer, Integer> codePointToOneByteCodeMap = new HashMap<>();
112         for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) {
113             // Put a relation from the original code point to the one byte code.
114             codePointToOneByteCodeMap.put(entry.getKey(), currentCodePointTableIndex);
115             if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) {
116                 break;
117             }
118         }
119         // codePointToOneByteCodeMap for writing the trie
120         // codePointOccurrenceArray for writing the header
121         return new CodePointTable(codePointToOneByteCodeMap, codePointOccurrenceArray);
122     }
123 
124     @Override
writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)125     public void writeDictionary(final FusionDictionary dict, final FormatOptions formatOptions)
126             throws IOException, UnsupportedFormatException {
127         // We no longer support anything but the latest version of v2.
128         if (formatOptions.mVersion != FormatSpec.VERSION202) {
129             throw new UnsupportedFormatException(
130                     "The given format options has wrong version number : "
131                     + formatOptions.mVersion);
132         }
133 
134         if (mOutStream == null) {
135             openStream();
136         }
137 
138         // Make code point conversion table ordered by occurrence of code points
139         // Version 201 or later have codePointTable
140         final CodePointTable codePointTable;
141         if (mCodePointTableMode == CODE_POINT_TABLE_OFF || formatOptions.mVersion
142                 < FormatSpec.MINIMUM_SUPPORTED_VERSION_OF_CODE_POINT_TABLE) {
143             codePointTable = new CodePointTable();
144         } else {
145             codePointTable = makeCodePointTable(dict);
146         }
147 
148         BinaryDictEncoderUtils.writeDictionaryHeader(mOutStream, dict, formatOptions,
149                 codePointTable.mCodePointOccurrenceArray);
150 
151         // Addresses are limited to 3 bytes, but since addresses can be relative to each node
152         // array, the structure itself is not limited to 16MB. However, if it is over 16MB deciding
153         // the order of the PtNode arrays becomes a quite complicated problem, because though the
154         // dictionary itself does not have a size limit, each node array must still be within 16MB
155         // of all its children and parents. As long as this is ensured, the dictionary file may
156         // grow to any size.
157 
158         // Leave the choice of the optimal node order to the flattenTree function.
159         MakedictLog.i("Flattening the tree...");
160         ArrayList<PtNodeArray> flatNodes = BinaryDictEncoderUtils.flattenTree(dict.mRootNodeArray);
161 
162         MakedictLog.i("Computing addresses...");
163         BinaryDictEncoderUtils.computeAddresses(dict, flatNodes,
164                 codePointTable.mCodePointToOneByteCodeMap);
165         MakedictLog.i("Checking PtNode array...");
166         if (MakedictLog.DBG) BinaryDictEncoderUtils.checkFlatPtNodeArrayList(flatNodes);
167 
168         // Create a buffer that matches the final dictionary size.
169         final PtNodeArray lastNodeArray = flatNodes.get(flatNodes.size() - 1);
170         final int bufferSize = lastNodeArray.mCachedAddressAfterUpdate + lastNodeArray.mCachedSize;
171         mBuffer = new byte[bufferSize];
172 
173         MakedictLog.i("Writing file...");
174 
175         for (PtNodeArray nodeArray : flatNodes) {
176             BinaryDictEncoderUtils.writePlacedPtNodeArray(dict, this, nodeArray,
177                     codePointTable.mCodePointToOneByteCodeMap);
178         }
179         if (MakedictLog.DBG) BinaryDictEncoderUtils.showStatistics(flatNodes);
180         mOutStream.write(mBuffer, 0, mPosition);
181 
182         MakedictLog.i("Done");
183         close();
184     }
185 
186     @Override
setPosition(final int position)187     public void setPosition(final int position) {
188         if (mBuffer == null || position < 0 || position >= mBuffer.length) return;
189         mPosition = position;
190     }
191 
192     @Override
getPosition()193     public int getPosition() {
194         return mPosition;
195     }
196 
197     @Override
writePtNodeCount(final int ptNodeCount)198     public void writePtNodeCount(final int ptNodeCount) {
199         final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
200         if (countSize != 1 && countSize != 2) {
201             throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
202         }
203         final int encodedPtNodeCount = (countSize == 2) ?
204                 (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
205         mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, encodedPtNodeCount,
206                 countSize);
207     }
208 
writePtNodeFlags(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap)209     private void writePtNodeFlags(final PtNode ptNode,
210             final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
211         final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
212                 codePointToOneByteCodeMap);
213         mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition,
214                 BinaryDictEncoderUtils.makePtNodeFlags(ptNode, childrenPos),
215                 FormatSpec.PTNODE_FLAGS_SIZE);
216     }
217 
writeCharacters(final int[] codePoints, final boolean hasSeveralChars, final HashMap<Integer, Integer> codePointToOneByteCodeMap)218     private void writeCharacters(final int[] codePoints, final boolean hasSeveralChars,
219             final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
220         mPosition = CharEncoding.writeCharArray(codePoints, mBuffer, mPosition,
221                 codePointToOneByteCodeMap);
222         if (hasSeveralChars) {
223             mBuffer[mPosition++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
224         }
225     }
226 
writeFrequency(final int frequency)227     private void writeFrequency(final int frequency) {
228         if (frequency >= 0) {
229             mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, frequency,
230                     FormatSpec.PTNODE_FREQUENCY_SIZE);
231         }
232     }
233 
writeChildrenPosition(final PtNode ptNode, final HashMap<Integer, Integer> codePointToOneByteCodeMap)234     private void writeChildrenPosition(final PtNode ptNode,
235             final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
236         final int childrenPos = BinaryDictEncoderUtils.getChildrenPosition(ptNode,
237                 codePointToOneByteCodeMap);
238         mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
239                 childrenPos);
240     }
241 
242     /**
243      * Write a bigram attributes list to mBuffer.
244      *
245      * @param bigrams the bigram attributes list.
246      * @param dict the dictionary the node array is a part of (for relative offsets).
247      */
writeBigrams(final ArrayList<WeightedString> bigrams, final FusionDictionary dict)248     private void writeBigrams(final ArrayList<WeightedString> bigrams,
249             final FusionDictionary dict) {
250         if (bigrams == null) return;
251 
252         final Iterator<WeightedString> bigramIterator = bigrams.iterator();
253         while (bigramIterator.hasNext()) {
254             final WeightedString bigram = bigramIterator.next();
255             final PtNode target =
256                     FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord);
257             final int addressOfBigram = target.mCachedAddressAfterUpdate;
258             final int unigramFrequencyForThisWord = target.getProbability();
259             final int offset = addressOfBigram
260                     - (mPosition + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
261             final int bigramFlags = BinaryDictEncoderUtils.makeBigramFlags(bigramIterator.hasNext(),
262                     offset, bigram.getProbability(), unigramFrequencyForThisWord, bigram.mWord);
263             mPosition = BinaryDictEncoderUtils.writeUIntToBuffer(mBuffer, mPosition, bigramFlags,
264                     FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE);
265             mPosition += BinaryDictEncoderUtils.writeChildrenPosition(mBuffer, mPosition,
266                     Math.abs(offset));
267         }
268     }
269 
270     @Override
writePtNode(final PtNode ptNode, final FusionDictionary dict, final HashMap<Integer, Integer> codePointToOneByteCodeMap)271     public void writePtNode(final PtNode ptNode, final FusionDictionary dict,
272             final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
273         writePtNodeFlags(ptNode, codePointToOneByteCodeMap);
274         writeCharacters(ptNode.mChars, ptNode.hasSeveralChars(), codePointToOneByteCodeMap);
275         writeFrequency(ptNode.getProbability());
276         writeChildrenPosition(ptNode, codePointToOneByteCodeMap);
277         writeBigrams(ptNode.mBigrams, dict);
278     }
279 }
280