1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin.dicttool;
18 
19 import com.android.inputmethod.latin.makedict.FormatSpec;
20 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
21 import com.android.inputmethod.latin.makedict.FusionDictionary;
22 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
23 import com.android.inputmethod.latin.makedict.ProbabilityInfo;
24 import com.android.inputmethod.latin.makedict.WeightedString;
25 import com.android.inputmethod.latin.makedict.WordProperty;
26 import com.android.inputmethod.latin.utils.CombinedFormatUtils;
27 
28 import java.io.BufferedReader;
29 import java.io.BufferedWriter;
30 import java.io.FileReader;
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.HashMap;
34 import java.util.TreeSet;
35 
36 /**
37  * Reads and writes combined format for a FusionDictionary.
38  *
39  * All functions in this class are static.
40  */
41 public class CombinedInputOutput {
42     private static final String WHITELIST_TAG = "whitelist";
43     private static final String OPTIONS_TAG = "options";
44     private static final String COMMENT_LINE_STARTER = "#";
45     private static final int HISTORICAL_INFO_ELEMENT_COUNT = 3;
46 
47     /**
48      * Basic test to find out whether the file is in the combined format or not.
49      *
50      * Concretely this only tests the header line.
51      *
52      * @param filename The name of the file to test.
53      * @return true if the file is in the combined format, false otherwise
54      */
isCombinedDictionary(final String filename)55     public static boolean isCombinedDictionary(final String filename) {
56         try (final BufferedReader reader = new BufferedReader(new FileReader(filename))) {
57             String firstLine = reader.readLine();
58             while (firstLine.startsWith(COMMENT_LINE_STARTER)) {
59                 firstLine = reader.readLine();
60             }
61             return firstLine.matches(
62                     "^" + CombinedFormatUtils.DICTIONARY_TAG + "=[^:]+(:[^=]+=[^:]+)*");
63         } catch (final IOException e) {
64             return false;
65         }
66     }
67 
68     /**
69      * Reads a dictionary from a combined format file.
70      *
71      * This is the public method that will read a combined file and return the corresponding memory
72      * representation.
73      *
74      * @param reader the buffered reader to read the data from.
75      * @return the in-memory representation of the dictionary.
76      */
readDictionaryCombined(final BufferedReader reader)77     public static FusionDictionary readDictionaryCombined(final BufferedReader reader)
78             throws IOException {
79         String headerLine = reader.readLine();
80         while (headerLine.startsWith(COMMENT_LINE_STARTER)) {
81             headerLine = reader.readLine();
82         }
83         final String header[] = headerLine.split(",");
84         final HashMap<String, String> attributes = new HashMap<>();
85         for (String item : header) {
86             final String keyValue[] = item.split("=");
87             if (2 != keyValue.length) {
88                 throw new RuntimeException("Wrong header format : " + headerLine);
89             }
90             attributes.put(keyValue[0], keyValue[1]);
91         }
92 
93         attributes.remove(OPTIONS_TAG);
94         final FusionDictionary dict =
95                 new FusionDictionary(new PtNodeArray(), new DictionaryOptions(attributes));
96 
97         String line;
98         String word = null;
99         ProbabilityInfo probabilityInfo = new ProbabilityInfo(0);
100         boolean isNotAWord = false;
101         boolean isPossiblyOffensive = false;
102         ArrayList<WeightedString> bigrams = new ArrayList<>();
103         ArrayList<WeightedString> shortcuts = new ArrayList<>();
104         while (null != (line = reader.readLine())) {
105             if (line.startsWith(COMMENT_LINE_STARTER)) continue;
106             final String args[] = line.trim().split(",");
107             if (args[0].matches(CombinedFormatUtils.WORD_TAG + "=.*")) {
108                 if (null != word) {
109                     dict.add(word, probabilityInfo, isNotAWord, isPossiblyOffensive);
110                     for (WeightedString s : bigrams) {
111                         dict.setBigram(word, s.mWord, s.mProbabilityInfo);
112                     }
113                 }
114                 if (!shortcuts.isEmpty()) shortcuts = new ArrayList<>();
115                 if (!bigrams.isEmpty()) bigrams = new ArrayList<>();
116                 isNotAWord = false;
117                 isPossiblyOffensive = false;
118                 for (String param : args) {
119                     final String params[] = param.split("=", 2);
120                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
121                     switch (params[0]) {
122                         case CombinedFormatUtils.WORD_TAG:
123                             word = params[1];
124                             break;
125                         case CombinedFormatUtils.PROBABILITY_TAG:
126                             probabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
127                                     probabilityInfo.mTimestamp, probabilityInfo.mLevel,
128                                     probabilityInfo.mCount);
129                             break;
130                         case CombinedFormatUtils.HISTORICAL_INFO_TAG:
131                             final String[] historicalInfoParams = params[1].split(
132                                     CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
133                             if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
134                                 throw new RuntimeException("Wrong format (historical info) : "
135                                         + line);
136                             }
137                             probabilityInfo = new ProbabilityInfo(probabilityInfo.mProbability,
138                                     Integer.parseInt(historicalInfoParams[0]),
139                                     Integer.parseInt(historicalInfoParams[1]),
140                                     Integer.parseInt(historicalInfoParams[2]));
141                             break;
142                         case CombinedFormatUtils.NOT_A_WORD_TAG:
143                             isNotAWord = CombinedFormatUtils.isLiteralTrue(params[1]);
144                             break;
145                         case CombinedFormatUtils.POSSIBLY_OFFENSIVE_TAG:
146                             isPossiblyOffensive = CombinedFormatUtils.isLiteralTrue(params[1]);
147                             break;
148                     }
149                 }
150             } else if (args[0].matches(CombinedFormatUtils.BIGRAM_TAG + "=.*")) {
151                 String secondWordOfBigram = null;
152                 ProbabilityInfo bigramProbabilityInfo = new ProbabilityInfo(0);
153                 for (String param : args) {
154                     final String params[] = param.split("=", 2);
155                     if (2 != params.length) throw new RuntimeException("Wrong format : " + line);
156                     if (CombinedFormatUtils.BIGRAM_TAG.equals(params[0])) {
157                         secondWordOfBigram = params[1];
158                     } else if (CombinedFormatUtils.PROBABILITY_TAG.equals(params[0])) {
159                         bigramProbabilityInfo = new ProbabilityInfo(Integer.parseInt(params[1]),
160                                 bigramProbabilityInfo.mTimestamp, bigramProbabilityInfo.mLevel,
161                                 bigramProbabilityInfo.mCount);
162                     }  else if (CombinedFormatUtils.HISTORICAL_INFO_TAG.equals(params[0])) {
163                         final String[] historicalInfoParams =
164                                 params[1].split(CombinedFormatUtils.HISTORICAL_INFO_SEPARATOR);
165                         if (historicalInfoParams.length != HISTORICAL_INFO_ELEMENT_COUNT) {
166                             throw new RuntimeException("Wrong format (historical info) : " + line);
167                         }
168                         bigramProbabilityInfo = new ProbabilityInfo(
169                                 bigramProbabilityInfo.mProbability,
170                                 Integer.parseInt(historicalInfoParams[0]),
171                                 Integer.parseInt(historicalInfoParams[1]),
172                                 Integer.parseInt(historicalInfoParams[2]));
173                     }
174                 }
175                 if (null != secondWordOfBigram) {
176                     bigrams.add(new WeightedString(secondWordOfBigram, bigramProbabilityInfo));
177                 } else {
178                     throw new RuntimeException("Wrong format : " + line);
179                 }
180             }
181         }
182         if (null != word) {
183             dict.add(word, probabilityInfo, isNotAWord, isPossiblyOffensive);
184             for (WeightedString s : bigrams) {
185                 dict.setBigram(word, s.mWord, s.mProbabilityInfo);
186             }
187         }
188 
189         return dict;
190     }
191 
192     /**
193      * Writes a dictionary to a combined file.
194      *
195      * @param destination a destination writer.
196      * @param dict the dictionary to write.
197      */
writeDictionaryCombined(final BufferedWriter destination, final FusionDictionary dict)198     public static void writeDictionaryCombined(final BufferedWriter destination,
199             final FusionDictionary dict) throws IOException {
200         final TreeSet<WordProperty> wordPropertiesInDict = new TreeSet<>();
201         for (final WordProperty wordProperty : dict) {
202             // This for ordering by frequency, then by asciibetic order
203             wordPropertiesInDict.add(wordProperty);
204         }
205         destination.write(CombinedFormatUtils.formatAttributeMap(dict.mOptions.mAttributes));
206         for (final WordProperty wordProperty : wordPropertiesInDict) {
207             destination.write(CombinedFormatUtils.formatWordProperty(wordProperty));
208         }
209     }
210 }
211