1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.define.DecoderSpecificConstants;
21 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
22 
23 import java.util.ArrayList;
24 import java.util.Arrays;
25 import java.util.Collections;
26 import java.util.Iterator;
27 import java.util.LinkedList;
28 
29 /**
30  * A dictionary that can fusion heads and tails of words for more compression.
31  */
32 @UsedForTesting
33 public final class FusionDictionary implements Iterable<WordProperty> {
34     private static final boolean DBG = MakedictLog.DBG;
35 
36     private static int CHARACTER_NOT_FOUND_INDEX = -1;
37 
38     /**
39      * A node array of the dictionary, containing several PtNodes.
40      *
41      * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
42      * real information.
43      * This class also contains fields to cache size and address, to help with binary
44      * generation.
45      */
46     public static final class PtNodeArray {
47         ArrayList<PtNode> mData;
48         // To help with binary generation
49         int mCachedSize = Integer.MIN_VALUE;
50         // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
51         // always hold the same value except between dictionary address compression, during which
52         // the update process needs to know about both values at the same time. Updating will
53         // update the AfterUpdate value, and the code will move them to BeforeUpdate before
54         // the next update pass.
55         int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
56         int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
57         int mCachedParentAddress = 0;
58 
PtNodeArray()59         public PtNodeArray() {
60             mData = new ArrayList<>();
61         }
PtNodeArray(ArrayList<PtNode> data)62         public PtNodeArray(ArrayList<PtNode> data) {
63             Collections.sort(data, PTNODE_COMPARATOR);
64             mData = data;
65         }
66     }
67 
68     /**
69      * PtNode is a group of characters, with probability information, shortcut targets, bigrams,
70      * and children (Pt means Patricia Trie).
71      *
72      * This is the central class of the in-memory representation. A PtNode is what can
73      * be seen as a traditional "trie node", except it can hold several characters at the
74      * same time. A PtNode essentially represents one or several characters in the middle
75      * of the trie tree; as such, it can be a terminal, and it can have children.
76      * In this in-memory representation, whether the PtNode is a terminal or not is represented
77      * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the
78      * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null
79      * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present,
80      * are non-null.
81      */
82     public static final class PtNode {
83         private static final int NOT_A_TERMINAL = -1;
84         final int mChars[];
85         ArrayList<WeightedString> mBigrams;
86         // null == mProbabilityInfo indicates this is not a terminal.
87         ProbabilityInfo mProbabilityInfo;
88         int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
89         PtNodeArray mChildren;
90         boolean mIsNotAWord; // Only a shortcut
91         boolean mIsPossiblyOffensive;
92         // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
93         // generation. Before and After always hold the same value except during dictionary
94         // address compression, where the update process needs to know about both values at the
95         // same time. Updating will update the AfterUpdate value, and the code will move them
96         // to BeforeUpdate before the next update pass.
97         // The update process does not need two versions of mCachedSize.
98         int mCachedSize; // The size, in bytes, of this PtNode.
99         int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
100         int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
101 
PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive)102         public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams,
103                 final ProbabilityInfo probabilityInfo, final boolean isNotAWord,
104                 final boolean isPossiblyOffensive) {
105             mChars = chars;
106             mProbabilityInfo = probabilityInfo;
107             mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability;
108             mBigrams = bigrams;
109             mChildren = null;
110             mIsNotAWord = isNotAWord;
111             mIsPossiblyOffensive = isPossiblyOffensive;
112         }
113 
PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive, final PtNodeArray children)114         public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams,
115                 final ProbabilityInfo probabilityInfo, final boolean isNotAWord,
116                 final boolean isPossiblyOffensive, final PtNodeArray children) {
117             mChars = chars;
118             mProbabilityInfo = probabilityInfo;
119             mBigrams = bigrams;
120             mChildren = children;
121             mIsNotAWord = isNotAWord;
122             mIsPossiblyOffensive = isPossiblyOffensive;
123         }
124 
addChild(PtNode n)125         public void addChild(PtNode n) {
126             if (null == mChildren) {
127                 mChildren = new PtNodeArray();
128             }
129             mChildren.mData.add(n);
130         }
131 
getTerminalId()132         public int getTerminalId() {
133             return mTerminalId;
134         }
135 
isTerminal()136         public boolean isTerminal() {
137             return mProbabilityInfo != null;
138         }
139 
getProbability()140         public int getProbability() {
141             return isTerminal() ? mProbabilityInfo.mProbability : NOT_A_TERMINAL;
142         }
143 
getIsNotAWord()144         public boolean getIsNotAWord() {
145             return mIsNotAWord;
146         }
147 
getIsPossiblyOffensive()148         public boolean getIsPossiblyOffensive() {
149             return mIsPossiblyOffensive;
150         }
151 
getBigrams()152         public ArrayList<WeightedString> getBigrams() {
153             // We don't want write permission to escape outside the package, so we return a copy
154             if (null == mBigrams) return null;
155             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams);
156             return copyOfBigrams;
157         }
158 
hasSeveralChars()159         public boolean hasSeveralChars() {
160             assert(mChars.length > 0);
161             return 1 < mChars.length;
162         }
163 
164         /**
165          * Adds a word to the bigram list. Updates the probability information if the word already
166          * exists.
167          */
addBigram(final String word, final ProbabilityInfo probabilityInfo)168         public void addBigram(final String word, final ProbabilityInfo probabilityInfo) {
169             if (mBigrams == null) {
170                 mBigrams = new ArrayList<>();
171             }
172             WeightedString bigram = getBigram(word);
173             if (bigram != null) {
174                 bigram.mProbabilityInfo = probabilityInfo;
175             } else {
176                 bigram = new WeightedString(word, probabilityInfo);
177                 mBigrams.add(bigram);
178             }
179         }
180 
181         /**
182          * Gets the bigram for the given word.
183          * Returns null if the word is not in the bigrams list.
184          */
getBigram(final String word)185         public WeightedString getBigram(final String word) {
186             // TODO: Don't do a linear search
187             if (mBigrams != null) {
188                 final int size = mBigrams.size();
189                 for (int i = 0; i < size; ++i) {
190                     WeightedString bigram = mBigrams.get(i);
191                     if (bigram.mWord.equals(word)) {
192                         return bigram;
193                     }
194                 }
195             }
196             return null;
197         }
198 
199         /**
200          * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
201          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
202          * updated if they are higher than the existing ones.
203          */
update(final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isPossiblyOffensive)204         void update(final ProbabilityInfo probabilityInfo,
205                 final ArrayList<WeightedString> bigrams,
206                 final boolean isNotAWord, final boolean isPossiblyOffensive) {
207             mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo);
208             if (bigrams != null) {
209                 if (mBigrams == null) {
210                     mBigrams = bigrams;
211                 } else {
212                     final int size = bigrams.size();
213                     for (int i = 0; i < size; ++i) {
214                         final WeightedString bigram = bigrams.get(i);
215                         final WeightedString existingBigram = getBigram(bigram.mWord);
216                         if (existingBigram == null) {
217                             mBigrams.add(bigram);
218                         } else {
219                             existingBigram.mProbabilityInfo = ProbabilityInfo.max(
220                                     existingBigram.mProbabilityInfo, bigram.mProbabilityInfo);
221                         }
222                     }
223                 }
224             }
225             mIsNotAWord = isNotAWord;
226             mIsPossiblyOffensive = isPossiblyOffensive;
227         }
228     }
229 
230     public final DictionaryOptions mOptions;
231     public final PtNodeArray mRootNodeArray;
232 
FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options)233     public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) {
234         mRootNodeArray = rootNodeArray;
235         mOptions = options;
236     }
237 
addOptionAttribute(final String key, final String value)238     public void addOptionAttribute(final String key, final String value) {
239         mOptions.mAttributes.put(key, value);
240     }
241 
242     /**
243      * Helper method to convert a String to an int array.
244      */
getCodePoints(final String word)245     static int[] getCodePoints(final String word) {
246         // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray,
247         // which is not visible from the makedict package. Factor this code.
248         final int length = word.length();
249         if (length <= 0) return new int[] {};
250         final char[] characters = word.toCharArray();
251         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
252         int codePoint = Character.codePointAt(characters, 0);
253         int dsti = 0;
254         for (int srci = Character.charCount(codePoint);
255                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
256             codePoints[dsti] = codePoint;
257             codePoint = Character.codePointAt(characters, srci);
258         }
259         codePoints[dsti] = codePoint;
260         return codePoints;
261     }
262 
263     /**
264      * Helper method to add a word as a string.
265      *
266      * This method adds a word to the dictionary with the given frequency. Optional
267      * lists of bigrams can be passed here. For each word inside,
268      * they will be added to the dictionary as necessary.
269      *  @param word the word to add.
270      * @param probabilityInfo probability information of the word.
271      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
272      * @param isPossiblyOffensive true if this word is possibly offensive
273      */
add(final String word, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive)274     public void add(final String word, final ProbabilityInfo probabilityInfo,
275             final boolean isNotAWord, final boolean isPossiblyOffensive) {
276         add(getCodePoints(word), probabilityInfo, isNotAWord, isPossiblyOffensive);
277     }
278 
279     /**
280      * Validity check for a PtNode array.
281      *
282      * This method checks that all PtNodes in a node array are ordered as expected.
283      * If they are, nothing happens. If they aren't, an exception is thrown.
284      */
checkStack(PtNodeArray ptNodeArray)285     private static void checkStack(PtNodeArray ptNodeArray) {
286         ArrayList<PtNode> stack = ptNodeArray.mData;
287         int lastValue = -1;
288         for (int i = 0; i < stack.size(); ++i) {
289             int currentValue = stack.get(i).mChars[0];
290             if (currentValue <= lastValue) {
291                 throw new RuntimeException("Invalid stack");
292             }
293             lastValue = currentValue;
294         }
295     }
296 
297     /**
298      * Helper method to add a new bigram to the dictionary.
299      *
300      * @param word0 the previous word of the context
301      * @param word1 the next word of the context
302      * @param probabilityInfo the bigram probability info
303      */
setBigram(final String word0, final String word1, final ProbabilityInfo probabilityInfo)304     public void setBigram(final String word0, final String word1,
305             final ProbabilityInfo probabilityInfo) {
306         PtNode ptNode0 = findWordInTree(mRootNodeArray, word0);
307         if (ptNode0 != null) {
308             final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1);
309             if (ptNode1 == null) {
310                 add(getCodePoints(word1), new ProbabilityInfo(0), false /* isNotAWord */,
311                         false /* isPossiblyOffensive */);
312                 // The PtNode for the first word may have moved by the above insertion,
313                 // if word1 and word2 share a common stem that happens not to have been
314                 // a cutting point until now. In this case, we need to refresh ptNode.
315                 ptNode0 = findWordInTree(mRootNodeArray, word0);
316             }
317             ptNode0.addBigram(word1, probabilityInfo);
318         } else {
319             throw new RuntimeException("First word of bigram not found " + word0);
320         }
321     }
322 
323     /**
324      * Add a word to this dictionary.
325      *
326      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
327      * an exception is thrown.
328      *  @param word the word, as an int array.
329      * @param probabilityInfo the probability information of the word.
330      * @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so)
331      * @param isPossiblyOffensive true if this word is possibly offensive
332      */
add(final int[] word, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive)333     private void add(final int[] word, final ProbabilityInfo probabilityInfo,
334             final boolean isNotAWord, final boolean isPossiblyOffensive) {
335         assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY);
336         if (word.length >= DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH) {
337             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
338             return;
339         }
340 
341         PtNodeArray currentNodeArray = mRootNodeArray;
342         int charIndex = 0;
343 
344         PtNode currentPtNode = null;
345         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
346         int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
347         while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
348             currentPtNode = currentNodeArray.mData.get(nodeIndex);
349             differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
350             if (ARRAYS_ARE_EQUAL != differentCharIndex
351                     && differentCharIndex < currentPtNode.mChars.length) break;
352             if (null == currentPtNode.mChildren) break;
353             charIndex += currentPtNode.mChars.length;
354             if (charIndex >= word.length) break;
355             currentNodeArray = currentPtNode.mChildren;
356             nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
357         }
358 
359         if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
360             // No node at this point to accept the word. Create one.
361             final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
362             final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
363                     null /* bigrams */, probabilityInfo, isNotAWord,
364                     isPossiblyOffensive);
365             currentNodeArray.mData.add(insertionIndex, newPtNode);
366             if (DBG) checkStack(currentNodeArray);
367         } else {
368             // There is a word with a common prefix.
369             if (differentCharIndex == currentPtNode.mChars.length) {
370                 if (charIndex + differentCharIndex >= word.length) {
371                     // The new word is a prefix of an existing word, but the node on which it
372                     // should end already exists as is. Since the old PtNode was not a terminal,
373                     // make it one by filling in its frequency and other attributes
374                     currentPtNode.update(probabilityInfo, null, isNotAWord,
375                             isPossiblyOffensive);
376                 } else {
377                     // The new word matches the full old word and extends past it.
378                     // We only have to create a new node and add it to the end of this.
379                     final PtNode newNode = new PtNode(
380                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
381                                     null /* bigrams */, probabilityInfo,
382                                     isNotAWord, isPossiblyOffensive);
383                     currentPtNode.mChildren = new PtNodeArray();
384                     currentPtNode.mChildren.mData.add(newNode);
385                 }
386             } else {
387                 if (0 == differentCharIndex) {
388                     // Exact same word. Update the frequency if higher. This will also add the
389                     // new shortcuts to the existing shortcut list if it already exists.
390                     currentPtNode.update(probabilityInfo, null,
391                             currentPtNode.mIsNotAWord && isNotAWord,
392                             currentPtNode.mIsPossiblyOffensive || isPossiblyOffensive);
393                 } else {
394                     // Partial prefix match only. We have to replace the current node with a node
395                     // containing the current prefix and create two new ones for the tails.
396                     PtNodeArray newChildren = new PtNodeArray();
397                     final PtNode newOldWord = new PtNode(
398                             Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
399                                     currentPtNode.mChars.length),
400                             currentPtNode.mBigrams, currentPtNode.mProbabilityInfo,
401                             currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive,
402                             currentPtNode.mChildren);
403                     newChildren.mData.add(newOldWord);
404 
405                     final PtNode newParent;
406                     if (charIndex + differentCharIndex >= word.length) {
407                         newParent = new PtNode(
408                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
409                                 null /* bigrams */, probabilityInfo,
410                                 isNotAWord, isPossiblyOffensive, newChildren);
411                     } else {
412                         newParent = new PtNode(
413                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
414                                 null /* bigrams */, null /* probabilityInfo */,
415                                 false /* isNotAWord */, false /* isPossiblyOffensive */,
416                                 newChildren);
417                         final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
418                                 charIndex + differentCharIndex, word.length),
419                                 null /* bigrams */, probabilityInfo,
420                                 isNotAWord, isPossiblyOffensive);
421                         final int addIndex = word[charIndex + differentCharIndex]
422                                 > currentPtNode.mChars[differentCharIndex] ? 1 : 0;
423                         newChildren.mData.add(addIndex, newWord);
424                     }
425                     currentNodeArray.mData.set(nodeIndex, newParent);
426                 }
427                 if (DBG) checkStack(currentNodeArray);
428             }
429         }
430     }
431 
432     private static int ARRAYS_ARE_EQUAL = 0;
433 
434     /**
435      * Custom comparison of two int arrays taken to contain character codes.
436      *
437      * This method compares the two arrays passed as an argument in a lexicographic way,
438      * with an offset in the dst string.
439      * This method does NOT test for the first character. It is taken to be equal.
440      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
441      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
442      * strings are equal. This works BECAUSE we don't look at the first character.
443      *
444      * @param src the left-hand side string of the comparison.
445      * @param dst the right-hand side string of the comparison.
446      * @param dstOffset the offset in the right-hand side string.
447      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
448      */
compareCharArrays(final int[] src, final int[] dst, int dstOffset)449     private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) {
450         // We do NOT test the first char, because we come from a method that already
451         // tested it.
452         for (int i = 1; i < src.length; ++i) {
453             if (dstOffset + i >= dst.length) return i;
454             if (src[i] != dst[dstOffset + i]) return i;
455         }
456         if (dst.length > src.length) return src.length;
457         return ARRAYS_ARE_EQUAL;
458     }
459 
460     /**
461      * Helper class that compares and sorts two PtNodes according to their
462      * first element only. I repeat: ONLY the first element is considered, the rest
463      * is ignored.
464      * This comparator imposes orderings that are inconsistent with equals.
465      */
466     static final class PtNodeComparator implements java.util.Comparator<PtNode> {
467         @Override
compare(PtNode p1, PtNode p2)468         public int compare(PtNode p1, PtNode p2) {
469             if (p1.mChars[0] == p2.mChars[0]) return 0;
470             return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
471         }
472     }
473     final static PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
474 
475     /**
476      * Finds the insertion index of a character within a node array.
477      */
findInsertionIndex(final PtNodeArray nodeArray, int character)478     private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
479         final ArrayList<PtNode> data = nodeArray.mData;
480         final PtNode reference = new PtNode(new int[] { character },
481                 null /* bigrams */, null /* probabilityInfo */,
482                 false /* isNotAWord */, false /* isPossiblyOffensive */);
483         int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
484         return result >= 0 ? result : -result - 1;
485     }
486 
487     /**
488      * Find the index of a char in a node array, if it exists.
489      *
490      * @param nodeArray the node array to search in.
491      * @param character the character to search for.
492      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else.
493      */
findIndexOfChar(final PtNodeArray nodeArray, int character)494     private static int findIndexOfChar(final PtNodeArray nodeArray, int character) {
495         final int insertionIndex = findInsertionIndex(nodeArray, character);
496         if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX;
497         return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex
498                 : CHARACTER_NOT_FOUND_INDEX;
499     }
500 
501     /**
502      * Helper method to find a word in a given branch.
503      */
findWordInTree(final PtNodeArray rootNodeArray, final String string)504     public static PtNode findWordInTree(final PtNodeArray rootNodeArray, final String string) {
505         PtNodeArray nodeArray = rootNodeArray;
506         int index = 0;
507         final StringBuilder checker = DBG ? new StringBuilder() : null;
508         final int[] codePoints = getCodePoints(string);
509 
510         PtNode currentPtNode;
511         do {
512             int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
513             if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
514             currentPtNode = nodeArray.mData.get(indexOfGroup);
515 
516             if (codePoints.length - index < currentPtNode.mChars.length) return null;
517             int newIndex = index;
518             while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
519                 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
520                 newIndex++;
521             }
522             index = newIndex;
523 
524             if (DBG) {
525                 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
526             }
527             if (index < codePoints.length) {
528                 nodeArray = currentPtNode.mChildren;
529             }
530         } while (null != nodeArray && index < codePoints.length);
531 
532         if (index < codePoints.length) return null;
533         if (!currentPtNode.isTerminal()) return null;
534         if (DBG && !string.equals(checker.toString())) return null;
535         return currentPtNode;
536     }
537 
538     /**
539      * Helper method to find out whether a word is in the dict or not.
540      */
hasWord(final String s)541     public boolean hasWord(final String s) {
542         if (null == s || "".equals(s)) {
543             throw new RuntimeException("Can't search for a null or empty string");
544         }
545         return null != findWordInTree(mRootNodeArray, s);
546     }
547 
548     /**
549      * Recursively count the number of PtNodes in a given branch of the trie.
550      *
551      * @param nodeArray the parent node.
552      * @return the number of PtNodes in all the branch under this node.
553      */
countPtNodes(final PtNodeArray nodeArray)554     public static int countPtNodes(final PtNodeArray nodeArray) {
555         final int nodeSize = nodeArray.mData.size();
556         int size = nodeSize;
557         for (int i = nodeSize - 1; i >= 0; --i) {
558             PtNode ptNode = nodeArray.mData.get(i);
559             if (null != ptNode.mChildren)
560                 size += countPtNodes(ptNode.mChildren);
561         }
562         return size;
563     }
564 
565     /**
566      * Iterator to walk through a dictionary.
567      *
568      * This is purely for convenience.
569      */
570     public static final class DictionaryIterator implements Iterator<WordProperty> {
571         private static final class Position {
572             public Iterator<PtNode> pos;
573             public int length;
Position(ArrayList<PtNode> ptNodes)574             public Position(ArrayList<PtNode> ptNodes) {
575                 pos = ptNodes.iterator();
576                 length = 0;
577             }
578         }
579         final StringBuilder mCurrentString;
580         final LinkedList<Position> mPositions;
581 
DictionaryIterator(ArrayList<PtNode> ptRoot)582         public DictionaryIterator(ArrayList<PtNode> ptRoot) {
583             mCurrentString = new StringBuilder();
584             mPositions = new LinkedList<>();
585             final Position rootPos = new Position(ptRoot);
586             mPositions.add(rootPos);
587         }
588 
589         @Override
hasNext()590         public boolean hasNext() {
591             for (Position p : mPositions) {
592                 if (p.pos.hasNext()) {
593                     return true;
594                 }
595             }
596             return false;
597         }
598 
599         @Override
next()600         public WordProperty next() {
601             Position currentPos = mPositions.getLast();
602             mCurrentString.setLength(currentPos.length);
603 
604             do {
605                 if (currentPos.pos.hasNext()) {
606                     final PtNode currentPtNode = currentPos.pos.next();
607                     currentPos.length = mCurrentString.length();
608                     for (int i : currentPtNode.mChars) {
609                         mCurrentString.append(Character.toChars(i));
610                     }
611                     if (null != currentPtNode.mChildren) {
612                         currentPos = new Position(currentPtNode.mChildren.mData);
613                         currentPos.length = mCurrentString.length();
614                         mPositions.addLast(currentPos);
615                     }
616                     if (currentPtNode.isTerminal()) {
617                         return new WordProperty(mCurrentString.toString(),
618                                 currentPtNode.mProbabilityInfo, currentPtNode.mBigrams,
619                                 currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive);
620                     }
621                 } else {
622                     mPositions.removeLast();
623                     currentPos = mPositions.getLast();
624                     mCurrentString.setLength(mPositions.getLast().length);
625                 }
626             } while (true);
627         }
628 
629         @Override
remove()630         public void remove() {
631             throw new UnsupportedOperationException("Unsupported yet");
632         }
633 
634     }
635 
636     /**
637      * Method to return an iterator.
638      *
639      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
640      * and say : for (Word w : x) {}
641      */
642     @Override
iterator()643     public Iterator<WordProperty> iterator() {
644         return new DictionaryIterator(mRootNodeArray.mData);
645     }
646 }
647