1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.providers.contacts;
18 
19 import android.icu.text.Transliterator;
20 import android.text.TextUtils;
21 import android.util.Log;
22 
23 import java.util.ArrayList;
24 import java.util.Locale;
25 
26 
27 /**
28  * An object to convert Chinese character to its corresponding pinyin string.
29  * For characters with multiple possible pinyin string, only one is selected
30  * according to ICU Transliterator class. Polyphone is not supported in this
31  * implementation.
32  */
33 public class HanziToPinyin {
34     private static final String TAG = "HanziToPinyin";
35 
36     private static HanziToPinyin sInstance;
37     private Transliterator mPinyinTransliterator;
38     private Transliterator mAsciiTransliterator;
39 
40     public static class Token {
41         /**
42          * Separator between target string for each source char
43          */
44         public static final String SEPARATOR = " ";
45 
46         public static final int LATIN = 1;
47         public static final int PINYIN = 2;
48         public static final int UNKNOWN = 3;
49 
Token()50         public Token() {
51         }
52 
Token(int type, String source, String target)53         public Token(int type, String source, String target) {
54             this.type = type;
55             this.source = source;
56             this.target = target;
57         }
58 
59         /**
60          * Type of this token, ASCII, PINYIN or UNKNOWN.
61          */
62         public int type;
63         /**
64          * Original string before translation.
65          */
66         public String source;
67         /**
68          * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
69          * original string in source.
70          */
71         public String target;
72     }
73 
HanziToPinyin()74     private HanziToPinyin() {
75         try {
76             mPinyinTransliterator = Transliterator.getInstance(
77                     "Han-Latin/Names; Latin-Ascii; Any-Upper");
78             mAsciiTransliterator = Transliterator.getInstance("Latin-Ascii");
79         } catch (IllegalArgumentException e) {
80             Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
81                   + " HanziToPinyin is disabled");
82         }
83     }
84 
hasChineseTransliterator()85     public boolean hasChineseTransliterator() {
86         return mPinyinTransliterator != null;
87     }
88 
getInstance()89     public static HanziToPinyin getInstance() {
90         synchronized (HanziToPinyin.class) {
91             if (sInstance == null) {
92                 sInstance = new HanziToPinyin();
93             }
94             return sInstance;
95         }
96     }
97 
tokenize(char character, Token token)98     private void tokenize(char character, Token token) {
99         token.source = Character.toString(character);
100 
101         // ASCII
102         if (character < 128) {
103             token.type = Token.LATIN;
104             token.target = token.source;
105             return;
106         }
107 
108         // Extended Latin. Transcode these to ASCII equivalents
109         if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
110             token.type = Token.LATIN;
111             token.target = mAsciiTransliterator == null ? token.source :
112                 mAsciiTransliterator.transliterate(token.source);
113             return;
114         }
115 
116         token.type = Token.PINYIN;
117         token.target = mPinyinTransliterator.transliterate(token.source);
118         if (TextUtils.isEmpty(token.target) ||
119             TextUtils.equals(token.source, token.target)) {
120             token.type = Token.UNKNOWN;
121             token.target = token.source;
122         }
123     }
124 
transliterate(final String input)125     public String transliterate(final String input) {
126         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
127             return null;
128         }
129         return mPinyinTransliterator.transliterate(input);
130     }
131 
132     /**
133      * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
134      * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
135      * Token. If there is no Chinese transliterator, the empty token array is returned.
136      */
getTokens(final String input)137     public ArrayList<Token> getTokens(final String input) {
138         ArrayList<Token> tokens = new ArrayList<Token>();
139         if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
140             // return empty tokens.
141             return tokens;
142         }
143 
144         final int inputLength = input.length();
145         final StringBuilder sb = new StringBuilder();
146         int tokenType = Token.LATIN;
147         Token token = new Token();
148 
149         // Go through the input, create a new token when
150         // a. Token type changed
151         // b. Get the Pinyin of current charater.
152         // c. current character is space.
153         for (int i = 0; i < inputLength; i++) {
154             final char character = input.charAt(i);
155             if (Character.isSpaceChar(character)) {
156                 if (sb.length() > 0) {
157                     addToken(sb, tokens, tokenType);
158                 }
159             } else {
160                 tokenize(character, token);
161                 if (token.type == Token.PINYIN) {
162                     if (sb.length() > 0) {
163                         addToken(sb, tokens, tokenType);
164                     }
165                     tokens.add(token);
166                     token = new Token();
167                 } else {
168                     if (tokenType != token.type && sb.length() > 0) {
169                         addToken(sb, tokens, tokenType);
170                     }
171                     sb.append(token.target);
172                 }
173                 tokenType = token.type;
174             }
175         }
176         if (sb.length() > 0) {
177             addToken(sb, tokens, tokenType);
178         }
179         return tokens;
180     }
181 
addToken( final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType)182     private void addToken(
183             final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
184         String str = sb.toString();
185         tokens.add(new Token(tokenType, str, str));
186         sb.setLength(0);
187     }
188 }
189