/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.providers.contacts;

import android.icu.text.Transliterator;
import android.text.TextUtils;
import android.util.Log;

import java.util.ArrayList;
import java.util.Locale;


/**
 * An object to convert a Chinese character to its corresponding pinyin string.
 * For characters with multiple possible pinyin strings, only one is selected
 * according to the ICU Transliterator class. Polyphones are not supported in this
 * implementation.
 */
public class HanziToPinyin {
    private static final String TAG = "HanziToPinyin";

    /** Code units below this value are plain ASCII and are passed through untouched. */
    private static final char ASCII_LIMIT = 128;
    /** Exclusive upper bound of the low extended-Latin range handled by {@link #tokenize}. */
    private static final char LOW_LATIN_LIMIT = 0x250;
    /** First code point of the Unicode "Latin Extended Additional" block. */
    private static final char LATIN_EXTENDED_ADDITIONAL_FIRST = 0x1e00;
    /** Last code point (inclusive) of the Unicode "Latin Extended Additional" block. */
    private static final char LATIN_EXTENDED_ADDITIONAL_LAST = 0x1eff;

    /** Lazily created singleton; guarded by the {@code HanziToPinyin.class} monitor. */
    private static HanziToPinyin sInstance;

    /** Han to upper-case ASCII pinyin; {@code null} when the transliterator could not be created. */
    private final Transliterator mPinyinTransliterator;
    /** Latin to ASCII; {@code null} when the transliterator could not be created. */
    private final Transliterator mAsciiTransliterator;

    /**
     * One unit of tokenized input: either a single character that was converted to pinyin, or a
     * run of consecutive non-space characters that share the same non-pinyin type.
     */
    public static class Token {
        /**
         * Separator between target string for each source char
         */
        public static final String SEPARATOR = " ";

        public static final int LATIN = 1;
        public static final int PINYIN = 2;
        public static final int UNKNOWN = 3;

        /** Creates an empty token; {@code type} is 0 and both strings are {@code null}. */
        public Token() {
        }

        /**
         * Creates a fully populated token.
         *
         * @param type one of {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}
         * @param source the string before translation
         * @param target the translated string
         */
        public Token(int type, String source, String target) {
            this.type = type;
            this.source = source;
            this.target = target;
        }

        /**
         * Type of this token, LATIN, PINYIN or UNKNOWN.
         */
        public int type;
        /**
         * Original string before translation.
         */
        public String source;
        /**
         * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
         * original string in source.
         */
        public String target;
    }

    /**
     * Creates the ICU transliterators. If either one cannot be created, a warning is logged and
     * whatever was not created stays {@code null}; nothing is thrown to the caller.
     */
    private HanziToPinyin() {
        Transliterator pinyin = null;
        Transliterator ascii = null;
        try {
            pinyin = Transliterator.getInstance("Han-Latin/Names; Latin-Ascii; Any-Upper");
            ascii = Transliterator.getInstance("Latin-Ascii");
        } catch (IllegalArgumentException e) {
            // Keep the exception in the log: the fixed message names Han-Latin, but the failure
            // may just as well come from the Latin-Ascii lookup.
            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
                    + " HanziToPinyin is disabled", e);
        }
        mPinyinTransliterator = pinyin;
        mAsciiTransliterator = ascii;
    }

    /**
     * @return {@code true} if the Han-to-pinyin transliterator was created successfully
     */
    public boolean hasChineseTransliterator() {
        return mPinyinTransliterator != null;
    }

    /**
     * Returns the shared instance, creating it on first use.
     */
    public static HanziToPinyin getInstance() {
        synchronized (HanziToPinyin.class) {
            if (sInstance == null) {
                sInstance = new HanziToPinyin();
            }
            return sInstance;
        }
    }

    /**
     * Classifies a single UTF-16 code unit and fills {@code token} with its source, type and
     * target. Must only be called when {@link #hasChineseTransliterator()} is true.
     *
     * @param character the code unit to classify
     * @param token the token to overwrite with the result
     */
    private void tokenize(char character, Token token) {
        token.source = Character.toString(character);

        // ASCII
        if (character < ASCII_LIMIT) {
            token.type = Token.LATIN;
            token.target = token.source;
            return;
        }

        // Extended Latin. Transcode these to ASCII equivalents.
        // The upper bound is inclusive so that U+1EFF, the last code point of the
        // Latin Extended Additional block, is not mistaken for a Han candidate.
        if (character < LOW_LATIN_LIMIT
                || (LATIN_EXTENDED_ADDITIONAL_FIRST <= character
                        && character <= LATIN_EXTENDED_ADDITIONAL_LAST)) {
            token.type = Token.LATIN;
            token.target = mAsciiTransliterator == null
                    ? token.source
                    : mAsciiTransliterator.transliterate(token.source);
            return;
        }

        token.type = Token.PINYIN;
        token.target = mPinyinTransliterator.transliterate(token.source);
        // An empty or unchanged result means no pinyin was produced for this character.
        if (TextUtils.isEmpty(token.target)
                || TextUtils.equals(token.source, token.target)) {
            token.type = Token.UNKNOWN;
            token.target = token.source;
        }
    }

    /**
     * Runs the whole input through the pinyin transliterator.
     *
     * @param input the string to convert
     * @return the transliterated string, or {@code null} if there is no Chinese transliterator
     *         or {@code input} is null or empty
     */
    public String transliterate(final String input) {
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            return null;
        }
        return mPinyinTransliterator.transliterate(input);
    }

    /**
     * Convert the input to an array of tokens. Each character that has pinyin becomes its own
     * PINYIN token. Consecutive non-space characters of the same other type (LATIN or UNKNOWN)
     * are merged into one token; a space character or a change of type ends the current run.
     * For merged tokens both {@code source} and {@code target} hold the accumulated target text.
     * If there is no Chinese transliterator, or the input is null or empty, the empty token
     * array is returned.
     *
     * <p>The input is walked one UTF-16 code unit at a time, so the two halves of a surrogate
     * pair are classified separately.
     */
    public ArrayList<Token> getTokens(final String input) {
        ArrayList<Token> tokens = new ArrayList<Token>();
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            // return empty tokens.
            return tokens;
        }

        final int inputLength = input.length();
        final StringBuilder sb = new StringBuilder();
        int tokenType = Token.LATIN;
        Token token = new Token();

        // Go through the input, create a new token when
        // a. Token type changed
        // b. Get the Pinyin of current character.
        // c. current character is space.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            if (Character.isSpaceChar(character)) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                continue;
            }

            tokenize(character, token);
            // Capture the type now: in the pinyin branch 'token' is handed over to the list and
            // replaced by a blank instance.
            final int currentType = token.type;
            if (currentType == Token.PINYIN) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokens.add(token);
                token = new Token();
            } else {
                if (tokenType != currentType && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                sb.append(token.target);
            }
            tokenType = currentType;
        }
        if (sb.length() > 0) {
            addToken(sb, tokens, tokenType);
        }
        return tokens;
    }

    /**
     * Appends the buffered text to {@code tokens} as a single token of {@code tokenType}, using
     * the same string for source and target, then clears the buffer.
     */
    private void addToken(
            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
        String str = sb.toString();
        tokens.add(new Token(tokenType, str, str));
        sb.setLength(0);
    }
}