#!/usr/bin/env python
#
# Copyright 2016 The Android Open Source Project. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Generate a C++ data table containing locale data."""

import collections
import glob
import io
import os.path
import sys

# NOTE: every print in this file is written as ``print(<one expression>)``.
# That form produces the same output whether it is parsed as a Python 2
# print statement or as a Python 3 function call, so the script runs
# unchanged on both interpreters. Blank lines are emitted with ``print('')``
# for the same reason.


def get_locale_parts(locale):
    """Split a locale into three parts, for language, script, and region.

    The locale is split on '_'. A missing script or region is returned as
    None. With exactly two parts, the second one is taken to be a script
    when it is four characters long and a region otherwise.

    Returns a (language, script, region) tuple.
    """
    parts = locale.split('_')
    if len(parts) == 1:
        return (parts[0], None, None)
    if len(parts) == 2:
        if len(parts[1]) == 4:  # parts[1] is a script
            return (parts[0], parts[1], None)
        return (parts[0], None, parts[1])
    assert len(parts) == 3, 'unexpected locale: %r' % (locale,)
    return tuple(parts)


def read_likely_subtags(input_file_name):
    """Read and parse ICU's likelySubtags.txt.

    Only lines containing both '{' and '}' are parsed; lines starting with
    '//' and entries whose source language is 'und' are skipped.

    Returns a (likely_script_dict, representative_locales) tuple:
      * likely_script_dict maps each script-less source locale to the script
        of its target locale, on top of a few hard-coded Android entries.
      * representative_locales is a frozenset of target locales whose source
        locale has no region and whose own region is neither '001' nor 'ZZ',
        again on top of a few hard-coded Android entries.
    """
    likely_script_dict = {
        # Android's additions for pseudo-locales. These internal codes make
        # sure that the pseudo-locales would not match other English or
        # Arabic locales. (We can't use private-use ISO 15924 codes, since
        # they may be used by apps for other purposes.)
        "en_XA": "~~~A",
        "ar_XB": "~~~B",
        # Removed data from later versions of ICU
        "ji": "Hebr",  # Old code for Yiddish, still used in Java and Android
    }
    representative_locales = {
        # Android's additions
        "en_Latn_GB",  # representative for en_Latn_001
        "es_Latn_MX",  # representative for es_Latn_419
        "es_Latn_US",  # representative for es_Latn_419 (not the best idea,
                       # but Android has been shipping with it for quite a
                       # while. Fortunately, MX < US, so if both exist, MX
                       # would be chosen.)
    }
    # Decode once at the I/O boundary instead of calling the Python 2-only
    # unicode() builtin on every line.
    with io.open(input_file_name, encoding='UTF-8') as input_file:
        for line in input_file:
            # Drop surrounding spaces, the newline and a byte-order mark.
            line = line.strip(u' \n\uFEFF')
            if line.startswith('//'):
                continue
            if '{' not in line or '}' not in line:
                continue
            from_locale = line[:line.index('{')]
            to_locale = line[line.index('"') + 1:line.rindex('"')]
            from_lang, from_scr, from_region = get_locale_parts(from_locale)
            _, to_scr, to_region = get_locale_parts(to_locale)
            if from_lang == 'und':
                continue  # not very useful for our purposes
            if from_region is None and to_region not in ['001', 'ZZ']:
                representative_locales.add(to_locale)
            if from_scr is None:
                likely_script_dict[from_locale] = to_scr
    return likely_script_dict, frozenset(representative_locales)


# From packLanguageOrRegion() in ResourceTypes.cpp
def pack_language_or_region(inp, base):
    """Pack language or region in a two-byte tuple.

    Args:
      inp: None, a two-character code, or a three-character code.
      base: the character each position of a three-character code is
        measured from ('a' for languages, '0' for regions).

    Returns:
      (0, 0) for None, the two character ordinals for a two-character code,
      and otherwise a pair whose first element has bit 0x80 set and which
      holds the three base-relative offsets packed together.
    """
    if inp is None:
        return (0, 0)
    if len(inp) == 2:
        return ord(inp[0]), ord(inp[1])
    assert len(inp) == 3, 'unexpected code: %r' % (inp,)
    base = ord(base)
    first = ord(inp[0]) - base
    second = ord(inp[1]) - base
    third = ord(inp[2]) - base

    # The offset of the second character is split across the two bytes: its
    # high bits go into the first byte and its low bits into the second.
    return (0x80 | (third << 2) | (second >> 3),
            ((second << 5) | first) & 0xFF)


# From packLanguage() in ResourceTypes.cpp
def pack_language(language):
    """Pack language in a two-byte tuple."""
    return pack_language_or_region(language, 'a')


# From packRegion() in ResourceTypes.cpp
def pack_region(region):
    """Pack region in a two-byte tuple."""
    return pack_language_or_region(region, '0')


def pack_to_uint32(locale):
    """Pack language+region of locale into a 32-bit unsigned integer.

    The two packed language bytes form the upper half and the two packed
    region bytes form the lower half. The script, if any, is ignored.
    """
    lang, _, region = get_locale_parts(locale)
    plang = pack_language(lang)
    pregion = pack_region(region)
    return (plang[0] << 24) | (plang[1] << 16) | (pregion[0] << 8) | pregion[1]


def dump_script_codes(all_scripts):
    """Dump the SCRIPT_CODES table.

    Prints one row per script, annotated with its index in all_scripts.
    Every script must be at least four characters long.
    """
    print('const char SCRIPT_CODES[][4] = {')
    for index, script in enumerate(all_scripts):
        print(" /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
            index, script[0], script[1], script[2], script[3]))
    print('};')
    print('')


def dump_script_data(likely_script_dict, all_scripts):
    """Dump the script data.

    Prints the LIKELY_SCRIPTS map from packed locale to the index of its
    likely script in all_scripts, in sorted locale order.
    """
    # Build the script -> index mapping once rather than scanning the list
    # for every locale. setdefault keeps the first occurrence of a script.
    script_index = {}
    for index, script in enumerate(all_scripts):
        script_index.setdefault(script, index)

    print('')
    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({')
    for locale in sorted(likely_script_dict):
        script = likely_script_dict[locale]
        print(' {0x%08Xu, %2du}, // %s -> %s' % (
            pack_to_uint32(locale),
            script_index[script],
            locale.replace('_', '-'),
            script))
    print('});')


def pack_to_uint64(locale):
    """Pack a full locale into a 64-bit unsigned integer.

    The upper 32 bits are pack_to_uint32(locale); the lower 32 bits are the
    ordinals of the four script characters. The locale must have a script.
    """
    _, script, _ = get_locale_parts(locale)
    return ((pack_to_uint32(locale) << 32) |
            (ord(script[0]) << 24) |
            (ord(script[1]) << 16) |
            (ord(script[2]) << 8) |
            ord(script[3]))


def dump_representative_locales(representative_locales):
    """Dump the set of representative locales."""
    print('')
    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({')
    for locale in sorted(representative_locales):
        print(' 0x%08XLLU, // %s' % (
            pack_to_uint64(locale),
            locale))
    print('});')


def read_and_dump_likely_data(icu_data_dir):
    """Read and dump the likely-script data.

    Reads misc/likelySubtags.txt under icu_data_dir, prints the script
    codes, likely-script and representative-locale tables, and returns the
    likely-script dictionary.
    """
    likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
    likely_script_dict, representative_locales = read_likely_subtags(
        likely_subtags_txt)

    all_scripts = sorted(set(likely_script_dict.values()))
    # Script indices are emitted as uint8_t values.
    assert len(all_scripts) <= 256

    dump_script_codes(all_scripts)
    dump_script_data(likely_script_dict, all_scripts)
    dump_representative_locales(representative_locales)
    return likely_script_dict


def escape_script_variable_name(script):
    """Escape characters, e.g. '~', in a C++ variable name."""
    return script.replace("~", "_")


def read_parent_data(icu_data_dir):
    """Read locale parent data from ICU data files.

    Scans every '*.txt' file one directory below icu_data_dir. The locale is
    the file name without its extension.

    Returns a dict mapping locale to parent locale.
    """
    all_icu_data_files = glob.glob(os.path.join(icu_data_dir, '*', '*.txt'))
    parent_dict = {}
    for data_file in all_icu_data_files:
        locale = os.path.splitext(os.path.basename(data_file))[0]
        # Decode explicitly as UTF-8 so that reading does not depend on the
        # platform's default encoding.
        with io.open(data_file, encoding='UTF-8') as input_file:
            for line in input_file:
                if '%%Parent' in line:
                    parent = line[line.index('"') + 1:line.rindex('"')]
                    if locale in parent_dict:
                        # Different files shouldn't have different parent info
                        assert parent_dict[locale] == parent
                    else:
                        parent_dict[locale] = parent
                elif locale.startswith('ar_') and 'default{"latn"}' in line:
                    # Arabic parent overrides for ASCII digits. Since
                    # Unicode extensions are not supported in ResourceTypes,
                    # we will use ar-015 (Arabic, Northern Africa) instead
                    # of the more correct ar-u-nu-latn.
                    parent_dict[locale] = 'ar_015'
    return parent_dict


def get_likely_script(locale, likely_script_dict):
    """Find the likely script for a locale, given the likely-script dictionary.

    A locale with two underscores already carries its script. Otherwise the
    full locale is looked up, and then its language alone. Raises KeyError
    when neither is in the dictionary.
    """
    if locale.count('_') == 2:
        # it already has a script
        return locale.split('_')[1]
    if locale in likely_script_dict:
        return likely_script_dict[locale]
    language = locale.split('_')[0]
    return likely_script_dict[language]


def dump_parent_data(script_organized_dict):
    """Dump information for parents of locales.

    script_organized_dict maps a script to a {locale: parent} dict. One
    <SCRIPT>_PARENTS map is printed per script, followed by the
    SCRIPT_PARENTS array that points at each of those maps.
    """
    sorted_scripts = sorted(script_organized_dict)
    print('')
    for script in sorted_scripts:
        parent_dict = script_organized_dict[script]
        print('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
              % escape_script_variable_name(script.upper()))
        for locale in sorted(parent_dict):
            parent = parent_dict[locale]
            print(' {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
                pack_to_uint32(locale),
                pack_to_uint32(parent),
                locale.replace('_', '-'),
                parent.replace('_', '-')))
        print('});')
        print('')

    print('const struct {')
    print(' const char script[4];')
    print(' const std::unordered_map<uint32_t, uint32_t>* map;')
    print('} SCRIPT_PARENTS[] = {')
    for script in sorted_scripts:
        print(" {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
            script[0], script[1], script[2], script[3],
            escape_script_variable_name(script.upper())))
    print('};')


def dump_parent_tree_depth(parent_dict):
    """Find and dump the depth of the parent tree.

    The depth of a locale is one plus the number of parent links that can be
    followed from it. Raises ValueError if the parent links form a cycle.
    """
    max_depth = 1
    # A chain without a cycle visits each key at most once, so its depth can
    # never exceed the number of keys plus one.
    depth_limit = len(parent_dict) + 1
    for locale in parent_dict:
        current = locale
        depth = 1
        while current in parent_dict:
            current = parent_dict[current]
            depth += 1
            if depth > depth_limit:
                raise ValueError(
                    'Cycle in locale parent data starting at %s' % locale)
        max_depth = max(max_depth, depth)
    assert max_depth < 5  # Our algorithms assume small max_depth
    print('')
    print('const size_t MAX_PARENT_DEPTH = %d;' % max_depth)


def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    """Read parent data from ICU and dump it.

    Locales whose parent is 'root' are left out of the per-script tables,
    but still take part in the depth computation.
    """
    parent_dict = read_parent_data(icu_data_dir)
    script_organized_dict = collections.defaultdict(dict)
    for locale, parent in parent_dict.items():
        if parent == 'root':
            continue
        script = get_likely_script(locale, likely_script_dict)
        script_organized_dict[script][locale] = parent
    dump_parent_data(script_organized_dict)
    dump_parent_tree_depth(parent_dict)


def main():
    """Read the data files from ICU and print the generated C++ to stdout.

    Expects the source tree root as the first command-line argument; the ICU
    data is read from external/icu/icu4c/source/data below it.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <source-root>' % sys.argv[0])
    source_root = sys.argv[1]
    icu_data_dir = os.path.join(
        source_root,
        'external', 'icu', 'icu4c', 'source', 'data')

    print('// Auto-generated by %s' % sys.argv[0])
    print('')
    likely_script_dict = read_and_dump_likely_data(icu_data_dir)
    read_and_dump_parent_data(icu_data_dir, likely_script_dict)


if __name__ == '__main__':
    main()